From 53fb87a54526c40b792da8c2fada591b5d835d55 Mon Sep 17 00:00:00 2001
From: Matthew Darnell
Date: Thu, 29 Feb 2024 22:18:05 -0500
Subject: [PATCH 1/5] Begin stripping out FFI; replacing with Rust

---
 api/src/response.rs | 1 +
 api/src/response/response_entity.rs | 9 +-
 crypto/Cargo.toml | 6 +-
 crypto/build.rs | 86 -
 crypto/src/fourq.rs | 7 +
 crypto/src/fourq/consts.rs | 402 +
 crypto/src/fourq/ops.rs | 1335 ++
 crypto/src/fourq/types.rs | 47 +
 crypto/src/lib.rs | 144 +-
 ffi-deps/FourQlib/FourQ_32bit/FourQ.h | 173 -
 ffi-deps/FourQlib/FourQ_32bit/FourQ_api.h | 115 -
 .../FourQlib/FourQ_32bit/FourQ_internal.h | 354 -
 ffi-deps/FourQlib/FourQ_32bit/FourQ_params.h | 35 -
 ffi-deps/FourQlib/FourQ_32bit/FourQ_tables.h | 369 -
 ffi-deps/FourQlib/FourQ_32bit/README.md | 74 -
 .../FourQ_32bit/Visual Studio/FourQ/FourQ.sln | 40 -
 .../Visual Studio/FourQ/FourQ.vcxproj | 240 -
 .../Visual Studio/FourQ/FourQ.vcxproj.filters | 66 -
 .../crypto_tests/crypto_tests.vcxproj | 243 -
 .../crypto_tests/crypto_tests.vcxproj.filters | 33 -
 .../Visual Studio/ecc_tests/ecc_tests.vcxproj | 243 -
 .../ecc_tests/ecc_tests.vcxproj.filters | 33 -
 .../Visual Studio/fp_tests/fp_tests.vcxproj | 226 -
 .../fp_tests/fp_tests.vcxproj.filters | 33 -
 ffi-deps/FourQlib/FourQ_32bit/crypto_util.c | 174 -
 ffi-deps/FourQlib/FourQ_32bit/eccp2.c | 1146 --
 ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c | 157 -
 ffi-deps/FourQlib/FourQ_32bit/generic/fp.h | 523 -
 ffi-deps/FourQlib/FourQ_32bit/kex.c | 181 -
 ffi-deps/FourQlib/FourQ_32bit/makefile | 98 -
 ffi-deps/FourQlib/FourQ_32bit/schnorrq.c | 191 -
 ffi-deps/FourQlib/FourQ_32bit/table_lookup.h | 167 -
 .../FourQlib/FourQ_32bit/tests/crypto_tests.c | 368 -
 .../FourQlib/FourQ_32bit/tests/ecc_tests.c | 656 -
 .../FourQlib/FourQ_32bit/tests/fp_tests.c | 368 -
 .../FourQlib/FourQ_32bit/tests/test_extras.c | 216 -
 .../FourQlib/FourQ_32bit/tests/test_extras.h | 62 -
 .../FourQ_64bit_and_portable/AMD64/consts.c | 15 -
 .../FourQ_64bit_and_portable/AMD64/fp2_1271.S | 354 -
 .../AMD64/fp2_1271_AVX2.S | 446 -
 .../FourQ_64bit_and_portable/AMD64/fp_x64.h | 409 -
 .../FourQ_64bit_and_portable/ARM64/fp_arm64.h | 327 -
 .../FourQlib/FourQ_64bit_and_portable/FourQ.h | 217 -
 .../FourQ_64bit_and_portable/FourQ_api.h | 147 -
 .../FourQ_64bit_and_portable/FourQ_internal.h | 418 -
 .../FourQ_64bit_and_portable/FourQ_params.h | 52 -
 .../FourQ_64bit_and_portable/FourQ_tables.h | 365 -
 .../FourQ_64bit_and_portable/README.md | 125 -
 .../Visual Studio/FourQ/FourQ.sln | 72 -
 .../Visual Studio/FourQ/FourQ.vcxproj | 245 -
 .../Visual Studio/FourQ/FourQ.vcxproj.filters | 78 -
 .../crypto_tests/crypto_tests.vcxproj | 237 -
 .../crypto_tests/crypto_tests.vcxproj.filters | 33 -
 .../Visual Studio/ecc_tests/ecc_tests.vcxproj | 237 -
 .../ecc_tests/ecc_tests.vcxproj.filters | 33 -
 .../Visual Studio/fp_tests/fp_tests.vcxproj | 219 -
 .../fp_tests/fp_tests.vcxproj.filters | 33 -
 .../FourQ_64bit_and_portable/crypto_util.c | 239 -
 .../FourQlib/FourQ_64bit_and_portable/eccp2.c | 486 -
 .../FourQ_64bit_and_portable/eccp2_core.c | 727 -
 .../FourQ_64bit_and_portable/eccp2_no_endo.c | 160 -
 .../FourQ_64bit_and_portable/generic/fp.h | 409 -
 .../FourQ_64bit_and_portable/hash_to_curve.c | 237 -
 .../FourQlib/FourQ_64bit_and_portable/kex.c | 181 -
 .../FourQ_64bit_and_portable/makefile | 188 -
 .../FourQ_64bit_and_portable/schnorrq.c | 190 -
 .../FourQ_64bit_and_portable/table_lookup.h | 290 -
 .../tests/crypto_tests.c | 456 -
 .../tests/ecc_tests.c | 718 -
 .../FourQ_64bit_and_portable/tests/fp_tests.c | 357 -
 .../tests/test_extras.c | 389 -
 .../tests/test_extras.h | 53 -
ffi-deps/FourQlib/LICENSE | 21 - ffi-deps/FourQlib/README.md | 128 - ffi-deps/FourQlib/SECURITY.md | 41 - ffi-deps/FourQlib/random/random.c | 86 - ffi-deps/FourQlib/random/random.h | 20 - ffi-deps/FourQlib/sha512/sha512.c | 306 - ffi-deps/FourQlib/sha512/sha512.h | 20 - ffi-deps/K12/README.markdown | 84 - .../lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S | 623 - .../K12/lib/ARMv8Asha3/KeccakP-1600-SnP.h | 65 - .../K12/lib/ARMv8Asha3/KeccakP-1600-opt64.c | 227 - .../K12/lib/Inplace32BI/KeccakP-1600-SnP.h | 35 - .../Inplace32BI/KeccakP-1600-inplace32BI.c | 1068 -- ffi-deps/K12/lib/KangarooTwelve.c | 333 - ffi-deps/K12/lib/KangarooTwelve.h | 134 - .../K12/lib/Optimized64/KeccakP-1600-AVX2.s | 664 - .../Optimized64/KeccakP-1600-AVX512-plainC.c | 241 - .../K12/lib/Optimized64/KeccakP-1600-AVX512.s | 551 - .../K12/lib/Optimized64/KeccakP-1600-SnP.h | 74 - .../K12/lib/Optimized64/KeccakP-1600-opt64.c | 1026 -- .../KeccakP-1600-runtimeDispatch.c | 406 - .../Optimized64/KeccakP-1600-timesN-AVX2.c | 419 - .../Optimized64/KeccakP-1600-timesN-AVX512.c | 458 - .../Optimized64/KeccakP-1600-timesN-SSSE3.c | 438 - ffi-deps/K12/lib/Plain64/KeccakP-1600-SnP.h | 48 - .../K12/lib/Plain64/KeccakP-1600-plain64.c | 24 - ffi-deps/K12/lib/align.h | 34 - ffi-deps/K12/lib/brg_endian.h | 143 - ffi-deps/chopper-linux.cpp | 364 - ffi-deps/chopper-win.cpp | 3169 ---- ffi-deps/simde/CONTRIBUTING.md | 85 - ffi-deps/simde/COPYING | 20 - ffi-deps/simde/README.md | 496 - ffi-deps/simde/simde/check.h | 276 - ffi-deps/simde/simde/debug-trap.h | 85 - ffi-deps/simde/simde/hedley.h | 2044 --- ffi-deps/simde/simde/simde-aes.h | 265 - ffi-deps/simde/simde/simde-align.h | 450 - ffi-deps/simde/simde/simde-arch.h | 622 - ffi-deps/simde/simde/simde-bf16.h | 131 - ffi-deps/simde/simde/simde-common.h | 1192 -- ffi-deps/simde/simde/simde-complex.h | 148 - ffi-deps/simde/simde/simde-constify.h | 397 - ffi-deps/simde/simde/simde-detect-clang.h | 125 - ffi-deps/simde/simde/simde-diagnostic.h | 456 - ffi-deps/simde/simde/simde-f16.h | 319 - ffi-deps/simde/simde/simde-features.h | 752 - ffi-deps/simde/simde/simde-math.h | 2065 --- ffi-deps/simde/simde/x86/aes.h | 417 - ffi-deps/simde/simde/x86/avx.h | 6267 -------- ffi-deps/simde/simde/x86/avx2.h | 5758 -------- ffi-deps/simde/simde/x86/avx512.h | 149 - ffi-deps/simde/simde/x86/avx512/2intersect.h | 249 - ffi-deps/simde/simde/x86/avx512/4dpwssd.h | 67 - ffi-deps/simde/simde/x86/avx512/4dpwssds.h | 67 - ffi-deps/simde/simde/x86/avx512/abs.h | 580 - ffi-deps/simde/simde/x86/avx512/add.h | 641 - ffi-deps/simde/simde/x86/avx512/adds.h | 529 - ffi-deps/simde/simde/x86/avx512/and.h | 305 - ffi-deps/simde/simde/x86/avx512/andnot.h | 193 - ffi-deps/simde/simde/x86/avx512/avg.h | 258 - ffi-deps/simde/simde/x86/avx512/bitshuffle.h | 202 - ffi-deps/simde/simde/x86/avx512/blend.h | 293 - ffi-deps/simde/simde/x86/avx512/broadcast.h | 897 -- ffi-deps/simde/simde/x86/avx512/cast.h | 357 - ffi-deps/simde/simde/x86/avx512/cmp.h | 1714 --- ffi-deps/simde/simde/x86/avx512/cmpeq.h | 241 - ffi-deps/simde/simde/x86/avx512/cmpge.h | 1434 -- ffi-deps/simde/simde/x86/avx512/cmpgt.h | 212 - ffi-deps/simde/simde/x86/avx512/cmple.h | 1432 -- ffi-deps/simde/simde/x86/avx512/cmplt.h | 123 - ffi-deps/simde/simde/x86/avx512/cmpneq.h | 490 - ffi-deps/simde/simde/x86/avx512/compress.h | 755 - ffi-deps/simde/simde/x86/avx512/conflict.h | 351 - ffi-deps/simde/simde/x86/avx512/copysign.h | 86 - ffi-deps/simde/simde/x86/avx512/cvt.h | 402 - ffi-deps/simde/simde/x86/avx512/cvts.h | 781 - ffi-deps/simde/simde/x86/avx512/cvtt.h | 130 - 
ffi-deps/simde/simde/x86/avx512/cvtus.h | 67 - ffi-deps/simde/simde/x86/avx512/dbsad.h | 388 - ffi-deps/simde/simde/x86/avx512/div.h | 162 - ffi-deps/simde/simde/x86/avx512/dpbf16.h | 281 - ffi-deps/simde/simde/x86/avx512/dpbusd.h | 292 - ffi-deps/simde/simde/x86/avx512/dpbusds.h | 344 - ffi-deps/simde/simde/x86/avx512/dpwssd.h | 269 - ffi-deps/simde/simde/x86/avx512/dpwssds.h | 299 - ffi-deps/simde/simde/x86/avx512/expand.h | 97 - ffi-deps/simde/simde/x86/avx512/extract.h | 267 - ffi-deps/simde/simde/x86/avx512/fixupimm.h | 900 -- .../simde/simde/x86/avx512/fixupimm_round.h | 687 - .../simde/simde/x86/avx512/flushsubnormal.h | 91 - ffi-deps/simde/simde/x86/avx512/fmadd.h | 136 - ffi-deps/simde/simde/x86/avx512/fmsub.h | 276 - ffi-deps/simde/simde/x86/avx512/fnmadd.h | 108 - ffi-deps/simde/simde/x86/avx512/fnmsub.h | 108 - ffi-deps/simde/simde/x86/avx512/fpclass.h | 99 - ffi-deps/simde/simde/x86/avx512/gather.h | 312 - ffi-deps/simde/simde/x86/avx512/insert.h | 490 - ffi-deps/simde/simde/x86/avx512/kand.h | 53 - ffi-deps/simde/simde/x86/avx512/knot.h | 106 - ffi-deps/simde/simde/x86/avx512/kshift.h | 152 - ffi-deps/simde/simde/x86/avx512/kxor.h | 107 - ffi-deps/simde/simde/x86/avx512/load.h | 115 - ffi-deps/simde/simde/x86/avx512/loadu.h | 297 - ffi-deps/simde/simde/x86/avx512/lzcnt.h | 220 - ffi-deps/simde/simde/x86/avx512/madd.h | 157 - ffi-deps/simde/simde/x86/avx512/maddubs.h | 159 - ffi-deps/simde/simde/x86/avx512/max.h | 611 - ffi-deps/simde/simde/x86/avx512/min.h | 611 - ffi-deps/simde/simde/x86/avx512/mov.h | 865 -- ffi-deps/simde/simde/x86/avx512/mov_mask.h | 372 - ffi-deps/simde/simde/x86/avx512/movm.h | 460 - ffi-deps/simde/simde/x86/avx512/mul.h | 279 - ffi-deps/simde/simde/x86/avx512/mulhi.h | 65 - ffi-deps/simde/simde/x86/avx512/mulhrs.h | 65 - ffi-deps/simde/simde/x86/avx512/mullo.h | 169 - ffi-deps/simde/simde/x86/avx512/multishift.h | 170 - ffi-deps/simde/simde/x86/avx512/negate.h | 88 - ffi-deps/simde/simde/x86/avx512/or.h | 308 - ffi-deps/simde/simde/x86/avx512/packs.h | 122 - ffi-deps/simde/simde/x86/avx512/packus.h | 122 - ffi-deps/simde/simde/x86/avx512/permutex.h | 101 - .../simde/simde/x86/avx512/permutex2var.h | 1645 --- ffi-deps/simde/simde/x86/avx512/permutexvar.h | 1194 -- ffi-deps/simde/simde/x86/avx512/popcnt.h | 1346 -- ffi-deps/simde/simde/x86/avx512/range.h | 745 - ffi-deps/simde/simde/x86/avx512/range_round.h | 686 - ffi-deps/simde/simde/x86/avx512/rcp.h | 65 - ffi-deps/simde/simde/x86/avx512/reduce.h | 355 - ffi-deps/simde/simde/x86/avx512/rol.h | 410 - ffi-deps/simde/simde/x86/avx512/rolv.h | 415 - ffi-deps/simde/simde/x86/avx512/ror.h | 410 - ffi-deps/simde/simde/x86/avx512/rorv.h | 391 - ffi-deps/simde/simde/x86/avx512/round.h | 282 - ffi-deps/simde/simde/x86/avx512/roundscale.h | 616 - .../simde/simde/x86/avx512/roundscale_round.h | 690 - ffi-deps/simde/simde/x86/avx512/sad.h | 77 - ffi-deps/simde/simde/x86/avx512/scalef.h | 389 - ffi-deps/simde/simde/x86/avx512/set.h | 572 - ffi-deps/simde/simde/x86/avx512/set1.h | 352 - ffi-deps/simde/simde/x86/avx512/set4.h | 140 - ffi-deps/simde/simde/x86/avx512/setone.h | 72 - ffi-deps/simde/simde/x86/avx512/setr.h | 144 - ffi-deps/simde/simde/x86/avx512/setr4.h | 140 - ffi-deps/simde/simde/x86/avx512/setzero.h | 105 - ffi-deps/simde/simde/x86/avx512/shldv.h | 157 - ffi-deps/simde/simde/x86/avx512/shuffle.h | 417 - ffi-deps/simde/simde/x86/avx512/sll.h | 247 - ffi-deps/simde/simde/x86/avx512/slli.h | 179 - ffi-deps/simde/simde/x86/avx512/sllv.h | 122 - ffi-deps/simde/simde/x86/avx512/sqrt.h | 127 - 
ffi-deps/simde/simde/x86/avx512/sra.h | 81 - ffi-deps/simde/simde/x86/avx512/srai.h | 96 - ffi-deps/simde/simde/x86/avx512/srav.h | 67 - ffi-deps/simde/simde/x86/avx512/srl.h | 216 - ffi-deps/simde/simde/x86/avx512/srli.h | 180 - ffi-deps/simde/simde/x86/avx512/srlv.h | 282 - ffi-deps/simde/simde/x86/avx512/store.h | 93 - ffi-deps/simde/simde/x86/avx512/storeu.h | 218 - ffi-deps/simde/simde/x86/avx512/sub.h | 351 - ffi-deps/simde/simde/x86/avx512/subs.h | 222 - .../simde/simde/x86/avx512/ternarylogic.h | 3769 ----- ffi-deps/simde/simde/x86/avx512/test.h | 232 - ffi-deps/simde/simde/x86/avx512/testn.h | 63 - ffi-deps/simde/simde/x86/avx512/types.h | 821 -- ffi-deps/simde/simde/x86/avx512/unpackhi.h | 753 - ffi-deps/simde/simde/x86/avx512/unpacklo.h | 752 - ffi-deps/simde/simde/x86/avx512/xor.h | 319 - ffi-deps/simde/simde/x86/avx512/xorsign.h | 72 - ffi-deps/simde/simde/x86/clmul.h | 387 - ffi-deps/simde/simde/x86/f16c.h | 172 - ffi-deps/simde/simde/x86/fma.h | 732 - ffi-deps/simde/simde/x86/gfni.h | 1295 -- ffi-deps/simde/simde/x86/mmx.h | 2398 --- ffi-deps/simde/simde/x86/sse.h | 4830 ------ ffi-deps/simde/simde/x86/sse2.h | 7737 ---------- ffi-deps/simde/simde/x86/sse3.h | 515 - ffi-deps/simde/simde/x86/sse4.1.h | 2367 --- ffi-deps/simde/simde/x86/sse4.2.h | 381 - ffi-deps/simde/simde/x86/ssse3.h | 1057 -- ffi-deps/simde/simde/x86/svml.h | 12129 ---------------- ffi-deps/simde/simde/x86/xop.h | 3740 ----- identity/Cargo.toml | 1 - identity/src/lib.rs | 92 +- 256 files changed, 1950 insertions(+), 132728 deletions(-) delete mode 100644 crypto/build.rs create mode 100644 crypto/src/fourq.rs create mode 100644 crypto/src/fourq/consts.rs create mode 100644 crypto/src/fourq/ops.rs create mode 100644 crypto/src/fourq/types.rs delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/FourQ.h delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/FourQ_api.h delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/FourQ_internal.h delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/FourQ_params.h delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/FourQ_tables.h delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/README.md delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/Visual Studio/FourQ/FourQ.sln delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/Visual Studio/FourQ/FourQ.vcxproj delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/Visual Studio/FourQ/FourQ.vcxproj.filters delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/Visual Studio/crypto_tests/crypto_tests.vcxproj delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/Visual Studio/crypto_tests/crypto_tests.vcxproj.filters delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/Visual Studio/ecc_tests/ecc_tests.vcxproj delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/Visual Studio/ecc_tests/ecc_tests.vcxproj.filters delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/Visual Studio/fp_tests/fp_tests.vcxproj delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/Visual Studio/fp_tests/fp_tests.vcxproj.filters delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/crypto_util.c delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/eccp2.c delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/generic/fp.h delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/kex.c delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/makefile delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/schnorrq.c delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/table_lookup.h delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/tests/crypto_tests.c delete mode 100644 
ffi-deps/FourQlib/FourQ_32bit/tests/ecc_tests.c delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/tests/fp_tests.c delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/tests/test_extras.c delete mode 100644 ffi-deps/FourQlib/FourQ_32bit/tests/test_extras.h delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/AMD64/consts.c delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/AMD64/fp2_1271.S delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/AMD64/fp2_1271_AVX2.S delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/AMD64/fp_x64.h delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/ARM64/fp_arm64.h delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ.h delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ_api.h delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ_internal.h delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ_params.h delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ_tables.h delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/README.md delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/FourQ/FourQ.sln delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/FourQ/FourQ.vcxproj delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/FourQ/FourQ.vcxproj.filters delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/crypto_tests/crypto_tests.vcxproj delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/crypto_tests/crypto_tests.vcxproj.filters delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/ecc_tests/ecc_tests.vcxproj delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/ecc_tests/ecc_tests.vcxproj.filters delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/fp_tests/fp_tests.vcxproj delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/fp_tests/fp_tests.vcxproj.filters delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/crypto_util.c delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/eccp2.c delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/eccp2_core.c delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/eccp2_no_endo.c delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/generic/fp.h delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/hash_to_curve.c delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/kex.c delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/makefile delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/schnorrq.c delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/table_lookup.h delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/tests/crypto_tests.c delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/tests/ecc_tests.c delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/tests/fp_tests.c delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/tests/test_extras.c delete mode 100644 ffi-deps/FourQlib/FourQ_64bit_and_portable/tests/test_extras.h delete mode 100644 ffi-deps/FourQlib/LICENSE delete mode 100644 ffi-deps/FourQlib/README.md delete mode 100644 ffi-deps/FourQlib/SECURITY.md delete mode 100644 ffi-deps/FourQlib/random/random.c delete mode 100644 ffi-deps/FourQlib/random/random.h delete mode 100644 ffi-deps/FourQlib/sha512/sha512.c delete mode 100644 ffi-deps/FourQlib/sha512/sha512.h delete 
mode 100644 ffi-deps/K12/README.markdown delete mode 100644 ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S delete mode 100644 ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-SnP.h delete mode 100644 ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-opt64.c delete mode 100644 ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-SnP.h delete mode 100644 ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c delete mode 100644 ffi-deps/K12/lib/KangarooTwelve.c delete mode 100644 ffi-deps/K12/lib/KangarooTwelve.h delete mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s delete mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c delete mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s delete mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-SnP.h delete mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c delete mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c delete mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c delete mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c delete mode 100644 ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c delete mode 100644 ffi-deps/K12/lib/Plain64/KeccakP-1600-SnP.h delete mode 100644 ffi-deps/K12/lib/Plain64/KeccakP-1600-plain64.c delete mode 100644 ffi-deps/K12/lib/align.h delete mode 100644 ffi-deps/K12/lib/brg_endian.h delete mode 100644 ffi-deps/chopper-linux.cpp delete mode 100644 ffi-deps/chopper-win.cpp delete mode 100644 ffi-deps/simde/CONTRIBUTING.md delete mode 100644 ffi-deps/simde/COPYING delete mode 100644 ffi-deps/simde/README.md delete mode 100644 ffi-deps/simde/simde/check.h delete mode 100644 ffi-deps/simde/simde/debug-trap.h delete mode 100644 ffi-deps/simde/simde/hedley.h delete mode 100644 ffi-deps/simde/simde/simde-aes.h delete mode 100644 ffi-deps/simde/simde/simde-align.h delete mode 100644 ffi-deps/simde/simde/simde-arch.h delete mode 100644 ffi-deps/simde/simde/simde-bf16.h delete mode 100644 ffi-deps/simde/simde/simde-common.h delete mode 100644 ffi-deps/simde/simde/simde-complex.h delete mode 100644 ffi-deps/simde/simde/simde-constify.h delete mode 100644 ffi-deps/simde/simde/simde-detect-clang.h delete mode 100644 ffi-deps/simde/simde/simde-diagnostic.h delete mode 100644 ffi-deps/simde/simde/simde-f16.h delete mode 100644 ffi-deps/simde/simde/simde-features.h delete mode 100644 ffi-deps/simde/simde/simde-math.h delete mode 100644 ffi-deps/simde/simde/x86/aes.h delete mode 100644 ffi-deps/simde/simde/x86/avx.h delete mode 100644 ffi-deps/simde/simde/x86/avx2.h delete mode 100644 ffi-deps/simde/simde/x86/avx512.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/2intersect.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/4dpwssd.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/4dpwssds.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/abs.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/add.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/adds.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/and.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/andnot.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/avg.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/bitshuffle.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/blend.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/broadcast.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/cast.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/cmp.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/cmpeq.h delete mode 100644 
ffi-deps/simde/simde/x86/avx512/cmpge.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/cmpgt.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/cmple.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/cmplt.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/cmpneq.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/compress.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/conflict.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/copysign.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/cvt.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/cvts.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/cvtt.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/cvtus.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/dbsad.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/div.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/dpbf16.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/dpbusd.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/dpbusds.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/dpwssd.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/dpwssds.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/expand.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/extract.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/fixupimm.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/fixupimm_round.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/flushsubnormal.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/fmadd.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/fmsub.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/fnmadd.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/fnmsub.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/fpclass.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/gather.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/insert.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/kand.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/knot.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/kshift.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/kxor.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/load.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/loadu.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/lzcnt.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/madd.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/maddubs.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/max.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/min.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/mov.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/mov_mask.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/movm.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/mul.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/mulhi.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/mulhrs.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/mullo.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/multishift.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/negate.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/or.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/packs.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/packus.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/permutex.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/permutex2var.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/permutexvar.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/popcnt.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/range.h delete mode 100644 
ffi-deps/simde/simde/x86/avx512/range_round.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/rcp.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/reduce.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/rol.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/rolv.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/ror.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/rorv.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/round.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/roundscale.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/roundscale_round.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/sad.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/scalef.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/set.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/set1.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/set4.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/setone.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/setr.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/setr4.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/setzero.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/shldv.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/shuffle.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/sll.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/slli.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/sllv.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/sqrt.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/sra.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/srai.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/srav.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/srl.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/srli.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/srlv.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/store.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/storeu.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/sub.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/subs.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/ternarylogic.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/test.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/testn.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/types.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/unpackhi.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/unpacklo.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/xor.h delete mode 100644 ffi-deps/simde/simde/x86/avx512/xorsign.h delete mode 100644 ffi-deps/simde/simde/x86/clmul.h delete mode 100644 ffi-deps/simde/simde/x86/f16c.h delete mode 100644 ffi-deps/simde/simde/x86/fma.h delete mode 100644 ffi-deps/simde/simde/x86/gfni.h delete mode 100644 ffi-deps/simde/simde/x86/mmx.h delete mode 100644 ffi-deps/simde/simde/x86/sse.h delete mode 100644 ffi-deps/simde/simde/x86/sse2.h delete mode 100644 ffi-deps/simde/simde/x86/sse3.h delete mode 100644 ffi-deps/simde/simde/x86/sse4.1.h delete mode 100644 ffi-deps/simde/simde/x86/sse4.2.h delete mode 100644 ffi-deps/simde/simde/x86/ssse3.h delete mode 100644 ffi-deps/simde/simde/x86/svml.h delete mode 100644 ffi-deps/simde/simde/x86/xop.h diff --git a/api/src/response.rs b/api/src/response.rs index 10abf2b..7c37713 100644 --- a/api/src/response.rs +++ b/api/src/response.rs @@ -29,6 +29,7 @@ pub fn get_formatted_response(response: &mut QubicApiPacket) { Err(_err) => {} } } else { + println!("{:?}", response); println!("Malformed Current Tick Response."); } } diff --git 
a/api/src/response/response_entity.rs b/api/src/response/response_entity.rs index a32c089..1276208 100644 --- a/api/src/response/response_entity.rs +++ b/api/src/response/response_entity.rs @@ -1,6 +1,6 @@ use crate::QubicApiPacket; use crate::response::FormatQubicResponseDataToStructure; -use crate::identity::get_identity_from_pub_key; +use crate::crypto::qubic_identities::get_identity; #[derive(Debug, Clone)] pub struct ResponseEntity { pub identity: String, @@ -63,7 +63,12 @@ pub fn handle_response_entity(response: &mut QubicApiPacket) -> Option "_X86_", - "x86_64" => "_X86_", - _ => "_AMD64_" - }; - let extra_four_q_define: &str = match cpu { - "_X86_" => "_BOGUS_", - _ => "_ARM_" //Mac M1 need this - }; - - if os == "windows" { - return cc::Build::new() - .file("../ffi-deps/chopper-win.cpp") - .define("_MSC_VER", "1") - .define("_AMD64_", "1") - .compile("Chopper"); - } - - cc::Build::new() - .define("__LINUX__", "1") - .define(cpu, "1") - .define(extra_four_q_define, "1") - .define("_AVX_", "1") - .define("USE_ENDO", "true") - .include("../ffi-deps/FourQlib/FourQ_32bit") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/crypto_util.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/schnorrq.c") - .file("../ffi-deps/FourQlib/FourQ_32bit/kex.c") - .file("../ffi-deps/FourQlib/random/random.c") - .file("../ffi-deps/FourQlib/sha512/sha512.c") - .compile("libFourQ"); - - let mut binding = cc::Build::new(); - let k12 = binding - .include("../ffi-deps/K12/lib") - .file("../ffi-deps/K12/lib/KangarooTwelve.c"); - - if os == "linux" { - k12 - .include("../ffi-deps/K12/lib/Optimized64") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c") - .file("../ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c") - .flag("-march=native") - .flag("-mavx512vl") - .flag("-mavx512f") - .flag("-msse3") - .compile("KangarooTwelve"); - - cc::Build::new() - .file("../ffi-deps/chopper-linux.cpp") - .define("__LINUX__", "1") - .define(cpu, "1") - .compile("Chopper") - } else { - k12 - .include("../ffi-deps/K12/lib/Inplace32BI") - .file("../ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c") - .compile("KangarooTwelve"); - - cc::Build::new() - .file("../ffi-deps/chopper-linux.cpp") - .define("__LINUX__", "1") - .define("_AMD64_", "1") - .compile("Chopper") - - } -} diff --git a/crypto/src/fourq.rs b/crypto/src/fourq.rs new file mode 100644 index 0000000..66c16f3 --- /dev/null +++ b/crypto/src/fourq.rs @@ -0,0 +1,7 @@ +/* + Thanks To Mineco for Native Rust FourQ!: https://github.com/Mineco1006/qubic-utils +*/ + +pub mod consts; +pub mod types; +pub mod ops; \ No newline at end of file diff --git a/crypto/src/fourq/consts.rs b/crypto/src/fourq/consts.rs new file mode 100644 index 0000000..3f66260 --- /dev/null +++ b/crypto/src/fourq/consts.rs @@ -0,0 +1,402 @@ +#![allow(dead_code)] +pub const CURVE_ORDER_0: u64 = 0x2FB2540EC7768CE7; +pub const CURVE_ORDER_1: u64 = 0xDFBD004DFE0F7999; +pub const CURVE_ORDER_2: u64 = 0xF05397829CBC14E5; +pub const CURVE_ORDER_3: u64 = 0x0029CBC14E5E0A72; + 
+pub const MONTGOMERY_SMALL_R_PRIME_0: u64 = 0xE12FE5F079BC3929; +pub const MONTGOMERY_SMALL_R_PRIME_1: u64 = 0xD75E78B8D1FCDCF3; +pub const MONTGOMERY_SMALL_R_PRIME_2: u64 = 0xBCE409ED76B5DB21; +pub const MONTGOMERY_SMALL_R_PRIME_3: u64 = 0xF32702FDAFC1C074; + +pub const B11: u64 = 0xF6F900D81F5F5E6A; +pub const B12: u64 = 0x1363E862C22A2DA0; +pub const B13: u64 = 0xF8BD9FCE1337FCF1; +pub const B14: u64 = 0x084F739986B9E651; +pub const B21: u64 = 0xE2B6A4157B033D2C; +pub const B22: u64 = 0x0000000000000001; +pub const B23: u64 = 0xFFFFFFFFFFFFFFFF; +pub const B24: u64 = 0xDA243A43722E9830; +pub const B31: u64 = 0xE85452E2DCE0FCFE; +pub const B32: u64 = 0xFD3BDEE51C7725AF; +pub const B33: u64 = 0x2E4D21C98927C49F; +pub const B34: u64 = 0xF56190BB3FD13269; +pub const B41: u64 = 0xEC91CBF56EF737C1; +pub const B42: u64 = 0xCEDD20D23C1F00CE; +pub const B43: u64 = 0x068A49F02AA8A9B5; +pub const B44: u64 = 0x18D5087896DE0AEA; +pub const C1: u64 = 0x72482C5251A4559C; +pub const C2: u64 = 0x59F95B0ADD276F6C; +pub const C3: u64 = 0x7DD2D17C4625FA78; +pub const C4: u64 = 0x6BC57DEF56CE8877; + +pub const PARAMETER_D: [u64; 4] = [0x0000000000000142, 0x00000000000000E4, 0xB3821488F1FC0C8D, 0x5E472F846657E0FC]; +pub const PARAMETER_D_F2ELM: [[u64; 2]; 2] = [[0x0000000000000142, 0x00000000000000E4], [0xB3821488F1FC0C8D, 0x5E472F846657E0FC]]; +pub const CURVE_ORDER: [u64; 4] = [CURVE_ORDER_0, CURVE_ORDER_1, CURVE_ORDER_2, CURVE_ORDER_3]; +pub const MONTGOMERY_R_PRIME: [u64; 4] = [0xC81DB8795FF3D621, 0x173EA5AAEA6B387D, 0x3D01B7C72136F61C, 0x0006A5F16AC8F9D3]; +pub const ONE: [u64; 4] = [1, 0, 0, 0]; + +pub const C_TAU_1: [u64; 4] = [0x74DCD57CEBCE74C3, 0x1964DE2C3AFAD20C, 0x12, 0x0C]; +pub const C_TAU_DUAL_1: [u64; 4] = [0x9ECAA6D9DECDF034, 0x4AA740EB23058652, 0x11, 0x7FFFFFFFFFFFFFF4]; +pub const C_PHI_0: [u64; 4] = [0xFFFFFFFFFFFFFFF7, 0x05, 0x4F65536CEF66F81A, 0x2553A0759182C329]; +pub const C_PHI_1: [u64; 4] = [0x07, 0x05, 0x334D90E9E28296F9, 0x62C8CAA0C50C62CF]; +pub const C_PHI_2: [u64; 4] = [0x15, 0x0F, 0x2C2CB7154F1DF391, 0x78DF262B6C9B5C98]; +pub const C_PHI_3: [u64; 4] = [0x03, 0x02, 0x92440457A7962EA4, 0x5084C6491D76342A]; +pub const C_PHI_4: [u64; 4] = [0x03, 0x03, 0xA1098C923AEC6855, 0x12440457A7962EA4]; +pub const C_PHI_5: [u64; 4] = [0x0F, 0x0A, 0x669B21D3C5052DF3, 0x459195418A18C59E]; +pub const C_PHI_6: [u64; 4] = [0x18, 0x12, 0xCD3643A78A0A5BE7, 0x0B232A8314318B3C]; +pub const C_PHI_7: [u64; 4] = [0x23, 0x18, 0x66C183035F48781A, 0x3963BC1C99E2EA1A]; +pub const C_PHI_8: [u64; 4] = [0xF0, 0xAA, 0x44E251582B5D0EF0, 0x1F529F860316CBE5]; +pub const C_PHI_9: [u64; 4] = [0xBEF, 0x870, 0x14D3E48976E2505, 0xFD52E9CFE00375B]; + +pub const C_PSI_1: [u64; 4] = [0xEDF07F4767E346EF, 0x2AF99E9A83D54A02, 0x13A, 0xDE]; +pub const C_PSI_2: [u64; 4] = [0x143, 0xE4, 0x4C7DEB770E03F372, 0x21B8D07B99A81F03]; +pub const C_PSI_3: [u64; 4] = [0x09, 0x06, 0x3A6E6ABE75E73A61, 0x4CB26F161D7D6906]; +pub const C_PSI_4: [u64; 4] = [0xFFFFFFFFFFFFFFF6, 0x7FFFFFFFFFFFFFF9, 0xC59195418A18C59E, 0x334D90E9E28296F9]; + +pub const ELL_1: [u64; 4] = [0x259686E09D1A7D4F, 0xF75682ACE6A6BD66, 0xFC5BB5C5EA2BE5DF, 0x07]; +pub const ELL_2: [u64; 4] = [0xD1BA1D84DD627AFB, 0x2BD235580F468D8D, 0x8FD4B04CAA6C0F8A, 0x03]; +pub const ELL_3: [u64; 4] = [0x9B291A33678C203C, 0xC42BD6C965DCA902, 0xD038BF8D0BFFBAF6, 0x00]; +pub const ELL_4: [u64; 4] = [0x12E5666B77E7FDC0, 0x81CBDC3714983D82, 0x1B073877A22D8410, 0x03]; + +pub static FIXED_BASE_TABLE: [u64; 960] = [ + 0xe18a34f3a703e631, 0x287460bf1d502b5f, 0xe02e62f7e4f90353, 
0x0c3ba0378b86acde, 0x90bf0f98b0937edc, 0x740b7c7824f0c555, 0xb321239123a01366, 0x4ffcf5b93a9557a5, 0x297afccbabda42bb, 0x5948d137556c97c6, 0xa8189a393330684c, 0x0caf2b720a341f27 + , 0x3a8ba018fd188787, 0x5546128188dd12a8, 0xb0b3cc33c09f9b77, 0x1baeeaf8b84d2049, 0x006425a611faf900, 0x18f7cd12e1a6f789, 0x6dccf09a12556066, 0x448e05eeace7b6eb, 0xbf2f33689d2829b0, 0x6d911dcb2957bdb4, 0x9f2353dbdc3c03ee, 0x06c54305babee501 + , 0x2eaf45713dafa125, 0x72963058648a364d, 0x61b7771f9d313ef2, 0x4f41c7f8bfe2b069, 0x408623ae599790ac, 0x4d33858644330a42, 0xfc5696649cdd7487, 0x74df72e0e598e114, 0xc9a06325913c110b, 0x076bd4115fe4b0d8, 0x76619e65d6bff3d9, 0x249240147cee3a08 + , 0xd695b96148965a73, 0x28aac8a28829f706, 0x41f1c05329f7a57b, 0x441ca9e89f03e00e, 0xe1aa38ab8bf7241e, 0x58f28cafc832b7f4, 0xcadaf8b8fa5400c6, 0x34b6d106284e863e, 0xf5498cab3af15097, 0x6dbe7790017d9c49, 0x63bf76a81448e8bc, 0x6371925bf23ae006 + , 0xc5e2c721bded81fa, 0x4ede70eed68056ab, 0x8f3cd9b5b4975810, 0x4752fd192f0a9aa8, 0x318794eb1f734414, 0x11ddf7d2c8468662, 0x2613b06f72b1a34e, 0x465575b37ab06770, 0x40b9845f82638d2b, 0x48894050790298ce, 0xbedb93a501b4f131, 0x04f3560d2889b2fb + , 0x457dd875115b278b, 0x56f25ee54d92858a, 0x92d4c1cdce0c977e, 0x078fca4187d74996, 0x3bbb2ded76cc22a1, 0x117b28853ddc2bf6, 0x43f3767cb9c2baa2, 0x73079e25e0ea8a8f, 0x0177992b5a15796d, 0x2e77721480d9ef92, 0xbe09883567372916, 0x258f176b7af7576d + , 0x308338fd6168391b, 0x7285925f9a7353a4, 0x862c0fd04fe85114, 0x53259ee7423aeb51, 0xfe0031a84b3b1a68, 0x1a4f1d661fa071fc, 0x2ddd54168dc928a7, 0x60185c1adf196a6a, 0x49809717dc6da9b4, 0x6062094b4dcffc03, 0xa41ea6fa05fa7e8d, 0x4a4fe06f277148a0 + , 0x7bb253a9ee9e80f0, 0x419a928bccb11733, 0x84323be66a9a039e, 0x01b2d1ae972814bb, 0xa7588584d3051231, 0x54df1e20cc979dd7, 0x91d906fe3e2f22dd, 0x4e36e9975fdf1a0f, 0xd81871746b747634, 0x3e5e31baeee13433, 0xe4da80979573baa3, 0x4b852ad97cfe77c6 + , 0xe08b346714418b9e, 0x283d719b2fe6ef88, 0xb7339d2de45c180b, 0x75acfcef11d2d5c8, 0x8f40777a8c561876, 0x0c54ac40a7134c4b, 0xb92e287d66baee08, 0x6f357e5006a188bf, 0xc5903319ed1e6971, 0x747c45ef91dafd40, 0xde4086a91d2f816e, 0x5dcb27edb3b3ef7d + , 0x43fdc46cfa1dd2ee, 0x51551f9f70966498, 0xb54534f761ed9bdc, 0x453455b3073fb07f, 0xf24773e383cab70b, 0x679be25e758cf4df, 0xda17edf2943eee29, 0x3dc9e5b8d6dc0f66, 0x56a50cba413fb75b, 0x1e65315bc5a8537f, 0x5ff90242802c7213, 0x73c9d8c8f425252e + , 0x3c637b8633198c8f, 0x534f84b3ed414f33, 0xad313e72dedd6902, 0x5ed57e941cdf33af, 0x5a6fe01d2a57306e, 0x73b63dea344713f9, 0x39cb70570f1c2bf3, 0x2df8c6e49f1a18db, 0x661bc349677797e4, 0x501ae7cbbebe9062, 0x5b52a88de8959643, 0x0372752811c01d51 + , 0x010c57a2301bb928, 0x378b317155554fc6, 0xf883fa4229a02cf1, 0x5f0047b850d7db29, 0x4d247ae328402daa, 0x0d030627a850a2bc, 0xb4e65d9a88a443f5, 0x6ec9686b2d6db089, 0xde202e08fea1d987, 0x5c64e1d3f28d7600, 0x157d17bef661bfb7, 0x56392d36dd75334c + , 0xe25478d8bd19155c, 0x146d4f2d3d336afd, 0x9bfbe00bf94e15e8, 0x2b185a9a6adf10c0, 0x926527b3ed52ab7b, 0x67997e1473101e80, 0xb58f4ff4947cc541, 0x36f800c7fac99a7a, 0xd0302e32400456d9, 0x4372e43640bc697b, 0x9144cabb4750d898, 0x75d25afac9a23cbf + , 0x794591767655cbfe, 0x74db216617fc4b07, 0x7057b2242566d0c9, 0x1d543b5908417b23, 0x19c280b444428783, 0x352309fd8b6cc3ef, 0x37833d6ac068ae72, 0x4ec0671a23c019f4, 0x9d9836e1a3d05bb5, 0x44fe1adff224efe3, 0xa296bc3ce57efb4a, 0x2efec86835a14150 + , 0x2fe19c09fb194bca, 0x18cc07d3953cd206, 0x5bdff217c9c0b9e0, 0x671aa756581abcee, 0xe1cc33ae28f7d1a2, 0x1b6f254937a0a3fe, 0x51503d1665babb83, 0x74b95636d5889211, 0xbdb97ae4ea96f869, 0x1507ce189e2510bd, 
0x796e4d54fab93b13, 0x6a81765f05960929 + , 0x2e940521e5a833ed, 0x3bdea532b245f644, 0xbea76975ffd52693, 0x64b94848ba6d4ed6, 0x9db52d0194e33ec7, 0x71cf65da55639f25, 0xede73b1fdb5a8138, 0x12e4d13b6c62dc22, 0x9d19b0c265185517, 0x77a011d257b5fdd0, 0x1fedc5caaecd84e4, 0x46844e151e3492d1 + , 0x7a423a31904220df, 0x5b3165c747e8f099, 0x1c665eeadf35e22e, 0x7802b556fc45595b, 0x85a2def4015bd2de, 0x17f2ab87957166ad, 0x19cf6d352060c1e5, 0x122a7ad1be408e6a, 0x5b79bbc8645bf766, 0x20fb009d4d0adacf, 0x97526a272ba28538, 0x7041b4e90d420bde + , 0x3b30113358dab057, 0x3d398b66f0d24243, 0x91a5999a03cd4708, 0x1eae2409cd938096, 0x66dd6b604c36108c, 0x1713083789081968, 0x57cad6917125dcfd, 0x34b06cb89704f1ca, 0xdcafe8e71f35abf2, 0x698331198d544db9, 0x6287676643af075b, 0x200950e5559d2b6d + , 0xd4f63fc3ecdd9074, 0x7473317142ac13a2, 0x96b0030805319356, 0x2c20ffe0244378ba, 0x4889511ad26ac01a, 0x4ee327219997fcf6, 0x15ffe6e70f0bf8ea, 0x6b617fb4a6d0a6d7, 0x4916dca1c52f7324, 0x3c8269f086468277, 0xc24210c4c837e04b, 0x4e480b4f915a542c + , 0xc5fef3b09a7fe35e, 0x31a501de44fd84b2, 0x79f29e4940a407b9, 0x0ba7e03ca5cce5ab, 0xa7a8b2058a74d8ea, 0x46f4c7810e26dadc, 0x46171ace94a1128a, 0x44db55025495a811, 0x7f889e1a4bf18d5c, 0x4d4f172a43f306b2, 0x33a99766bb1cffad, 0x6254775924d39aca + , 0xd855230ec225136e, 0x1c544dd078d9211d, 0x12fe9969f63f63ba, 0x069af1dc949dd382, 0x305bcf40cfe5c256, 0x63ae90924bbbb595, 0xe451097793b7de06, 0x09780cf39fc0043e, 0x827af8e7eb798871, 0x3ace8a6c77577a37, 0x79df061332e055ba, 0x561dc07aaacea92b + , 0x7e4422d9820d2673, 0x6b85df83e0af5348, 0x1f151ac1ded8526b, 0x35ead8e5157142bd, 0x6da6ef6c33c79dd4, 0x5f2ea04d2594fde4, 0x91037d0cc027d5fa, 0x53b5401007b0331b, 0x810f198a3d4ba5a3, 0x4463bd259ba94195, 0x32b894acec2acf9e, 0x78711761d64349ce + , 0x253ae1b3f51fe211, 0x409e4b3f535b6463, 0x3a236d10da5e49de, 0x19d2b1029c21336a, 0x2835f40436aadd90, 0x0942a31505190b19, 0xc189131876828279, 0x3afe96c3ca8e1f9c, 0x9f1801b491230693, 0x39e28db8625fd091, 0x9fab50355dd44c8e, 0x145155da729b280d + , 0xd3ccf8101d4d76d5, 0x5a0faa1a8c2b6c68, 0x3cc66c84cb54ea8a, 0x51052ce3f566c773, 0x3bee14de65ae9ff5, 0x7586118a01ccf024, 0x089e791c896bf15e, 0x35ff022d261d93d6, 0xcd3ce13d8f7d1cf9, 0x4f1de98f95b7b8f6, 0x51e68a2462dc41b4, 0x61ad9e3c23f6dd29 + , 0x584fea6480ebdb51, 0x5d52fe073f9decf3, 0x9afe483eadf336d5, 0x1dfa03c980b1696a, 0x55f73d47ff819a19, 0x697bf55d361100ed, 0xded4804446399419, 0x618c94467fce259f, 0xf2597ff1f08ef50c, 0x07c935b98dd933c0, 0xbb758cbc78ded5f6, 0x1e9a0d06af13148f + , 0x879ce1457f4cd4db, 0x28396ca1962d4994, 0xf5095a3dc57605c3, 0x1e570f3da4c527b1, 0x2af69a3904935787, 0x591ee376fdd01cce, 0xf77b58df88bc8633, 0x5464d651b2f395d1, 0xafbc096b1e9a86ae, 0x6ce2df4bf65b6b28, 0x3b3a828d2e9d3e08, 0x6382011d8d2d66d0 + , 0x94987ca64d3d193d, 0x50ddf70d3b6d56af, 0x8d5df67cc8ad15a9, 0x39208098bc5b1f92, 0xce99f520dfd5a4fb, 0x323bbc87b86a7ba9, 0xe13f88a8d803c789, 0x56ffdcbdf2200055, 0x3aff0da31b24c72d, 0x70011566460c0c16, 0x76f7b7f53ac46a13, 0x1c069bfeb7077bc2 + , 0x8f47193ca14a3c36, 0x6d73e34af088de3d, 0x634b2bd9317d6634, 0x5b404738b77f1ec8, 0xf34fabb71ca1cb1d, 0x054abbcaca546a46, 0xe8cdcadd08eda660, 0x6971abbf958bdef1, 0x41338557dddb4eaf, 0x1e158585b079b67c, 0xd2270474cfa26068, 0x53b36d32b3cea469 + , 0x011523c16c543d08, 0x4668e92c5f73314e, 0xbaef3ebe4117acd1, 0x04037d1aa713931a, 0x68e118e4e390c68d, 0x6b80cd55a44c1575, 0x7307ea8a5729c032, 0x5cc5475feee99ab2, 0x34450e424c14ac75, 0x3f09157e5db3dcd8, 0x62ce2b1b50588052, 0x27a899c54e652f8f + , 0x0acd039f2fc2a5ed, 0x4b4044ddd5813eec, 0xc04d189e90a75958, 0x242551bce71d33a1, 0xd95af96b51f87f05, 
0x02988820f809d815, 0xb27f65f73b9483c5, 0x2ef60745f4364b43, 0xcb66bdc93f4fb8b9, 0x2b86c9b48756bb8a, 0xf8ebdae09b9867a1, 0x441e70184e6fe9aa + , 0xfdc2530330cc1289, 0x47d8d65a8b4d6992, 0x8c03b6fa30ae74be, 0x1ca8693cc3bd99d5, 0x699eb1511018f2a6, 0x3da04764d9f4fff5, 0x361720433d3aab59, 0x2fa911612cb857ff, 0xa4057da10c2f1cac, 0x48a219b933a5c619, 0x42341020d15f0bc5, 0x73f8895046a09dad + , 0x1bad5312c67421b8, 0x4194771b368e622e, 0x8cc71a79e44e0dff, 0x4b4564e45467f1c2, 0x7759f16aafe52093, 0x391b71dcd75fbea9, 0x2a1c0694ab4ef798, 0x023087545444130d, 0x4b7ae1ffcfaa1aa1, 0x64e26f32d73361e7, 0x8da47038bd0b54b9, 0x148cfa6feaecee15 + , 0x3756d4d479c2cc3d, 0x25d44ea8d31543de, 0xd82c8bef26bb2c43, 0x2c2047033d27f37f, 0x5bd33d9837dad260, 0x77943117a3383b7d, 0x12071d697ea583f2, 0x3c7c41272a225bf2, 0x92ebbdfaf1f03ad3, 0x5d61030c68b63704, 0xca6e2853baee75d1, 0x12404b34771a3636 + , 0xbe13c46326667e4f, 0x2bd261916f9be3b0, 0x86e3f8cbadc80f89, 0x74520d8a1794cb48, 0x1e15c745024cf97e, 0x5cee741e1e53eb02, 0x8d088de0af99cda1, 0x625812961cc0862c, 0x4313437321c0e934, 0x60bbc768c424f7a4, 0xaba71fbf3c10e143, 0x37b8ea9f14a915b8 + , 0x8d96ec65c40213ff, 0x74a08828ff77845c, 0xbedb7194daf607a3, 0x17e86671161c8706, 0xaceb98e0524059cf, 0x68552ac494916f09, 0x4cd2971baf1b3c47, 0x68442ebcdde21b70, 0x19629b8c0e867595, 0x6a6955d3635fa47a, 0x6fab45e0f2e393ad, 0x66dd3ef4fcf050c4 + , 0xbb0b7abcfddc7df1, 0x14eb5b751b0bcf9c, 0x1cf79f9ca2fd411d, 0x5c496f73fff0600a, 0x49648d8555426d70, 0x46c1016a2322d8a9, 0xb57fdb870d9b6d4f, 0x609eb65209ddb633, 0xe70f9166bedc82c5, 0x772fb5b5c8afaf27, 0x79a294d9b0227a20, 0x7f75b141112dbc8d + , 0x98d1c7f88e070020, 0x5953d0aac48217b1, 0xe28253ebe15f33ff, 0x267d1dc11e614c45, 0xbe64f50ab99e2246, 0x4eaaab5c82fe5495, 0x927d5ac07e60bed0, 0x67d3786de6aa1b4d, 0xa71962bf0f6e2945, 0x63d93844a35eea9b, 0xb34228c7d26640ac, 0x169c38d2eb28f5a1 + , 0x4b7972b33439dc22, 0x71478457cdaa1e14, 0x5226e125ec1d58c7, 0x669d8796e78fd4f1, 0x750dd1aaaa44a07f, 0x327c62b55aebbecf, 0x006b8e95b54fbd25, 0x2ab3f95d01eb364e, 0xfcbe5080c0d5e196, 0x2a1b9bd75a57e725, 0x1d2b2b6758139b5d, 0x751cf4af849b7a73 + , 0x164a7d2e337d00a5, 0x00cee3a4cb83a4bc, 0x3498e0366dbe28f9, 0x053d899148d28502, 0x01665d64cab0fb69, 0x4a99132208d68e74, 0xba44bbd4bd3f915d, 0x1d34b0f9172122bb, 0x5d114dc729e8a9f3, 0x08e7a43dd5334b60, 0x28db8e9232f0f3e8, 0x5cb7be1b80264f62 + , 0x9af2c78782508f23, 0x336ae7ccf7e3a1b2, 0x7fe2d4ee2dd194be, 0x573d2e1b2b8a6872, 0x3332ea3363b2ea36, 0x200bc1375b1f4243, 0x65c47c8c06b3260d, 0x42021fca53995c5e, 0x2f7e6cf49bb19946, 0x311fba6a23196d2c, 0xc30c13b62be0d70d, 0x61eeac142711b0dc + , 0x88526996597d35d4, 0x70169bcbe6bd21d7, 0xa0f1b2d0ad29a510, 0x2ade531472c1b94d, 0x11e320dc189873e7, 0x2d2a1794e85cdb38, 0xa0a8c453a6f621e3, 0x4b06d5b54525f6f7, 0xf42916691848ec1c, 0x1d4216555d578730, 0xf8c60da7290a5b4e, 0x66dd9f39a1f3565f + , 0x55ac29d937b474a0, 0x4291967a4a369ee4, 0x918dacaa12e6bc89, 0x3d46e8900651c310, 0xaf055430a00e90b1, 0x16f62bf56da5ca39, 0x1a021c33488c51e6, 0x0d64dadf63fbbcd5, 0x0918ece59dbfea7c, 0x3b3319d7dd74203a, 0x1d88545b8b9fa90c, 0x13b792dc908c59e6 + , 0x0a2d939a9c3d0979, 0x321a5dbeb74bf127, 0x5e5947fff66d8470, 0x22ec9ecafd26bc99, 0xde17ca8293b10536, 0x593f56c0559dd846, 0x1148373375485023, 0x23c6b0fdf7448b1c, 0x377904458a27804f, 0x573e91962726ea70, 0x35e1b24f3235ac70, 0x51ba082049f4f85e + , 0x4bc4918160d47194, 0x5d29a21e3308e1dd, 0x7e15894b3e6e4e33, 0x50dbbd2f4f31d0fb, 0xef248bd235a9c9de, 0x3418add21b634710, 0x96c7233a52363bd2, 0x7c8414ad9a08c99f, 0xbc6acb4a54e6c05c, 0x5729021a1193579a, 0x0627c3e00b08fa1c, 0x3d0b4ff9e17c2a73 + , 
0xd507e8755990317f, 0x75b27bb3bc7bfe48, 0x44a80f2c6ce651f5, 0x7b9795fc1b706e46, 0x9de75bdefdf9a640, 0x75ade50ababffaa8, 0xce0ab116870889a0, 0x6f3ddcfcdd59ec6c, 0x6e36833588de0674, 0x291d1129ea28a073, 0xf8b8e53864884d61, 0x706ef8f1ae854d76 + , 0x137a8c6583753069, 0x01e45f1cc620f966, 0xe28e1ff82f76c7ba, 0x36d29eace3e89c54, 0x83379f157f0b49cb, 0x65e9c39e2bacb937, 0x9b323c45070cda3e, 0x16e02f31ab7e2de5, 0x53bcf346635122b7, 0x1fd7e207d6c2de09, 0x3a5f5f94ea1e57ac, 0x0cba06e8d0f0b4df + , 0x70b440c387a9c392, 0x1e7dc143dee1d800, 0x5498ba6d7239912b, 0x332870a017182d14, 0x6be306fc672d794c, 0x2c2ce211245b2b4e, 0x109b722c8d2ba79f, 0x268520fa9c5f727a, 0x515b300524fe78ee, 0x736201eccbaea698, 0x4608ac113210bf78, 0x32d8fd919c441843 + , 0xc9557e1b04b8f2d8, 0x775437f798dc7459, 0x1200f5585ba417f5, 0x2e00ec5f3e7ad304, 0xfc873d5f2b446288, 0x32270a93624876e4, 0xc646a47c08789b22, 0x2370d9fe925616be, 0x430afa3619e671c4, 0x156468ceac1f5fb2, 0x3b84dec2f2417635, 0x31140e9017c0e58f + , 0x5c85f88ccb7443fa, 0x0da75f5d64d864ac, 0x295ff44871b0fb84, 0x1b79e10bad3336c3, 0xffdf9942dd2977b3, 0x4c1b198d0f9a1a23, 0xba778a24c112864e, 0x74f66897f26d48d0, 0x3fd5c06e867ab611, 0x4b98ce33ff7878b9, 0xf7db4dce75cb9165, 0x11665aa099ec5163 + , 0x2a498f16ae7118b9, 0x265ec3dbb4eb509a, 0x3da4230668ce2c86, 0x36e62baab2e33385, 0x99507d4a79ab4478, 0x25bfb2fc411e8875, 0xd7ac1ec933022ce1, 0x23d341ae033d0466, 0xd295b465e962bc00, 0x23d0211ba2d73180, 0xa03ccd7aff922d4d, 0x1e767148de301514 + , 0xc241ab36a894efab, 0x1c9fc2f343fc1e58, 0xca3b96562bd27a87, 0x53623e2285dd7015, 0x557411f01c219420, 0x19265577096b42f9, 0xd3312d941b23592f, 0x30a9a9a1c3c51c06, 0x3d89b0b3ea6e8f79, 0x7eab751dc5c77cb2, 0xc0a9b186e6df6e36, 0x4f844d583f155694 + , 0x419018232793dffa, 0x2add440b6bd3854d, 0xd55480f131df6e32, 0x318ce3846ae3e417, 0x0565062d1a0984f4, 0x6ebaec63d2bff9f6, 0x77075fe729e79790, 0x0dd9434624c8a4e7, 0xbf8f11e2dfa9b062, 0x1b17d8255ee8b364, 0x62c2150cf72c6344, 0x28106880d081e8dc + , 0xf4a4af0ddfec91c1, 0x1a8f0e6c977e1f2e, 0x72a7a3a738b9316f, 0x323716728c4e22ec, 0xc14069065ba4af3b, 0x081514248911d367, 0x51bd4afaa8b6c337, 0x50e77a9b513400e7, 0x46c0051b2a822548, 0x024886e41a5edcfc, 0xa06b0efa41cac17f, 0x336a30b01b9c5675 + , 0x74fb2c10ca097626, 0x2b204caa48e90981, 0x6902c952b9a17b74, 0x39c2e9b6b922303b, 0xb9216b9b3c597419, 0x6d92930264f15f76, 0x7b1297d5eeae1427, 0x0f0744adfe1bd307, 0x33b57e265be6a89d, 0x282fa2e533356c10, 0x3a03995c61dc772c, 0x4f5d8f5e893dcff5 + , 0x4bfc927efc48023f, 0x596f2241d6a685ae, 0x3cb3e0afec29b8a2, 0x31018e0d10653842, 0x2fd00fe944575626, 0x1241d8704982e011, 0x970d56664e6781a7, 0x1b05f49d0f3de2ce, 0xa994ffdf63717e66, 0x416374a76ba88e98, 0x8b082ced53f1579a, 0x56781dfab5d2aa4b + , 0x8151defd1865b318, 0x64669b840d6081f7, 0xe436f4bb5f38e14e, 0x43d438410a974b40, 0x5832ceb3d666be02, 0x06347d9e1ae1828e, 0x6979471b39e3ea86, 0x2cf2cf61cb4b5ae4, 0xb7ab29eada5a6ee4, 0x12e75cb29aca5768, 0xe65b1109d30d1ffc, 0x71f9becd6b320e5a + , 0xdc8289026647eed9, 0x31d62d050ca5458f, 0xea2bbf523a54c1e5, 0x602bf0b9e3ee5491, 0x25aa73622380ad4b, 0x2b6b1e3271df5f58, 0xdbc5efd86aa0470d, 0x05353c24b8c4354b, 0xa3c7db3cf5e06bca, 0x288a1c8f2b4ea5f7, 0xd6152f5e12ce7ca1, 0x59d4c1b436673c7d + , 0x1e02554e521fcb95, 0x66d3980f240ad440, 0xabf16f6b39a4d9d1, 0x7fea351ca94c2f62, 0x3d62b6f3389163ba, 0x0fc6b44f2e7895ea, 0xd5c64403cda7c669, 0x2e4099090e603193, 0x9b5c0faf15fa4c2f, 0x46295c9d8e12b639, 0x5ce4add63a5b331b, 0x5fa7bd736c4c5879 + , 0x47b3471447d1aef2, 0x28004c1c22325739, 0xd588437d9a3c5299, 0x2ab19c1812cd27e8, 0x3ae700f680037802, 0x1ad163800b422b36, 0x45b7ef36fabc2139, 
0x44bcdeff21dcbd1d, 0x41c6da2171e11c7b, 0x2c35ee79f7c4cc14, 0x4852942759c13849, 0x6492d26f10be050a + , 0xa6f54e988c50f0d9, 0x6a2db2b6dd62181b, 0xf7d9806b2a5e57a3, 0x57526bdb3ba53d20, 0x17ce6cb1f500e650, 0x05d841b042f8f345, 0xaa800a6c698de970, 0x04f4b559abe2cb8e, 0xc050dfd7259ce49d, 0x213839bdf94db935, 0xb371258655306204, 0x7d323b8b19f9705a + , 0x26d4502b16b6c618, 0x79717069aa89595b, 0xf867c0e36db41872, 0x13d601d86c76e1d0, 0x2dfc8b0d331b7383, 0x185472f3e42e8075, 0x05bd13e72b10eba0, 0x519a387490f79b95, 0x8d09c1b2d3ad2500, 0x045da45d2cf0f733, 0x640181956862426c, 0x728d57f59bfe1b09 + , 0xf9a99f878da2c585, 0x4fc4831e61dc4e10, 0x6dc602cc54394fe0, 0x0484566b67e9e8ae, 0xc5fcf0474a93809b, 0x71c0c23a58f3e2bb, 0xb400fabe36fe6c43, 0x614c2f3eaee4c0a7, 0x7610a980d0e1c6c1, 0x1ce8197c88885dcc, 0xeade1c9f3ac2cb2b, 0x471ad07baf2f341e + , 0xd67a837c6b01121b, 0x2a8e64281f59cb59, 0x52e701e42f3262ca, 0x19e0a27dece50580, 0xb5691c17a7bda6ac, 0x43484c311b9df1f2, 0xa68155549bae49ea, 0x43a2c5dda225fae5, 0xfa5e992aed700eef, 0x58911f5623918856, 0x648b81a1e48c4da9, 0x66e6e30cbdd0c3bd + , 0xf3ba209c169d266b, 0x20f7a86230447685, 0xd1bb5aaa1a0c3d2e, 0x366c29843d1111f1, 0x06c78b642dcc9013, 0x27484a64e109e3fb, 0x8f8eacbca4677464, 0x0b6cb31b1dc24cc1, 0xdf69c84f898f0fa0, 0x2dd426744920f2a2, 0xc0912a197d4c5c69, 0x489ade7f6a98d8d6 + , 0x458769f47f203e28, 0x124f4123fc05ac97, 0x3bb936f4ad6d7d67, 0x330954fed4f00ff8, 0xc2ce650046f90eaf, 0x7bf94762d4f9debd, 0x2e93172a586dfb83, 0x3c7a6062b4113d96, 0x5ddb0397147f0d93, 0x08e3596fc6839034, 0x374e67ff67639bfa, 0x19021c2119888232 + , 0x002f5d04fdd55efa, 0x05b4c6e079e1baa3, 0xe5678ea3ad74c84c, 0x1c42f7826a58a77d, 0xe054668bd2cafacd, 0x237668d3ede4261c, 0xedf46a6374aebb32, 0x31ec8c5931cf0ef4, 0x955c2e95c35b5825, 0x27d8b0ea68259603, 0xb7a8976e427d1ec0, 0x6b6cc5c07152bd13 + , 0x03d88f0ca0b244cd, 0x001cae9a8cfed897, 0xa844b3a1f693a7fd, 0x676c9acb7abdec96, 0x631b6bd5e0cdbd33, 0x29f289dc0cddd9b8, 0x0947d57536fb2eff, 0x1eb2ce650e3eb059, 0x2139b3a40e8bf405, 0x4165edfb39f4ae8d, 0xe061eda67a70d6a6, 0x2e3cc0328c9084f6 + , 0x1ef8329ed056063f, 0x6d4d01ce49e8b3d5, 0x0110c92f1656d34b, 0x6dad1c4e170829e0, 0x584c56c590b477be, 0x597e5f0ad525e935, 0x6008264d8eb7d36d, 0x3f586754999c829e, 0x3d7ea89df5546a1d, 0x41754f7d9a3f4364, 0x3b0796822ef879a7, 0x1ab2779598262872 + , 0xdc37c9f0bbef7923, 0x256ec818ec35a097, 0x4a72da5c09dd5846, 0x51df6c61edcad45c, 0xaef24fcdcf5ce819, 0x0ba6bb959ae689f1, 0xe667bd65a57b3a9e, 0x71ffd591a28a8e4a, 0x06c325fa53a7fadf, 0x6667f2986b2dcf13, 0x3ef751a6d52a09e4, 0x517a104240b8c74a + , 0xd08cddfd8c8183f5, 0x59237cc71b8147f1, 0xfff94fd188395933, 0x538acc592d10ef67, 0xac51ce386ff0eb1d, 0x69d42b8114c5fe65, 0xa17eda3995bfe8b9, 0x5dc6d98fdf05a341, 0xf2304d375ce8be78, 0x31b58521ecc483ca, 0x04d2d8140780222a, 0x3dc18b2be3ed95c9 + , 0xa48e1639f2d70d2b, 0x4ffd54a6bc0f38d0, 0x8ae3c65ba6b7143b, 0x482eb41f9178fa9d, 0x240b8b4e87ad4f1d, 0x6d8532420059eb40, 0xc135f77e44275132, 0x6261076a0daae349, 0x35316bdb3842765c, 0x246165ba3a8bfd92, 0x1c2d774bd5177a75, 0x045a2f991647e3b6 + , 0xed3b5923594671a8, 0x0514fada5acd4db5, 0xe8297fc358a0f50f, 0x7cd2badcf2952a91, 0x0da45130ea9ac266, 0x26a0d43c1e14c979, 0xbb62b729fe93a390, 0x360357aff7f67ccb, 0x3ad4835d1c7c59e8, 0x570daffd86fa470b, 0xd7c4be698fa3bd96, 0x17e4bdec2ad76ffc + , 0x43ce4ea9ead7dc51, 0x58ba7ae0d64a518e, 0xe014cc7e64680555, 0x03abc953ce2630b8, 0xa318620c7799be57, 0x2b258fa2e84da952, 0xdd88fdc5063b2ffd, 0x17371dd79a3aa556, 0x927b837578981299, 0x554552101d90ab2d, 0xb45306218ce54bd0, 0x59109b65ffdb6235 + , 0x8663e0c4a180a515, 0x41467fe41c6604f4, 
0xae2c1aa4dcb73878, 0x19d3cb02c6c07517, 0xaa147c97ea6745f1, 0x70dac71a31cac43c, 0xb9213ec26af87dfa, 0x67f228e9f60e7b25, 0xbfb59b8cf78df3df, 0x36687792a4256fa3, 0xe1be5c1f23177544, 0x786a9e1b644b1c90 + , 0x4172f47393ca7f5b, 0x62ae5bb4b8aaeb59, 0xbcd9c431fa631b6f, 0x1fbe20b2edc9cc6d, 0x5fdd829fbc0ee085, 0x241dd315adc5dd59, 0xb4b688d625f7dbb6, 0x595a82fee5bed2d4, 0x69653ae0cc11880d, 0x2b9e85fefc402f76, 0xbb2495b507770a81, 0x05d20c575fb34731 + , 0x9d9e623436485ab2, 0x27012a9665f3febb, 0x586cfef484c04ff7, 0x44a5860cc0eabfbe, 0x6fbfe6e2f3532e80, 0x05abeabaaf3220fe, 0x1bed21f2cb809678, 0x2aa62112b7eafed2, 0xe298837cf610190b, 0x1ec8fbbcef9158f8, 0x1efe9b3aa4f96f6b, 0x6a3b842a068b0ef3 + , 0x92dd4b7cd7f827f7, 0x605175bbf3fd1c97, 0x139bb6419c1f6d98, 0x3a3ab2e9978db310, 0xc5c95941c9d5dd0b, 0x34c6c76025b2bce0, 0x0d44115a49bb8126, 0x7622cbeb11daf619, 0x785bff93164ef5ad, 0x7191647d355cb45d, 0x117f255c4cce6e5c, 0x581b448b0e9aae3e + , 0x54a4f3cb36225414, 0x790180c539bc4685, 0x47064043b7c6b96f, 0x43cccf5b3a2c010b, 0x1dfbf3afc14c3731, 0x1c368f3195572574, 0x00bc2ed3b5070b5a, 0x0332d8dd63b37f60, 0x0744b1908c9bd8f0, 0x2d258e628dacb9ce, 0xbba5b4bdb9c61e14, 0x0bca12295a34e996 + , 0x059c84c66f2175d4, 0x1a3bed438790be78, 0xdf394f577dabb5b0, 0x304777e63b3c33e4, 0x59a29d4fe82c5a6a, 0x72e421d1e88e77a4, 0x69e6230313312959, 0x2da03aad8cf2bbb8, 0x2858d8608fecb0b6, 0x343099e7a40243a6, 0xba29b675d29a8f63, 0x3d2028a4f6f15886 + , 0xf068e2d286047d0a, 0x14999b5d6c770e20, 0xd1874a592385da79, 0x78aeb552c15a1cd9, 0x482dcccc23e9c06e, 0x7b18a19fb54b5745, 0x036c896efe9a7a06, 0x2f2c2ce0d1871c13, 0x3b2d9b9ed65492c7, 0x0649c7e50819d077, 0xcdab66ea7b65e3cb, 0x49b15b40c4aaf03f +]; + +pub static DOUBLE_SCALAR_TABLE: [u64; 3072] = [ + 0xe18a34f3a703e631, 0x287460bf1d502b5f, 0xe02e62f7e4f90353, 0x0c3ba0378b86acde, 0x90bf0f98b0937edc, 0x740b7c7824f0c555, 0xb321239123a01366, 0x4ffcf5b93a9557a5, 0x297afccbabda42bb, 0x5948d137556c97c6, 0xa8189a393330684c, 0x0caf2b720a341f27 + , 0x892756b15bcf68c4, 0x5742f77c98a526ba, 0x340a5a1de9f89f9b, 0x14ef680aee75d0f7, 0x84e770e14043a41f, 0x0212c41116c33c95, 0x35b791e6de4dc0e2, 0x5949df08518d5d28, 0x6a0e120744ed10db, 0x5a5183ce844391d3, 0x6f618b158afdba50, 0x2ce2037e470e2088 + , 0x1f49fa149a64ba3c, 0x5f9876d519670451, 0x030105056f55586b, 0x020f1a557d8fd726, 0xdf4cb175b06d86c8, 0x694fbcbe7fe58390, 0x7933294a756a1b67, 0x09dbe9924b58f8ec, 0x590f4403cdf197b6, 0x1c07969fc87a0ba7, 0xc496477712252367, 0x5508976022f1b096 + , 0xefda361e452e1775, 0x7a0a0cccacc838fb, 0xb07e791c0be5dc5f, 0x24d9b6b418cbcb93, 0x497970f3c6117e03, 0x3986a158cb96d595, 0x8f80586ce692612b, 0x305cafda7e4df9d6, 0xc1a1c2e06452914a, 0x7ef989c0eb583079, 0x3a765b1f7364b099, 0x4fee236d58299c6b + , 0x6f81095f770e8419, 0x53bbd86b7396bc09, 0x2b72ba726b2b4210, 0x625dda1d2901c78b, 0x0ff5bc7b18cd2b3e, 0x0556598c7358d332, 0x0991245f20ff50d7, 0x0e7f58e5e919a97e, 0x5a0561373b758756, 0x6447bc93f87c198a, 0xf9230604c34c7520, 0x6b214425475c1bfa + , 0xe93de62d6a7f9497, 0x2129459d86f4493c, 0x456394c7c464cfe4, 0x612434fec3f4a1b3, 0x1ed91eddf44261f3, 0x0c6d3854f9e0a3ff, 0xd3fd153188a7e4e3, 0x24691fbdca16910c, 0xbe97465cd7625c9d, 0x2aa61cd373f759f4, 0x824d5763a326d62b, 0x1a0ae39e50da20ba + , 0x32d0c8481ee4c3b9, 0x6c3687109cdd18c6, 0xe52717142fbf95da, 0x67bfa41fb52ce9c6, 0x4e24d6a088a01474, 0x49a6ca0ae3fb6626, 0xd67f8faa9103191e, 0x674888f5aa6d3062, 0x4ba73824c2e85a99, 0x406b2fd18d35b314, 0xa7087b1bea728ac1, 0x11d2f222317b160e + , 0xf8946e007e23a469, 0x22a196fabbce31a2, 0x5309ee1bdc1216ba, 0x240fe9953827a324, 0xf9fcb89b63aeb5c7, 0x603b8149ed16b1b0, 0xb1f1876c02cf61fb, 
0x4a5e32af612f948b, 0xfc491aede69a8813, 0x1ad9379136e53aa5, 0x5da50db1d5e6c123, 0x2f4014f7fe2c12ca + , 0xe4f6791d7685c3f5, 0x4c218521c3745a9b, 0x0c0521af98555f97, 0x1462a12953cada7b, 0x0bb2ab63d6452c1b, 0x5783c531ec98bb87, 0x737def53605dbc9c, 0x49f982b930e86719, 0x75b16790cb5211e3, 0x45ad6574cdbae99e, 0x1062b72dfeec9851, 0x45029a09cc468c88 + , 0x532240de77f3a1f2, 0x17bd291eaa9ad0ea, 0xe0a2d7efc2f8a0a0, 0x3a7412052021778e, 0xb0dfb0976acc90df, 0x7fd603b689a7b1f3, 0x1152579ccb00d6c6, 0x6340743b631849a3, 0xebaa47290e0cda01, 0x143265a6d53fef0b, 0x45325d6fd981e75a, 0x0e9780cc39586f2a + , 0xa4f68d207a8628dd, 0x50d230b51893e841, 0xf3bd769a4bb504b6, 0x55975c063969292e, 0x07727ba25fb8756f, 0x07ff86cf8ed731fd, 0xef57fa40cc35a1f0, 0x70753a70874218fc, 0x615954e2342b973c, 0x5aa9d68f1a59df86, 0x3b8e9e9ff5e44468, 0x2e749114d60a3d23 + , 0x14a1b91ec176db4b, 0x55f91a63d69aae6d, 0xf42382327b1b6d27, 0x2acf1f475facaafd, 0xfd9069b479b58968, 0x3baaf4e5c4a45f77, 0xa2ac9ab98a7aaab6, 0x5466cb5018f50981, 0x3e6ba27771ba3205, 0x31ea90cdea1bbbe4, 0x0000416b5c557393, 0x464cb0415a510d7d + , 0xd02087d206ff2bbf, 0x2b9c8ecd7fabe736, 0xb2b56d3842caab0d, 0x046ea0b7767700a7, 0x113a7a889e317310, 0x5992a354bef7d0ca, 0x3edda94ed50388bd, 0x052661f767839154, 0x4c28edf6e19e28e0, 0x1d19c2f2d2f644e5, 0x5d732148db35ab3d, 0x680c4714b83580f5 + , 0xa374f282bb80ccec, 0x789e609bc77ae11c, 0x10d2577d599b45f2, 0x1c548b5b857721b1, 0x7baea726b4543fdf, 0x3c1562912d1b4ed2, 0xd6362203b7e82082, 0x1414e523d3c7a900, 0x7ca349951c1d23a9, 0x4da4265e3ce80fb4, 0x7981ebbcaca9ef36, 0x4ebac9e5b5bf980b + , 0xabd2c1dcf49cb5a4, 0x3f54acfc25c6340f, 0x202eeffabbd11cbd, 0x67216b7cb3695e8c, 0xff7cbcf9b23fc9f1, 0x2eebebdff7fa7afb, 0x71156befa111f85e, 0x1b8fd98df522902c, 0x6b28ebad62519791, 0x6cf0ea960e01d8ed, 0xb4617bc2006967d5, 0x323da065cb3df0ad + , 0x31687d0741e24d9c, 0x02db8f2b509a7cc2, 0x9243f85924320527, 0x68c360f01d6e6d2b, 0x2351c5e877d5306a, 0x6f56ccfc85c5f3a9, 0x1b09652837c4928f, 0x0b3337554c83f971, 0xe2931be2ccc783ec, 0x46829694ba08c64f, 0x9f35e36358e2c6ac, 0x1474b333b000d170 + , 0x24d792756fc96640, 0x618fda9fef868c5e, 0xb7ff5b125afd9375, 0x778dd97e0440c258, 0xfbff314886219627, 0x3417e1e1e2a7e811, 0x21e959a88f7b7bdc, 0x3508c2eb8c3c8672, 0x827ecdde111c430f, 0x21bcb19fb07aa134, 0xe0c1fa50ab2f5746, 0x401e680b4e6658fa + , 0x2cc24bab313693cc, 0x20541c12b964447a, 0x374975b6fb81c3cc, 0x52905efb344e17f7, 0x79c5c9b56d8b5f9e, 0x3390bf75d2b9a3ec, 0x7ef3807d895bf4e4, 0x2814165a42046b51, 0x7f8cfd09326fe158, 0x3232fb4f4c9762ec, 0x5678d6dacc194d25, 0x6f7caffb0a7545e8 + , 0xbd981637b23e7963, 0x691d7b7cb88a0ef5, 0x10ba319ae2062914, 0x06fb144f8295a85b, 0x80e620976bf62f8f, 0x2a425971ec73d6b4, 0x800aa9e741d10b1c, 0x230d7d8bd1a0469b, 0x65aace37428dfe8c, 0x0fcab5297f58b667, 0xcf0e9526943af7b8, 0x7d90915b75d4dae7 + , 0x7455a46156259d6b, 0x29bcc06374cce1b5, 0xf2fb0ed3aa87aefd, 0x211a06af0e54dd58, 0x6c0c95c5723de9bc, 0x6299b6ed25008ca7, 0x7fd63e784d4dfb18, 0x2cc93b4d9bc1db30, 0xebc7e2d44c5d13ea, 0x3278e18d4d3d11a0, 0x349e3dd25a215f79, 0x7eb2a7150b30416d + , 0x05f3d7d5f6a094cb, 0x2a3771d48e331405, 0x08ef39e9dc96f009, 0x012248373a364992, 0xf758f92fc9fd4d33, 0x2339d8c6dfd3ca6c, 0x8b000965962673b4, 0x746ff43eb99d9054, 0x47ecdc054a422eff, 0x33d8f7c8267b7f0c, 0x22fe00ac921a42ae, 0x31e57f3d31fcd8e6 + , 0xbb912315a1c50869, 0x4ac8cdb0fa7ebbaf, 0x0541d74a60973edf, 0x7234900334b2c5d7, 0xf2e545f730adfa33, 0x224e44e63db5ac96, 0xfcba3d005c6fdeb9, 0x2c93a4e6559936b5, 0x7727a0d7ad88d758, 0x2e33100216719cdd, 0x7b2ef89aeb2c0254, 0x1f6de5b74758afb4 + , 0x6ae89047114fb321, 0x3d605e9a6ec6d80d, 
0x18e915c727a874d8, 0x699088b5e9d0912f, 0xaf9344618e056f10, 0x1b9169df8245e0b3, 0x5eb8c33d70f4c891, 0x1609ddfb222b13c3, 0x8131c885d1b366ed, 0x7bc3cf9d9cb1a7b0, 0xd297478d2fc93968, 0x13cbb4573a4ea7f5 + , 0xdd37b5cc64d5986b, 0x7ed3d1d7d81ab5dc, 0xac53485f23973c9e, 0x0705675d333b91d7, 0xade5d213c43186c1, 0x6a8bdf57b4bfdf14, 0xa87f88a1de717963, 0x17f29220b519bce2, 0x7af2d7fb0f95c610, 0x28d1d3923b144a7c, 0x8e73c3d8972813e1, 0x00100b40c62e72c1 + , 0x84de7a81fa1f50da, 0x4fa391d6589d8244, 0xbcc3596f0834b285, 0x4d4acbd60a24e9ce, 0x97fa98b8c1835a0d, 0x33abcf8e29901d0b, 0x60a73d1975b3d082, 0x60666aa4325b948d, 0xad54adb769284a39, 0x227a98d113609b28, 0x4a1e1ffcae6a3872, 0x1e4ee44bd67f818c + , 0x5a74c6bb4387d315, 0x019428c0b1b18795, 0x5cc153e270bbb055, 0x2b3cabdf00dc4a61, 0x834110c026924b57, 0x2d30e985f2d9f217, 0x47116979333389f5, 0x53e3fd6a18202417, 0xb1393cd79c2e5864, 0x58d92935e4112e82, 0x86989a7ec8305b6d, 0x42a8fe4eee28f37a + , 0x74e212ef01591901, 0x3277917a0397b1b9, 0x7bbcbe6e3d687544, 0x0b8957701d09afb6, 0x6cfbc8ee74503668, 0x48a9925ada9f8348, 0x57045753ba2d0f4e, 0x7d69ca3866223d66, 0xc7054ce22917271f, 0x41bce1e1133b51de, 0x3a3ae42df81ec35e, 0x7eaada0f42d47cc3 + , 0x13b138f1048a57cc, 0x64f98abd7e915a8f, 0x7af195eb16a0c732, 0x11be81a791d634d2, 0x97d8df47430f61b8, 0x0767c7b381271004, 0x3e949136fb940aa6, 0x3bdee340cd956dba, 0xb250ec4ff91d2602, 0x4cde2454d47f59db, 0xaf5e749530d978cb, 0x5a8e2f2119d4d835 + , 0xdf1cb5425a0744df, 0x3d3b08a7bf35d055, 0xc6335e832de4719c, 0x6eb8d97e09154d42, 0x2f6a3f8de3d20dd9, 0x13f23cfd276233da, 0xb4a6b80dfc0fa41c, 0x58d876403acfd7d7, 0x2ad422078b8e139b, 0x73dbee2abbaf494d, 0x09a2758891eca3c8, 0x6ef9a9f1178b0938 + , 0xfc7e9ecb90c637da, 0x3a04345fc10b1a7c, 0xc024e9cb62f9ff1f, 0x6c4f9c3aa4aa33d8, 0x049d6995b95ac1f0, 0x2243845195763a1b, 0xa1466a31700ac276, 0x600fb7123a325905, 0x9d391a64a0d35a24, 0x3b093b550641f108, 0x2275de5bfd2e221f, 0x25f5e7465963db1e + , 0x3e220107f7e7fb84, 0x6f06a23bc1b85a8e, 0xb4198d19f6eb0e48, 0x5dc11761dad45fda, 0xba303e492ab52a0d, 0x127c69c73da9f528, 0xd3a5b70cf6c790be, 0x0d72b0c50819da5c, 0x193f90d62ec2cdf7, 0x67f7d0cfc4f46daf, 0x7aec083d52f380ea, 0x7c0a1dda4a28bf4d + , 0x46fd20fe6008cba7, 0x7a588c914115d595, 0x8fb1d3daecf45f78, 0x0851dac094e7b036, 0xcae0a76e2a32a892, 0x104f861322dddb2f, 0xb79d81e46e1f9006, 0x1e4d28d7a2498912, 0xaf3175d3974b89bf, 0x613d00f9a69c55c2, 0x23f6883e8e65226f, 0x072f7ed65c6def05 + , 0x6690e643bb38e243, 0x1a81c4a7c9189b15, 0x1056d1669e4749ae, 0x0137f2a7418f190c, 0xed3192796e699d16, 0x3ed76db45c38a37c, 0x78e86d1475a88243, 0x45985aacc495b16e, 0x47d5c8208e8f1030, 0x6dbe5f68b4d0e782, 0x08d3d0182cf7f26b, 0x64c375ce172fadbd + , 0xba0f6db3a20c2875, 0x57e1d90a53241250, 0x0315433fddf8e63e, 0x33344750e37dad9b, 0x62cc0d28ae69b016, 0x435fe80f6100d547, 0x5874aea8669d3df5, 0x3b96913f8264d4a9, 0x738067d6bb1314b0, 0x48cccf24cc6f4ccf, 0x6f5e2bbd68b777af, 0x34c2c37ba9635d66 + , 0xd731534900fdbe5b, 0x4e4f9d97afe11d43, 0x81b41214351b73d7, 0x1d48d100ad11a5ae, 0x2a4ee76628e2b151, 0x34902e901877efb8, 0xb5a8561a0fd45394, 0x44317af6d5cd5ac0, 0x354c2469e9068bad, 0x0771fe2761cad022, 0xfda76ee8212d0f2b, 0x76cdeec6d4435495 + , 0x55c98575b3e825fd, 0x2983325ed5d73a1b, 0x563c4c4fb3f466e7, 0x731b0fa413338bb0, 0xdeb519ca57a05240, 0x7a7e909b5c4f7351, 0xefb7c153dd2ab28e, 0x11ca1c865dee30b3, 0x013ca8348d9d7de1, 0x575e0bdaeee8cf9a, 0x464c98a21083af7f, 0x683ddcd85c212ee3 + , 0x1171f0ab4cd02019, 0x22c7e01c7f4d64c8, 0x972ec0ef3f2e2ed3, 0x623f83c2611a476c, 0x99b3f16be9aa25a1, 0x2d3ebc5468990e0b, 0x5d5fba8546a4d5f2, 0x4716e6919d2986e3, 0x3ab2f2bc183f5d6c, 
0x5f6257d3910cd4be, 0x341c6f2a78f94f2b, 0x6ee8390b8a5064f5 + , 0x9d8640b9b83ca8e7, 0x033c5ad24466be3d, 0x6f6cd68db30dfd59, 0x52aa6b1c0f90f3f6, 0xfe7bcd4c97403646, 0x11ab3fc960b05fb0, 0x24584b77575896da, 0x427f8deb932da137, 0x928a28cb505306f0, 0x04ae916fe863820e, 0xaabaa98911b9cd3f, 0x59e588ba994d9145 + , 0x9b8f1afabeee9e9f, 0x04ffc7ef3476ff8e, 0xe9cf53ce9937b146, 0x73fe42a801524448, 0x224bda3cf3bbaaad, 0x5fa85056d59884a4, 0x8e6eead48345726b, 0x09230936d41736d2, 0xe679eb58d1ad6be7, 0x08bb759b530b1eaf, 0x9688eb527860e24b, 0x13704d2daf9af278 + , 0xd9273ac71b906f14, 0x57ee05fbbd40deb5, 0xb7788e19ba9e61eb, 0x7967b6dc1c5d9699, 0x36e043fc230127c0, 0x2a716598bb2d519c, 0xc017b2840d4d1b07, 0x1d3bfa489f756a3f, 0x4ad73abf24318d36, 0x1915e6f53e12625d, 0xb219a7c941f89084, 0x2280087a8f4762fc + , 0x8eb280345fd1b4e7, 0x55b8d4ee5772fd79, 0xc9e63a787e2ce2e1, 0x685741adbda93885, 0xffb830ab11a3b491, 0x7e891121f9356428, 0xc03aea271a629078, 0x71c45932930a2639, 0xe7df192a6bf81795, 0x704aee8f183aadf1, 0x06ddb55a8a7a63d7, 0x52556d8763f3033c + , 0xb76b458c6f0c33a7, 0x28666b87c362b95a, 0x365ae575a4c27b9b, 0x36ef35110562adfd, 0x89955dd8d927f9c7, 0x526e787d6a586c9e, 0x762e0bc4eff988c1, 0x6c9523b4b5ae4946, 0xe90a909688cfe95f, 0x658a7dc8b3ffada3, 0xbee148ba7a58520f, 0x6819007d8573d1cf + , 0x75d3b5ec141be9c5, 0x4bc236ae634f3c27, 0x1192fa9b8b30e894, 0x4129d43e1d092cbf, 0xfcac068558bbea45, 0x513e8d87b8116534, 0x5377a179a155ecd4, 0x6c93531e5545572f, 0x727df81ba09aad91, 0x07527139dbc96250, 0x150320b1d8ba172a, 0x2281e85f60a1809b + , 0x7164b7d524eba6af, 0x50d387163fea4ca8, 0xe90de17d62aebe78, 0x6ab369ba28c0410d, 0x17d07e315a95d138, 0x58b496352453fefd, 0xb87a04dbbc101b92, 0x40a8f0fb757e9b0e, 0x2148b48a696e64d1, 0x4e004a3a350c17d7, 0x17927e9f386b563e, 0x29da9cd441e3e3c5 + , 0x883d2dc357417213, 0x2e94653ff7862644, 0x53a37af548453df1, 0x04475db3c300b93b, 0x2d65fa4d815e7204, 0x231a2db74c2c3ccd, 0x1fd734c0cf4d97cd, 0x32d255c105f6d122, 0xbb74fd9201eb07b0, 0x12e33f1c81ac6f60, 0xfb9a6439bea97072, 0x52e14b7db9cdcbc1 + , 0x637ac1a91ae374cb, 0x1c8622c35adc8224, 0xeb786c50a64b7d33, 0x362823a7232a5893, 0xf22dafca688d472a, 0x18598f0e0237f7c4, 0x97b8497bfff4bcf1, 0x7abf4cb27a9c5b7f, 0xea47c44e3b3d95d3, 0x58728fe3e1827a43, 0x7fd3681a6df902c8, 0x6db1dbbdc413de79 + , 0xbc4effed1ac3007f, 0x7f31a54744887cab, 0xe6559b4f8bd2519a, 0x18a78ec5b0c241db, 0xf6e10285b15d2030, 0x5c1323ea219a8ff4, 0x134b6f20dd116b47, 0x5d0abddbc8998733, 0xa3c993938702e151, 0x0ab6aeb494f6ad5d, 0x8cf3b4beda1815e6, 0x546ce323008c2fdc + , 0xa10eb5a6a78dbe39, 0x26d2e8a8b8457da4, 0x026ccbe31517d806, 0x2a35174b812f562c, 0x57d70499dd7a374d, 0x3368f951acd3c5e5, 0x490b2515f901062c, 0x316109e7c315c377, 0x32e20eba569535cf, 0x496a8c39d667d709, 0x5578096dc44d5e0f, 0x608a162ce73903b0 + , 0x6b2e65852cb37cab, 0x75b09a2e6ed609a9, 0x7ac84b3082602455, 0x7690cbb594e84b94, 0xfc85dad9511973fb, 0x738a74b08c9006d0, 0x83233fc939d5883e, 0x7fbfc08b5db3c9f4, 0x81a0e493fb5f7749, 0x2c255ef7e69a77c1, 0x234f02e609cc656f, 0x5960cf0b961f3cec + , 0xac72940237b1f17a, 0x434e038a29d446ac, 0xca6a090e00d8b0c6, 0x1f1aad24001e473e, 0x6d64b6dc133399fe, 0x0899ba41e9dd4607, 0xca590b3f25bbf5df, 0x57217978b0d8ce11, 0xd6b4cb13da6de9ac, 0x3c88520cf564f75d, 0x649fbd5075a7757f, 0x3f2593b90fe72161 + , 0xe1bee53e91dcc9a8, 0x010069dce4c74a92, 0xef83968978aa855c, 0x6cd8848183b53d73, 0x0b3df59610e403eb, 0x713225d446180a7f, 0xcc23112cc59850e2, 0x105796b670a3730c, 0xa147f4ec7a2fa4cf, 0x32da1f072d75b253, 0x4e7007455e85f560, 0x76a5376a771fdd60 + , 0x47eb4fabdcc699f7, 0x4e45db6334c6ed96, 0x36066f2bab72546f, 0x04f48065593ecdec, 
0x3fec02793fbb5601, 0x122f74626b64a526, 0x21d0f66ff83b4dbd, 0x1370610ede647f1c, 0x57b82242b88172c9, 0x527dcbadfdc65ade, 0x5e9c9a04385c93f5, 0x64d1cf9e52548a6c + , 0xba0073337865c994, 0x633ee14e50bcd615, 0xf840228ec4251095, 0x49bb96812a98f08d, 0x82f57d0422f96678, 0x06d7e43bffe7e0e1, 0x33910cca752ae863, 0x04d46e7c66087e38, 0xf14935c4167017c3, 0x3f22e2f44d03c9ac, 0xa6196244f2cd6164, 0x15a2b4ce514fa4db + , 0x5191a04c4abbd0c4, 0x0e763360ecc8a19d, 0xfef583c184a673c0, 0x75c2f30a7c7433e7, 0xe947a55547c7c099, 0x245c7ae44f6e7a83, 0x67a666f9e6bec2d4, 0x5de0b922fa645ac8, 0xdd9b3e4a5cb72e22, 0x0139c2c857adba8e, 0xa7feb68e863ac231, 0x501381ef88ec2da0 + , 0xb2b8c6a470f40b01, 0x051d65bdb8363062, 0x4ce90414a6d65714, 0x1e510b525d19df0c, 0x569e723f5d374cf6, 0x4bfe02fd38fde1f0, 0xae7459ebc50f9aa2, 0x0f7e2cb170dfde32, 0x3c3da2326a7407cb, 0x0cfc50a85ffd1842, 0x62ab34c85e85c3c8, 0x22b4d9644bb37333 + , 0x57d313b3d87c2d98, 0x4f432c1cba49133f, 0x6163d11fa4befc0c, 0x1ab94e122fddf12e, 0xfb7c9358aefc85a8, 0x5b20068f81d949b1, 0xcf8ed6ff2145c810, 0x5794afc021932d00, 0x5c8987ad9b6e35d5, 0x6bb1f4b836fda03e, 0x794f1fed4a3ea1d7, 0x0cf6d128deb0e7bf + , 0x54ec3e1c65878cf5, 0x002811763ba2200e, 0x382d917051e77b71, 0x49e00cbd013a9e7f, 0xccf576e9a4cf019c, 0x4b4a66287970333a, 0xf772168915edfc1f, 0x278eb5eca6479685, 0x8a95c8b9cf41cf06, 0x6e58c9c7826d39db, 0x478e119889f2fe75, 0x73ecd21991bd98d4 + , 0x26e751fe9fbb9502, 0x29825b71b0632e95, 0x21668f96ef8bb5c5, 0x2f2a899e53c9a004, 0x2803292ed4345ce8, 0x72731055c7c65dec, 0x3aaaca9c4b6fe9a5, 0x6228d3ceda8bd671, 0x773e2c5effc48eaf, 0x017ab19e0fea9ac9, 0x9609e10496c8d766, 0x121e89f9b302c30f + , 0x4e87d00a0be96480, 0x09bd8d170ba9dbab, 0xc6756f947ecd4e52, 0x2c9e40bbbccd0f5b, 0x42a5b77669fd812e, 0x66aba9583b080d9e, 0xee55df99d16e77c1, 0x4cc00c5c5eff2509, 0x8c84d5e20ab7c16b, 0x00ae5c96184ffefb, 0xb295e90346dcef54, 0x5d1bda0a39dc3b72 + , 0x75f92d72a89b5ef2, 0x259d998c9ff9ac0e, 0x8a1cfb72a6c433c1, 0x23f5b71d49d67604, 0x478d8f30914f62ef, 0x08fe61135218eca9, 0x4da2ce9bc6488c4a, 0x15f1eafd35283e2e, 0xc2d2be3ebc42ea0f, 0x2a5216539d6ee902, 0xa1e99052e7bdeeb2, 0x3a8f2631ec78290c + , 0xb71518a82ebfbfe4, 0x24700671c46ebddc, 0x6ef52d591a221f75, 0x4794614db6a67d92, 0x761f5c8ee4bab607, 0x31d9dd8f2361b5d5, 0x1a45593be8db3b29, 0x7f06c365eb116260, 0x9d305a66e52eb65b, 0x5edcfcb5613eac18, 0xef34fd28154adb75, 0x790f805753b9d742 + , 0x6ecd5ac255dfb797, 0x0cbe14db5d9a88db, 0xc1c86c5efa815528, 0x2c636133ba59d887, 0xc75d42c2d9f52297, 0x4bd3540c21e2ebd3, 0x32e7cdf790de6903, 0x1aae3c9837d3e30a, 0xeed028e49d436f09, 0x779ae12351efed1c, 0x6e0145587d9797a5, 0x25156e4cee9a407b + , 0xac2fd82f2ac57119, 0x7f8c026f1d182ed2, 0xeacc0d8fb3241611, 0x5968db65d2d7545a, 0x7d525846b1121dbe, 0x57949fd7b80339cf, 0x471fe9bec9b66c01, 0x5c270057f1268efa, 0xce092463083f656e, 0x16e8241cdc862cf9, 0xb7cb2bbcaa06b312, 0x3c25936bd8863416 + , 0x19b8ca966c4a3827, 0x1ae43badfd21e63e, 0x1dfd002b95a6ac6a, 0x4708e27f6d98e997, 0xb5fd6322dc31ac7d, 0x53baf4d9a16dd550, 0x025aa2ea5463960c, 0x5b5b33c7a3cfa54f, 0xdba287866ee96b90, 0x4748c1f3f3a6dc4f, 0x2333ec05a80c154b, 0x4a47745d5b99fb96 + , 0x44955b062a6ecded, 0x7791feea9015f170, 0x736bf603d12fc35a, 0x2632adbca5388026, 0x956e4c48e1697c4f, 0x4ee9adfe8600e32d, 0xa584042a0da56406, 0x34a3d7f4bf457353, 0x8d4fd4fe00176fab, 0x15321ee855941f4e, 0x670701ef81f340a4, 0x0c7d7c618aed0ba8 + , 0x73283131d9bfd9d6, 0x34935a39e31bac65, 0x466cfbbcaae8b991, 0x250dd54e18478ac6, 0x659e46c51e40de4f, 0x618ea014fec50e04, 0xfe64d883080b877c, 0x572cabbb6688c4f7, 0xa2c817493a834146, 0x06cd734876378120, 0xe3de0b717336a849, 
0x36942f5191db53c4 + , 0xa3f9adf66abf4d88, 0x2a9a144b8087fa96, 0xfe49fefcb78a5b4f, 0x1be40a8616928bab, 0x07a901975521f7aa, 0x1fc66ea683693510, 0x4dbf0084ba42380e, 0x1f374495b918c737, 0xb8346956a380a00a, 0x1346f4766fcdaa07, 0xb4db5689d46312c1, 0x775e7f3274dc1316 + , 0x07898828f32341c0, 0x144390a33b3e86df, 0x70bc604ce1e9c5e4, 0x127652de00220873, 0x2874bc669df50d45, 0x236f4585150161f4, 0x3bfa4ffd318214e2, 0x7cc92a6165059745, 0x2fae0e92090ef72a, 0x26676bd59c4fcc3b, 0x220c030974d1d447, 0x66455887e98686e7 + , 0x4164b8e4d8760ddc, 0x5517a86f840feb63, 0xd9b42c6c9371cade, 0x3a7f03ceecc160b9, 0xdd4086d64cae366c, 0x1b6290c327842533, 0x144efcd2a7a0e82b, 0x16621925ca10d31e, 0xa9dcd13118e208f1, 0x5a90f97edcb1c54e, 0x80c47331c8749d99, 0x6f061a3569a80b55 + , 0x0f6abf619e2a15c5, 0x29106c98122245f4, 0x5860b10985c9b47f, 0x4f379a379e15f410, 0x2dd6f45df68e1678, 0x2c475167ad9b283c, 0x23b7aa00952a6a3a, 0x5532bc26a40c5365, 0xa5c0a8be3596ce22, 0x4fa3127a9aefa56f, 0x944e843aa973e67f, 0x3c7727d45ae87854 + , 0x48fa2ce675117ea4, 0x7bca8e04ad3bbb9c, 0xd57439e4726f88e5, 0x3337d3a6a03b2286, 0xb0b6172902005953, 0x514bd76734e6c0a1, 0xf97f8934eed7c6b4, 0x0abe13cee7f1b75e, 0x6c88107a120e54a7, 0x634f966d7a6e11df, 0x5044c53109b94097, 0x68d49fc65522b73a + , 0x69e295cd8c444666, 0x542c4c5fd999a224, 0x13ff89418b5da76f, 0x7133fa786a87ecb4, 0x2f180926456402b4, 0x52ddada7931c4dcc, 0x6eaf0d2130c71590, 0x014ec2a2ec231826, 0xac05b61443b34dd6, 0x157acbfab118b219, 0xe4e2f4b84ad01099, 0x0abf4a4da29a0eb8 + , 0x5f852b85b59eab1f, 0x1bd259c4726869ed, 0xce565d9287790a15, 0x17a48442bcf58a00, 0x01e519522381363b, 0x2336d07a710da07a, 0xcfebf2fbdc714cb2, 0x2f7a51474c23b8a9, 0x77db2a07d4e3716c, 0x40e8d8d2d0a09806, 0x644363ce6d401ae4, 0x53f9cae0470172fd + , 0x58d96ecd8ddadc53, 0x15028204f3d6d696, 0x6f40a09214439ce2, 0x738c5371236c3e56, 0x64f87ee7a28bf9fc, 0x4f1899449a810fee, 0xd0aa95f4bf21e376, 0x6170cc24283856bc, 0x9dfc4927d764ff75, 0x227ea1563fa2e012, 0xaddd3665622ce087, 0x473d3bea07a5285e + , 0xc0b986ee0d2b0eb2, 0x78e584c740dd18ed, 0xd5adbf30a04fd508, 0x1c6aed5ab59bedbb, 0x25d05fccbddb5ba1, 0x4a58fb6b3f896319, 0xdb2f6343fd8144fa, 0x46a445de6d5b07e5, 0xf67a06684fe9e1da, 0x57b2515923b15c9f, 0x50439940820a2a0c, 0x62f4b9b26f04dab5 + , 0xe79ea601d01b033d, 0x009bc6176f10fffb, 0x333bff2f907ed39a, 0x253d0a9e626dd400, 0x7a9bbedcfcbef06a, 0x2d1b6a7a5b39342d, 0xbadfb462a124cc9a, 0x2e8cde9d82c15cb0, 0x7c3f81bcd6f1b2a1, 0x04cb0b8fa4075294, 0xfa36d3db38cbd304, 0x59fef93442883553 + , 0x91982a741cb9342e, 0x7b9d63ac17b01982, 0x530b4ec25a293ece, 0x611069ad9fa0f0a4, 0x7a262a59b656a79d, 0x6fe6f8f4d6d015b0, 0x2c2fd7641a5d4e50, 0x24b0c507058c911c, 0x834882e492fe45ae, 0x68d0b01b13432761, 0x0eacaaaf94178b8c, 0x123e3a93006d7d01 + , 0xecf2fe69377ff33c, 0x4fc960ab4408584b, 0x2adc445b1ee45654, 0x4989681cd1d09a93, 0x79509599afe9e3b6, 0x7f6ffbbeee861c15, 0x2ed2859fd6391b25, 0x5e8bd52289b6ad27, 0xc949280adbce7c79, 0x510999e865f0cd54, 0x7f957314ce7d373b, 0x4b2c0ea4bab08ef2 + , 0x2d7cc08b5c05a8db, 0x4609a0ea23507697, 0xe204ba35182c55b8, 0x5e4d5903fdef61e6, 0xfe63842f2826598b, 0x782a3fd3ab62a179, 0xd2f01a1979e5a0f3, 0x0fb4c6bdd637fba2, 0xfbff4c192020c350, 0x14859008c3d223c0, 0x65ed7a889c1a2e55, 0x1d78daf483fa12cb + , 0x5b54d11b01bc09ca, 0x54fde75737306515, 0x89725231105b63a7, 0x712d1f394adcda99, 0xb554006ee9abefab, 0x04dd8f7bbd4c5381, 0x98d22b3a31995549, 0x637a53de6b57122f, 0x8367d69b4c92da63, 0x236f2a9514250df6, 0xb265509af63d7b7c, 0x08522e36bc4b65f8 + , 0xabae725012ce8301, 0x493b257197a98ce9, 0x33185838570e5f0a, 0x65f5477ac414eb6c, 0xd002a36854699753, 0x2be693b4d96efdb3, 
0x3b32484119bdc53d, 0x55691ac09a8fae1e, 0x0249e394514c047f, 0x765674c90b78171f, 0x1166f64638d6ab37, 0x746adba4cb52d18f + , 0x93e293653dda6cda, 0x5d004ed52ebf0b68, 0x65c7c42d0ad96cc2, 0x3350dbe11cafca74, 0xc638cfa8942fef67, 0x0ff2dfffc5ac1164, 0x9e1b625e649aa471, 0x13a219d03d2eb86d, 0xdb92859ebaf9f7f9, 0x645c50918f7d5abc, 0x25c10cfe99f7e5c6, 0x13d858b53f90170d + , 0xddb258b13ab1e7a6, 0x4849ff49f4e13fc4, 0x9ef87fa85511cda8, 0x48c50d4d3b4d2f7a, 0x6c98422c8007c9ac, 0x3fdd72e65a3d3491, 0x56b18cb165b4ec3b, 0x6e2c6df9e3fc3daa, 0xf6db5aa98ddc97a4, 0x423fd4082f3fb795, 0x42f8f5edf424d0a0, 0x1a091c2696139936 + , 0x3161c2bbb3b2d58a, 0x2e8d339eb0fb9099, 0x45ef7d11f6fab685, 0x7f222a068db3da4b, 0x9af96f9742549a7c, 0x55370df31dcec81c, 0xde98e81b131af02e, 0x58bd0622a474acee, 0x8ab40fa7ca882e0d, 0x5b4db195655f2410, 0x4754eb479ada77fd, 0x67a8a437d6fc8a7d + , 0x9888254a4f0c9d58, 0x3232ba83bed0c618, 0x587b0de0207b57d9, 0x020df6becb096aa7, 0xef9e41052a29a8ab, 0x4ae671ee70a15a69, 0x167ce954923ee086, 0x6878c3996c1de887, 0xb29c711490ac097e, 0x1cf41a9c2577d144, 0x0590796ba46d8d29, 0x1c2e6dc8d4aebb65 + , 0xbfb904f8ac9b4cb9, 0x4ea1742c786469e7, 0x5a422f48401be57d, 0x0be0afdc77d6d32f, 0x5e8765cba2c738d3, 0x7dad0475059a089d, 0x9288ae0c40df7df6, 0x51c65f97715a16d5, 0xa9615d4c786ff9d4, 0x507ffe03ec0189ef, 0x1c1f46684604e41f, 0x282fe9d567db0efc + , 0xebee7f8381fb8178, 0x5bd4b6045c208d57, 0xf35694743439ed71, 0x7cddd5a373ebc5ec, 0xa58df33cc68e3b5f, 0x40e6714f5c5c8df3, 0xea881d4bfd489131, 0x6b36400b491c28c1, 0xd4475cf594b6303b, 0x5b630cddc72e654a, 0xa0b587ad34394ce3, 0x3ea3ba6014f86275 + , 0xc3deac125d20eeee, 0x2ef3568410a2b3bb, 0xee6ba3fac5d7ec00, 0x5fabcb3337aaa23c, 0x6b1212e7b817889a, 0x0b37d285a9be51d1, 0x617ca543d762bf51, 0x0896b4ca694b01d0, 0xe3add9718277a1fb, 0x553dee7dd4784865, 0x904b8f7e936cf430, 0x5b6a78f20b244b90 + , 0xa2b876c2914b9bfa, 0x704de952e9d969f4, 0xb04ea1b54b7e7654, 0x5d307bb3949cf660, 0xcee4c23ebd049d17, 0x7a88293bb1031063, 0x00b8432b8286f656, 0x260a9c86a16216e5, 0xd140e6e6629d8686, 0x296011ff5601a000, 0x536f0f76cd9b2928, 0x267409c23a823dd4 + , 0x0f041043797f8423, 0x3da6102605962ca9, 0x2e69dfeea02098ea, 0x427e7eeeecd3a0c5, 0x75efa5e8a590793d, 0x1f5841df6dfdfc91, 0x1aa1e1b8b9f3c326, 0x07bd5b0983fcee91, 0xd169420be9c48939, 0x7940334f0bb9023d, 0x9bb330fff113764f, 0x674ff1b0cfe246c7 + , 0xe2083f8d7129cbab, 0x7e6223e3d9c04904, 0x9be411a7d5e883a3, 0x72642664e7c25590, 0xbb1f783b5c412322, 0x46716e8fd737280b, 0xfa363eeaeffde271, 0x6c256c131fc2c3b9, 0x13259abfcb2ce1d8, 0x53b96556e96aa708, 0xfaa7c8d25119da19, 0x05019f438e9f8995 + , 0x05e1d55a9424f1ee, 0x63e8e14e6c2f3f09, 0xe9d844e997a10158, 0x51904ed1e94a0ca5, 0xb09462d4df6bc6cc, 0x2ee5308e62172691, 0x3f8438484547187a, 0x62b92b8d9739ddd4, 0x3ca54ab5d39f083c, 0x25b3336048a288d4, 0x7cab0fd67e296979, 0x58ba2e783962cbb7 + , 0x77808f1a1b8f3515, 0x290c219ee7153bdd, 0x7584441f79128f01, 0x0442db406f5135e3, 0xe741de52ec030a9d, 0x37469756586776b2, 0xbd64c2a7173adde0, 0x2280b66d20888d0c, 0xdd1b53cb4adb0fb2, 0x3974964394c445be, 0x53b6a95e7c7fdd97, 0x6eacdc6f50496d95 + , 0x178d04c0578a5bb3, 0x0d171a5f5215c9c8, 0xfe0d0171c504962e, 0x04eece54b220495e, 0xac4d145001db67aa, 0x6577c466962160af, 0xcddae62d99686ad7, 0x7a053a048d230d89, 0x1ff09aa0e605a880, 0x5d260426f355232f, 0xfbdaf7b0b53aab89, 0x5eef31b9eb0df78c + , 0xfb787e56b7276288, 0x4dcccba87d630d06, 0x415e4a4bc0a44b01, 0x0f0a981f71d8ae33, 0xe0ebb786f98a1502, 0x0ea4aa3ce70dc628, 0x8d36240617ebe037, 0x2d20c0e1d2002b5b, 0x336f8aa411a30282, 0x1d87c67d8178ec4c, 0xe468dff8ac26b63b, 0x266086bd7f11c9bc + , 0x05cfeedc80d829f8, 
0x146902a029dd3355, 0x413db9327c068394, 0x55fa413791f64c38, 0xe06395c10021bf9d, 0x18d66268cf79ce45, 0x9e7ae6858dcc21bf, 0x3ad51dbe97b558f7, 0x06792c747aeef43c, 0x27ec9b782170abb7, 0x6aafca394a23e935, 0x18f7cbd98db64112 + , 0x34146ce6b36edbfa, 0x1dcfb4eab7ccea23, 0x68498e1f45b35467, 0x1b20d71a3b71d412, 0x7a875fc94e602e3e, 0x78c15fa449576c2b, 0xb52326d01ccafe8a, 0x3f53f57324d70666, 0x3830836e39bcebaa, 0x27a30c73dd02c884, 0x5dfed73dedf2306f, 0x75ee4a8b6cf54f74 + , 0x97ecc9c5851a8e3e, 0x496b581690c3df2d, 0xf7bba1fe2d169e7d, 0x4b06184810a77bd3, 0x40e6d643b903c7bd, 0x3c90f63b5176906d, 0x92f47e1ac51f1ec6, 0x70c2454c53cc0dcf, 0xb5a75d246c653b4e, 0x7e5173a420a8b0df, 0xcafb44c471d0f4a3, 0x69a3a4e92bbe5977 + , 0x26e93183cdfeb424, 0x1e0489b56fa7e130, 0x669befa672fe9979, 0x0f8aea6a7ef65bf9, 0xff0b883ea96b51ff, 0x31a668763c3c8867, 0x6887a0029701c9be, 0x545644cd70c87d63, 0x537b6fb7db9410e0, 0x6ca227f10229b3b9, 0xc7d1b4d71ff22468, 0x522058d3b20569f9 + , 0x5f4bfd813a51fb62, 0x105b94a3a42424a1, 0x96dfdb685825857b, 0x14d98588154500bf, 0xb4db83514c7a9404, 0x67aaf998856faf37, 0x1229d7e95dbc821c, 0x7e617a17a2f72bd3, 0xe964cdba7222695a, 0x677619cc40a07eaf, 0x7f82c099a8df7538, 0x2a219175ec95a1ad + , 0x755ac147b51ff3dc, 0x4a87f652f86823ec, 0x6d8d4a923f50278d, 0x4bb952ac98c0120a, 0x968c57a6a31e482c, 0x0855a11481fd5653, 0x3f05db6ac608d16d, 0x33f9e5746e1079c6, 0x1f3458e3ec51f53a, 0x4ae3fc836ceccf81, 0x3c0b2e2db5875ddf, 0x42336a1262cbb5e0 + , 0xe3651453cadc3868, 0x25081cfd6e80a2de, 0xd4cb31092872e53a, 0x16ca9349a11a9c37, 0xb1d3ae440d1cb675, 0x41b2d6ecbccbd6a4, 0x475e6a844c3d0ca1, 0x2cd0e0dedbf07023, 0x85ad446ddb002a6e, 0x72a06e5419a64609, 0x9e779387e9a3276c, 0x414a8163a9408b10 + , 0x25c7b53c1791333e, 0x3ea57190b42cd838, 0xbf20b346b094f121, 0x47570cba99b06c9d, 0xe6bd01c8746cb5f2, 0x3c0b0b8c4c0968ef, 0xb22009690e243975, 0x251737e4a5643da2, 0x3cdd49123ab89dea, 0x68748cd1e3cc45a6, 0x563746685effea7b, 0x4e4c5b1c86eb3a29 + , 0xe1ba017516d32070, 0x5cdd35a0c4ba93a3, 0xdbc66a0c7de30288, 0x22107156a0f700f1, 0x0fb69045aac0f647, 0x111dcb9763d08bc0, 0x266db39f6d78cced, 0x02a32587c7033892, 0x76fc94ce6a2a4b19, 0x474db0f12fcfa96f, 0x0c44584c08377ac7, 0x5f435bf43140f4c0 + , 0xb9741c3014eef7a3, 0x54596c23b536ff04, 0xeadf56bb6ea39450, 0x32f24f6e1a656b10, 0x21422e4dd5f54e3f, 0x0d6ad57853660607, 0xf6f62ffdd0bf9928, 0x72569c930015caa7, 0xf4293579931b9216, 0x049d6a4057e6827e, 0x6223e20060be0e05, 0x20d91ae969dfa9a4 + , 0x02611b345456d47a, 0x601dd413d1bdea0f, 0xe6b017b26bbc9bf8, 0x63399ff3d6542359, 0xdbdfe225045a9764, 0x10acd93346649beb, 0xc652d5a50e0535ce, 0x49efbd5639c4caf1, 0x65a5dbd8a304de65, 0x08ddebed0e865be8, 0x5db8337d5e715261, 0x34cf4c75496807e2 + , 0xd840c7416e44b56a, 0x10fd30d282d8b151, 0x36ffe6df2c1c9568, 0x66d8a38b6d31a2b1, 0x01fad3aa61984774, 0x412a9fd87b303d90, 0x2720945ee0f0ec9e, 0x0c91b4c7ea84cf37, 0x98462f25fd5832f0, 0x6f4cd578c490d842, 0xecc7d24c31ed3342, 0x580ab96994515fd8 + , 0x6d8a97ed98465b3e, 0x16995dc010908ae3, 0x50626a4e555b774a, 0x082636e5a8a9b568, 0xa99435cc4823b413, 0x41fc423d10eff4e7, 0x114236dce6f9f9dd, 0x6c3995c4bbe0aadc, 0xf3f22c975935753d, 0x6b1b3f27edec2a78, 0xdbadaac32ccc292e, 0x3856036f8a3795aa + , 0x947154caaec01d73, 0x0a22e573e3f0f49b, 0xc50c949f39c184a3, 0x2aadd0868535d0c8, 0x22bc5bbe5f992446, 0x15d36adfca3ace90, 0x038010e37a6308f9, 0x161b06d8d7180307, 0xcfbf4e3abef8d056, 0x2a1765fe9c7696ba, 0x6a15d44ce18ef392, 0x5405239c0369de64 + , 0x5fabda1210f58e29, 0x40cbb03974b37035, 0xa29fdf2875322520, 0x3b32ace85edac547, 0x0f0c92b41d679df8, 0x7f07ecd47a7d2f0c, 0xb5fc65c05accc95a, 0x0e8b1da70636f221, 
0xb2ebd131f4e8a846, 0x7df51e4aba57f391, 0xaa2f3d40fef689ed, 0x0ee1e115fde5d582 + , 0xf7d025b42e240ae6, 0x29fc1befeb526af2, 0x7c5ffcaff205e565, 0x4cf4d0d8840e2e1e, 0xb8b00d1810ad0ff6, 0x44d3af686ba915ff, 0x86a8fd1eeea8d08c, 0x3eb300adcf6edc4f, 0x8db03c266b588186, 0x289d0fd301e96881, 0xba83ba260cccc170, 0x26ee69546ceb0c77 + , 0x1109d8bf92c4ea05, 0x033aa036671937d1, 0x4bd9902e5a664a0b, 0x42bd48ed44fdbb71, 0x7359e19357a9622d, 0x0d6ee92855dae22f, 0xc24debb323643859, 0x4c60fee1e191766e, 0x3beaec0e99faa328, 0x056c2ae1709c5b0a, 0x7fe89e0c62710909, 0x7e3b5cd3ac4e6ce1 + , 0xe9d06486ac7370a4, 0x4b1a8c62e99f9429, 0xb11a50e20bc3197f, 0x75ec513c25dac300, 0xfb9fd064b1466dca, 0x290379cfce59308c, 0xca3ee3fb7db99943, 0x2af7a3e930faea44, 0x0d294e6d1505e35b, 0x7d534585181e001f, 0x90285700831d4cfe, 0x419f25105d06c90e + , 0x5f71e79f5f828172, 0x02921e2a43326798, 0xa0981553e84d4a6a, 0x220c82041938573d, 0xfd2b5b78ef20c927, 0x3c99a2dc611caddb, 0xfb1247fd99ed2828, 0x4b3a3739f724890c, 0x7775ea2d7d2d1017, 0x3ab07cb5ba8ac987, 0x82e5123a20a6b5c3, 0x44965098aa82161f + , 0x20948c77e9ac4c0c, 0x521e934ab214157d, 0xc8f4f4052dffedab, 0x1da963c2ef46f27f, 0x3be7631e212fa2e0, 0x0d188e88d1a4184e, 0xb4483ed385de4bae, 0x4ffadfde83d2b0d9, 0xacebd9a51a938608, 0x40968c0c9302b0e8, 0x85704404d06f3a5d, 0x3e9f477a61a26d37 + , 0x1da1efc7cbd18d12, 0x4fb87a47b9f2cb04, 0x7556a45e8b5c8caf, 0x7f6991b7723b35cc, 0x3fa10a169532635f, 0x15e61b1cd72bd52f, 0xe6b45dc3b4667c21, 0x45cf3bd4bbf39baf, 0x7343b0636a9d63f9, 0x457551c49ac49567, 0x331e611a3fcec018, 0x7d19e2584756b92d + , 0x78951df174059655, 0x0573cd896a793337, 0xb3e37121fd458870, 0x3cc032b1a1bebc3c, 0x2571dd06d24d5a41, 0x017382ec4aa29ffa, 0x6cda850c15a224ed, 0x6af59bee2d7586d4, 0x287d3c4027f80ee9, 0x6aa570b9e51d4f25, 0xf29f327c5e0490d5, 0x00fb62f93f43edfb + , 0x7b06e602dc313277, 0x5d8dc98e723b039e, 0x5bb61813041a589a, 0x2a4c9f13eef7f1ec, 0x9439edcb4bbaba6f, 0x027f4d494e7784ad, 0x087ae2a2fd6bbc8d, 0x230f37ba41aec2ff, 0x63876e43daaac09c, 0x28abd7ae6e17dbe3, 0xd354d50cf000982a, 0x1dd774a1273aea75 + , 0x243658930d4b0902, 0x0df50723a2da63d7, 0x22bc07b9ac9628c5, 0x134123d68aa939cc, 0x4e84ee2cf0d450e2, 0x53a8c6dbd4aa9ed1, 0xd06e741c45610565, 0x608da7f96f2f7e19, 0x59b7fc9fe6a0243c, 0x0da36bb46fd1eb3d, 0x09a11de836914182, 0x3becc1cc0b96f1e4 + , 0x820b8a4cad71c17f, 0x2a425dd0204a843c, 0xf6f7fdaae1523c28, 0x5fb74c0c961e6fb1, 0x0c76e0f72b7845a2, 0x273db117946ce778, 0x7a22d35cdea5934f, 0x73aeeb1b24265d5d, 0x938a618552e4392d, 0x6050215beb6c1923, 0xf32f6ab781efbf2f, 0x2e4ece5c476e1354 + , 0xf2a4a59613812356, 0x555185da018933fd, 0x2fffbf95863bce54, 0x72644f9c3181e7a6, 0x98c6b1d509e3d624, 0x5bddd5730939d7d0, 0xdd197613d550fbad, 0x7671fafa1facb923, 0x13dbb61148c5b802, 0x616bc5c73ccdc3bd, 0x0b175b4c46fd8871, 0x498a1eeb000ab870 + , 0xa49f1ca2d7802521, 0x6906346cce00be5a, 0xf1bc33c727dd52b0, 0x5d005ff3122fd749, 0x51318ad5d7c622e7, 0x50f93d6d15e46e82, 0x88dfa2123ffff3b9, 0x3848e6fce3cac6e5, 0x6cefc31a33ea4f5e, 0x0cc5e7dc4e5e144f, 0xee2009402e59a7e2, 0x257679fdb86f4712 + , 0x4cf68953d8b17e83, 0x710f970c16ce2070, 0x4000b8e9e51e6aad, 0x5af48dacd01f24f6, 0x209679d5d3fcc916, 0x0a3538dd7cbe8232, 0x2d6d7aba44d990d2, 0x46c718f2d4b2c1a6, 0x9953d799a378233c, 0x4f4e80f4a682e7a0, 0x9912f04acbb77eee, 0x317432079a195b2d + , 0xaccccda6a1c11e3b, 0x3fd895817d0f3be2, 0x016db17673f750ea, 0x635fc619a24009b6, 0xb8447ab3370da1e7, 0x6c893aa19abf4221, 0x5f35ac703d8508d0, 0x13533d324d4adcb5, 0x84610370dece8512, 0x2223f126f9a70f4b, 0x18f00d60f3bf6a04, 0x174bd78b20ef8543 + , 0xeb179bc6a1698189, 0x732bf44a62015302, 0x98352342bc0e4bc6, 
0x053f6640c1549e85, 0x65eee8b0397c7ce8, 0x790451f39f2fa27b, 0x36ffa0cb286cdb97, 0x46d07cec4c967bf2, 0x7c849ace30868412, 0x6dee239d339ef499, 0x8ab78548f273e57f, 0x01c5bebd8b7f5ef0 + , 0xe440e5f042eae93b, 0x65583f57fe057db6, 0xe6d5d26c24a565c9, 0x6b3b87a0a6ad702f, 0xd3f5d533117b8e64, 0x4addb9d0da92df89, 0xf1bd51990e0f9bfa, 0x30c624ec1dbcd0a4, 0xafaf2f00da7023a0, 0x3086e132b54574e4, 0x93bdbd4bfd3dd8c7, 0x690976ee132c892e + , 0x86fc11c79524d198, 0x0f6b95662e02c734, 0x5b78bb385564f568, 0x55c9b3f55d7cd16b, 0xdf1316434ad1c07f, 0x093d67d3fdf312de, 0xa1fd2257ea57b3d6, 0x4b5b18abe4b54439, 0x66c28f5b59d796b2, 0x7baffe6e642fdea4, 0xb9d3753265e68ae4, 0x40903bd6dfb02d6f + , 0x357958d4d72d6bc8, 0x179330dea4659dd3, 0x5a9ca85bc8721aef, 0x0209f09e03c9b225, 0xc0bf2e9738933495, 0x5e0dde4d715e50c5, 0x2743c96b66a6b951, 0x6af96188a0d6d358, 0xb2f3c72820f2a709, 0x5e9b8fd43327d9a0, 0xf0b13f5324012177, 0x7abdeaf4f741bace + , 0x6f006249351471f7, 0x3204eb91cfe9ed6c, 0xe09af1c83c13afa2, 0x6d70ed88d5de535b, 0x2078873d1a2faa1f, 0x5c73bedb8d96f3da, 0x41bbb407a3a1ce1d, 0x7a40ec2fb54eea85, 0xd6d569cb9dd722e3, 0x10acf67805927b6a, 0x27c61d818cc0ea05, 0x57b175c9f59904e2 + , 0x4f7b40bc92b5a60d, 0x51431f647b46b89a, 0xcd84dd55cc2a720e, 0x6b36059700809a1c, 0x78e3e5dd060e9a0f, 0x630c0c1a146c77d4, 0xc9925b0dea8fee2b, 0x4728f0604b16a06d, 0xb4601050635b2318, 0x2484f7281864709b, 0xbe2ed2a2523211db, 0x6425d4ff23dd3a5b + , 0xf0868c09017aef5e, 0x2733d1e1adc6d5ee, 0xa631db49f17f87e9, 0x36d753ced54d5727, 0x451d17fb6c4af537, 0x1dcc4d611dd55b04, 0x0bb8de0c8d3e549b, 0x2fb2ca1271592c3d, 0xd877914ffbc31ced, 0x190809a196504d10, 0x44bdd65a970277e3, 0x13195c678b4b01fa + , 0xe69a41a54f84d41f, 0x61c7c870565e4508, 0xeca2d2fc6f0e1c9b, 0x7f065480e257152a, 0xfaaa9f7c3a8873b0, 0x43fcdb8db58a324a, 0x969a79026e9da7a2, 0x4eab135af328b9d9, 0xb38aaafe87f85f7c, 0x69eba4fe1a6b6f32, 0x5607f6c6b4d27cbc, 0x273072bea774f9e7 + , 0x3c1149e3c8d51db0, 0x161f8cd433c28bfa, 0x765a61f218fe70da, 0x442b5d405f2036bb, 0x96f790271c564cc1, 0x3d5dbb33505cc956, 0x621a38b446af395c, 0x2da978b45bb70ce6, 0x755aca711da49388, 0x46f2e33e55e86df8, 0xfc5b454d5cb7be24, 0x67df47d68d8f6d12 + , 0x7a1e224893898aad, 0x0400219c89c2d13e, 0x6c969e4d63d460d9, 0x4df64d5df8b60ad2, 0x1feed05a45ff89ed, 0x290c4b59e684b4ef, 0x97ffbc3df096adb6, 0x4ac6037e76561c96, 0x1bc40299115e51b1, 0x7169e0a1d96aa1be, 0x43f55f8b6bac596c, 0x1cc6a0603081a178 + , 0x8e1d2db69bc925d0, 0x6ffb86eed51d2931, 0x3ad1eb242e0af1b5, 0x338198152fcd6d7c, 0xc1f381496df13943, 0x05d9242fe1c60b02, 0x39617510de7eec81, 0x24d8ba5ac76b12b8, 0x280eb2db9e548483, 0x6c51317b3a8a93f0, 0xb2a9f90939bd1235, 0x2da9de86c39f9aa6 + , 0x7f54917103127b97, 0x7be2be5ad3276169, 0xc969d703d31e9da7, 0x0500df3bbb1f8a4e, 0xea05c77685795917, 0x049575a992d09345, 0xd567f8de2daabe35, 0x383fad35a8e035cb, 0xb9353eb2bbd43d56, 0x52b3953221860c5a, 0xf9e4bcd46dbec03e, 0x4b0db0b4a7b3279c + , 0x8cc5f6b6e1ff80c0, 0x1bd2ce464b552215, 0xd008eb25b39c4236, 0x3b4ce5bb2f42a9fc, 0xe1f249681d153d9d, 0x3e022cb14bc4c5b9, 0x8a11d021c8ed5a53, 0x560d3fb258bec495, 0xf4405852705a6012, 0x5c8bccd2b1b3efd3, 0xd93c0f63ba7ce0c3, 0x337798cb3e93dbba + , 0x7a9f68cf800c8e88, 0x579afe689f3ebcce, 0x7dd41d6cdfbdb4a9, 0x3802410c4e1b274e, 0x64241d770cf0db02, 0x2f7c8133c74bde23, 0xf3c3fd835ed1952e, 0x741b1d88a3cee37b, 0x74e1ae644683c68f, 0x0c80dd9e0f7a91e1, 0x3984d741f3e47c24, 0x4b3eb97b6a39d252 + , 0x32e9b9410da9a195, 0x11d09fdc04ec3b41, 0xf92fd5e53cddea30, 0x296e095589e0ce05, 0x4e3200c3a283b696, 0x7e33fbba44ecb32c, 0xed3c039790ad0033, 0x5c8ebb260b5ec084, 0xa667455bb79d2e9d, 0x12fbec9d4f5bb155, 
0x3aa5f6bb4d0d8d49, 0x0ca652ed7065d80b + , 0xb7938753d51c6f83, 0x41644ac1a602f9f2, 0x84223d4d63c38f7d, 0x71057b4b8b931282, 0xd39fa015165f47b5, 0x7536c8a19c33c201, 0xbe713ca4166c2dad, 0x456c98c2b4198511, 0x4793f25e1cb44658, 0x1d002f1cfe1a1ba7, 0x9f9ed6e1e1a27957, 0x095dece028426bdb + , 0xe57d3412fc1001d6, 0x481c63a0d9b25e99, 0xc756b6ba0dc02aa5, 0x24af047d79ed4683, 0xe37ac10133b68275, 0x418b45e570802012, 0x87578def0c3900ce, 0x7c5661923b8c9740, 0x5f4ab0a6fdda7366, 0x0ac6100825e4eb3c, 0x308528e42c9e4d32, 0x436e5979933ddde8 + , 0x0cd6ebe123352222, 0x63d1768a46f33dc7, 0x96cc55dff38c9273, 0x474438da7140411d, 0xa184b89b81cf6402, 0x6bf820a3aa675050, 0x3bd4720417391f0e, 0x3f2b8f859a8e0cba, 0xed952561b125da29, 0x07eb1ac74165097d, 0xc3f70d0c7db0a9fd, 0x5ab896a489294a6c + , 0xd4b608975c20018d, 0x6243b039f25d0456, 0xf766e98fc24c7464, 0x20035c09d2291e42, 0xcc0e5b5eeb462524, 0x24bcba5505f90657, 0x43a98d98e4fa9bf6, 0x3b621ec4188264d4, 0x633472fe235c812c, 0x31a20844a3316d23, 0x47b80db7d7f5d0bd, 0x22d482f5663780f9 + , 0x4df227dc52142020, 0x25076d0624bf137e, 0xcb4a6ee30a657645, 0x0ce469dbb5ada433, 0xfdb06251f65b9c5b, 0x44f82274a8e8f538, 0x98fa4c81cdec4b97, 0x0ccd61d1abb61d0d, 0xb9dc371344c5ab54, 0x35dcd9ccf8e5f919, 0x67fc81f369ba5722, 0x121b5aa1af6024da + , 0xe0b1b16b0fb1f1fa, 0x4dc688d6d3b1805b, 0x05c187cf10e40104, 0x71af39c743daacd9, 0xe691e97f82acf4b3, 0x0c46305b9243bf5b, 0xb063af137fde616b, 0x4e26e72a1de067f6, 0x61fe66d01a221004, 0x172fe9240cea50b1, 0x4ff50d37b2effefc, 0x06be02ab0b89aa5d + , 0xdd4aab96717af213, 0x32322555b58a7ffc, 0x7812aa965889326d, 0x1bd608f60d6457a4, 0x2c7b6b44e999e141, 0x113a86a87856a8a8, 0xd95469fc33814855, 0x4a18dc36f6bfd586, 0x0706b60bdb854fd3, 0x4dc356685650fa90, 0x24ef7cfce41f8dcc, 0x19049c3e632deae8 + , 0x5c9a4e28b7138a89, 0x0f0b7dbc1e5087e2, 0xebf49cdc66a362d2, 0x19e4b815e6576c85, 0x1896051ee3b6063d, 0x09ecc741852a68e4, 0x4009034def986795, 0x36b440ff39b4b5e8, 0x9bc2647ee28af1cb, 0x62613c9dd152b3a8, 0xc2018ae5dfae5f2d, 0x29ce5ef30009c855 + , 0x0b653558b21d2b1c, 0x45e2c505d1f74936, 0x48304373240553d3, 0x0528569885a82310, 0xa90d402e33924181, 0x5e610edc23cb9555, 0x28890ae7e007d28a, 0x7e5132b6b1ebae37, 0x0d5252eb7c94cb1b, 0x308ddaea1fdbb672, 0x99fac0b431730534, 0x77d54ed63b9325b9 + , 0x4d647bcb76c6ec3f, 0x0e968b22ec2cad86, 0x4b22b5ec30b08a35, 0x3b31df3b52326b5c, 0xbe84f638dac3105d, 0x7db085f133ecbed3, 0x7a8b694596f2cf2a, 0x67b2e6c15d16e0aa, 0x4808b20bf173011d, 0x25d5fbbfbe66f864, 0xf67f3f3cd9743987, 0x654250e89617ddf3 + , 0xf5a1a7e0ba0a88c0, 0x3616c781799ab50a, 0x2669c27a2d256902, 0x3a8ec380e12fd7dd, 0xa25361f44a418e30, 0x2942f3001d233645, 0x60f1d3b7535a4133, 0x14deaaa12e5c7bdf, 0x0089fbece10c8d6f, 0x4bf7c313757c803d, 0x65aa30bfbb70567d, 0x4fed47af409a3fb3 + , 0x07557dd875d3daf5, 0x36c49c2380e3c9bb, 0xa21f643d329ae02f, 0x6cf6f7474338bcb0, 0xb5df78136a0f3012, 0x031fb2df2e00e9d4, 0x4d86fccbe75e79cd, 0x23f890e082d03b7d, 0x5716a1ffb50a8262, 0x0199b50aa6cf3302, 0x6a1be351f86090d5, 0x36095efc13349364 + , 0xffe752be8ce46920, 0x65047a340b652f65, 0x320ee55fd03156a6, 0x5af6aa45278409f6, 0xa6caf283b1cf3850, 0x4e3a988f61072f96, 0x750f67926b18f680, 0x09fc3f2927d21a4a, 0x914893c2f2ce1169, 0x4d15b367121b3e75, 0x6cb12559723774f2, 0x3ee5b8c2a70e054a + , 0x7dd9b3518d84d2d7, 0x147d5a5a53f57a58, 0xe1bd0904ad842a05, 0x3a0f3b029c9a5845, 0x7153c03261410074, 0x4e203d6737058c17, 0xebecf5cb79f28af9, 0x574b889870c279f4, 0x326317b005f444a4, 0x7480da44b34f4b1e, 0x7c5f21cdc46275b2, 0x210494b9ee24e4e0 + , 0x3cbf6ca1f4aa4ead, 0x6bf3872ccbfed940, 0x19e8a84673a566ca, 0x61a80e16990401a2, 0xea2e029e7f9b3824, 
0x5762298465f0ebd3, 0xf60e36d4969f9af0, 0x00b826180531c799, 0x17120ec95cf3c61d, 0x47196cd6de85c7d0, 0xb0d47cff46a5cba3, 0x29271400d7ede26b + , 0x835908353516b894, 0x4bc57f8c1eedec8e, 0x2ec5deede5c0db5f, 0x7b9fc48ac4a689fb, 0xf82ce6de88fc10e5, 0x6c5d84a70e03a3d6, 0x88a211fc4ea531f9, 0x7d5583e5918aa03e, 0xbdf2d70766fb8f39, 0x5926497e734ab18a, 0xd6a9872b800cacb4, 0x757c1cd521fd22d6 + , 0x22d50b0c13ec4bc0, 0x288a77d34a15e99a, 0x95c8e78fced3d4eb, 0x45ece109c15be169, 0x878ef262d0132128, 0x48110e9fd98939d6, 0xe3fc5425d2e7741e, 0x050ca6e71f599c65, 0xe02f97605d9fe375, 0x2af48b9bfee410e4, 0xfd34a1c107229a54, 0x43dc6f0cdcbd41fe + , 0x15b4eb7d65cc562b, 0x369a7b0dd3e91248, 0x2b087611edd32810, 0x116b234ddce09d7f, 0xcdb03cae8e90d2b0, 0x4017d51587566038, 0x081793739242b600, 0x5086e8e633cd52a1, 0xf5ddaee155cb8087, 0x773311b60d59a7e9, 0x36e5aa0acadf2068, 0x7126a4281b192882 + , 0x54a10df54f7ecef8, 0x3cd7d2fbb6e33f67, 0xac31eb6c3e740c25, 0x517db54840feb2de, 0xf17cb269b3ce27a2, 0x04a8fecd1dcc99e7, 0xfc887c1f2f85a2da, 0x280da7425bb55b01, 0xa1af72f5256a5a53, 0x71da839fc459f465, 0xc203fe7ba6587f71, 0x08a4201f77a4f335 + , 0x6cb9ea5683014d96, 0x5da17076b6b51ae2, 0xb55ac168c3e3997f, 0x41b9a32373d78f7a, 0x96f58033b8600a50, 0x6ebfba3ec9d956cc, 0x0ff8883707d66d0c, 0x2f562b035445226f, 0x2388fc015bd368c7, 0x2b7d802ce27f627e, 0x301f0369c24083a6, 0x77e139f6da8d5aaa + , 0x9f78574697fce43c, 0x02726c94565421b6, 0x1ad6007338e26585, 0x6134cc5eb35c02ff, 0x77ae739c9cdcd1e1, 0x04e96543233c7a13, 0x97d3926dcded2e10, 0x6bcdff7e14cebb73, 0x9c46ae2b32489774, 0x04a97b9a0473af8d, 0xb0350bd910d9784e, 0x448212d3e2164ad7 + , 0xf3464e0351f5e995, 0x68ab4d24b3ade8d6, 0x86854d534002af20, 0x613f7ffe5de92aeb, 0xb385b4f4608a370a, 0x220dccecbc6f2688, 0xc31ec5384abd3680, 0x25a82841a2000fd8, 0xd19e422504694236, 0x0bc1124d541781f5, 0x0808651edcd99176, 0x41b81f223d429c76 + , 0x1a6dcb2662cc80c6, 0x0b101fb0ef0d1f74, 0x6f02aed8f8327119, 0x5b4c5176ccc4a340, 0x8fcefd200d6ee8ed, 0x0548127287f44749, 0xe1efeca1fadd1341, 0x0e74bc189dc9016c, 0xe90470353f46cb12, 0x69513d3455bc890c, 0x9503686f1f2497d1, 0x280a0bb7733f1086 + , 0x14e5f99930a91dea, 0x7840ad84b03c3878, 0x46e32c654fdbceb1, 0x7e88d2822bb2cecf, 0x4d78a8aed7f8661d, 0x70eb17416ef40180, 0x97b6f1733c474a10, 0x3d0d27fc4c7084ef, 0x730f60f6a1ee0d71, 0x7bf6e3885d3d9302, 0xa1e8af33742f1611, 0x73b798ec129822ed + , 0x0f669bb094642a70, 0x142927de789fc4a4, 0x0db18e01fa98cbd7, 0x6ae4d37674be1451, 0x7175e98f178b4b74, 0x40534e319bc52c6c, 0xb7211d252c4db879, 0x1a7651f8f3ed1aae, 0x9c9a43932d50cc97, 0x630b232b7201c359, 0x327d77575f5b3839, 0x5f0e19e78431864a + , 0xbfbb00b6530a3bb6, 0x19ba9d60d97f7857, 0x759779de744bd764, 0x5facbe63177791e1, 0xc74ea511c56a3b61, 0x1d8909e84083c31d, 0xcd20094b507af492, 0x2ef1b9c07c92ab37, 0x8430ed9ef8494fc9, 0x3f9170e6df5b1fa1, 0x1fb8dbc837175d73, 0x65b961b58008d022 + , 0x7e1afb6816864b6f, 0x54c4b92c534871e9, 0xc0a1dcd60d61ef84, 0x4390f0e992c41298, 0x1e54e2c8b7c27348, 0x7a987e01a2ec308c, 0xee42fbd90c4a89fc, 0x1ed8c77f8d7c609d, 0x569dedaca99a3346, 0x0eb471e609fef4ed, 0xc915522a3b9fd03c, 0x726453b246746bfb + , 0x4ed3cae53dc5fa4b, 0x1bf1e4b34b9feef6, 0x0850df9f0401fac3, 0x0a58d33cb2422e2f, 0x3d197f9603ecfc29, 0x45e46edba1cc432e, 0x96c0c93310d9bcaf, 0x18de3a458be2c33f, 0xc9e65e5bcc12a49a, 0x71a5345f0239b187, 0x53b3b2f01c5710b3, 0x438350f57ce2ec4a + , 0xdbbd368a760391db, 0x4033638dfec29fe2, 0x297ad75ed73117fd, 0x269c08d54b106e8c, 0xa4e3e4fd238b4218, 0x1f48a1cb09208aaa, 0x9575153115cf5fa7, 0x59feeff0876fb74a, 0xfdedb4af6f368710, 0x79be1fe79fa674d4, 0x689d6bbb4c707c39, 0x394a451499057bb1 + , 
0x5887d4fb21fc43b3, 0x37628dfc4b5c23bf, 0xc66b76944b34bd13, 0x6e97f0a8a45bcb36, 0x3ac6b10139edbbdd, 0x313f4846b6745833, 0xf8758d9777cd9037, 0x02fdc98f02692537, 0x9e79f381fff833a5, 0x25ac5d68c49b105c, 0x1e9f48a076d8c9ee, 0x788c85c9fe9543b3 + , 0x776ea51db3b3b778, 0x0007c44055b64db2, 0x3c392c2a82fddd25, 0x65000203be8ee976, 0xea119666ab7c50ab, 0x528b2700e8f82d39, 0xc4aaf797118b8282, 0x55e5a7d5382e0d3a, 0x15a80b22e89f1039, 0x199f68594b1247a0, 0x8d5630750d622435, 0x2687f48cc6def5b2 + , 0xa16b0c0259eafaee, 0x7aeb10834e93595a, 0xe31bcf34ce679d9f, 0x4e2c19829eee3c87, 0xa46869cb8ca35c9d, 0x3cd35313c08504eb, 0xa088eca66e98389c, 0x44c562f0f7262740, 0xd3eb8a28f447523a, 0x43a0e059bfe37576, 0x0312c5d6d0f2e0ad, 0x5f30aaf0d1614c61 + , 0x6f09a7a6e182b0aa, 0x575db3d21a82296b, 0x6599bb5eee7925e6, 0x093f89458dcc2fe3, 0x70c4af785151fc84, 0x1230c0c519de5480, 0x0e66f8f93075a4f6, 0x5de4a122633a5c6d, 0xdb99cf83f9ece1b6, 0x1c3acd4a13ded617, 0x4dfe69e68f59c447, 0x482ba1f7715a3c16 + , 0xefeed2a7c81ea8fd, 0x4e089eeb713a572f, 0x78bc74acfbdf322b, 0x4b4951ce8eb86fbf, 0x0eafb6b46ac6714d, 0x72913ed109f7d404, 0xb498bf6fcde9e3a2, 0x3c08a283ef5ded62, 0x9af09f593a48b346, 0x7ed52441d00d4980, 0xa78e843ee5df44ac, 0x25db12d420a86151 + , 0xec840e7e89d049e0, 0x5a34cbe928bf96cc, 0xd875dc5525da882c, 0x2af4442fc256827d, 0x089fb428c2ef5a5d, 0x0b573ace080a3d9c, 0x6f57282554c240da, 0x425ceda6707b6bc9, 0x94b5a8c3dde824fb, 0x264f6f6a445b5da9, 0xadf292191c5c1eb7, 0x5e302e82fa4e5533 + , 0xf51712fc44237f35, 0x2b0af62c42e56e66, 0x10392cb4d9c71b75, 0x4d7e08fe8457a95b, 0x210b9eceb04534bf, 0x73329d1c7d88e1e5, 0x667a43fdb4ba79e9, 0x3435ec04276ede87, 0x38b8540a1a78b098, 0x4f6c266e6793bb78, 0x447ea35172754041, 0x109d7b742d8c3dac + , 0xe3ccab45d2a4f6f7, 0x59040bb73f3bbd2a, 0x730b39d65645bab5, 0x5c61aed2f83382aa, 0xa992143de3cf83e1, 0x13455cb889b700f9, 0x54648228b310e2f7, 0x5b837752ee0f733a, 0x3923a6c0e5ea0dd9, 0x5ebebd01fc9ca9a2, 0xa34c205b8fd94258, 0x7d1a10029a0b6cd5 + , 0x6c83c02241a46527, 0x4127c85d6be1fc62, 0x26f86ff5ca7240b6, 0x2167391e7dd95cd9, 0x79227506ac78caef, 0x1a2cf919b8832a0f, 0x07745266405cf574, 0x38095a07f5713ae1, 0xe5eeab985ca3e7e7, 0x6a5dd9eeb734d639, 0x991027ebe44a4822, 0x311085fb4de9c1f0 + , 0x33f361e21066c3b5, 0x550091d2dfc8688f, 0x376345c5532bac13, 0x0aa0898f990931b5, 0xea2f3346e5d3226e, 0x208790ab78776afc, 0xac7c2ae63433850c, 0x3c5c373ada10ef52, 0x96c1b4003f4cde6a, 0x4546a9c475c09781, 0x6c961fd3e8536294, 0x43f36e63fc0d5066 + , 0x296601d8c42167f4, 0x241c1fc38565471b, 0xdb00a27e11ce9617, 0x60381181b7e7e4ee, 0xc1076b7635ac4d52, 0x0166010ffb8dda38, 0x5238f69becc43e0b, 0x63303a2015708b17, 0xe8badb2e5bb22591, 0x3a10a4e218b6131d, 0x236ab01aabf1a7b3, 0x1ce8a51a68a4126f + , 0x59e775e2a2a87928, 0x770b48eb4b738301, 0x0b43c2be176bf79b, 0x1957850fb6424660, 0x44455ee1ecb0ab2a, 0x620ceaa116eef4f0, 0x0198f62cb6183f6b, 0x3274f78eaf2d55db, 0xd2ba4e460cf7ed5f, 0x19cfc17bc0b66f43, 0xcbae6f45b1942722, 0x5d93e44739147b58 + , 0xd07180b9d28fc597, 0x35372b21b2ea5a46, 0xed2673477f083464, 0x7a9ebeeecc57e6c2, 0xb51d991a81a6b314, 0x35e7d90f4ed6de58, 0x45f21e209510dd05, 0x446ffd2715c8d380, 0xe69b5c7a9b6d3e76, 0x1379e79fb96912e6, 0xc161c848bd508738, 0x22264a049d8cfff6 + , 0x32321a68ff7ef7b3, 0x57b0e50cc585b333, 0x1c08c65ba9d764e7, 0x5534c793f92f00f5, 0x7a1ced97eafe6fe4, 0x6b8933739202599c, 0x618c5f8fcadd3ff2, 0x2a8719b3e6548653, 0x346a9ec5c4200f0c, 0x7a36b8d00d0eda58, 0x844b22b75021accd, 0x769737059fc5e465 + , 0xdb1ba69b5019f266, 0x1777242305db9ac1, 0x491d11ad264b6ff3, 0x136198dfc57a3053, 0x4a6cc64741eb7176, 0x14e811c97fc97650, 0x6b64667f71be386d, 
0x3286fcadf019eb5e, 0x3f2591f4498e10a0, 0x674fa7c32df7867b, 0xbae8ec7ee100dcf2, 0x03b2c0a20a6372a4 + , 0x4c8d76b471e24474, 0x421fb6a7b8a3216b, 0xc672bdb2fe8f514d, 0x202af653d9aff3f5, 0x05e5f80f9626953e, 0x7b721fa3ccd42ffc, 0x99d8e481c0f70479, 0x054c31746d23362b, 0xfbef2e20430e8025, 0x60e1e3f02e7720c2, 0x161701874eb347e3, 0x363924e90cbb77a6 + , 0x180f5ee1863a1a6a, 0x2f79c0046ff79fe2, 0x44679866e35447f0, 0x1c64c6dd73e0d636, 0x1d8175566341469d, 0x5ba634965b8b9e87, 0x8f48744f976952a5, 0x744f28d23db94c8a, 0xd15e84b1f232da34, 0x556f3d7aa38bee8c, 0x14693c56e866ef89, 0x1564fb9a0f81eb03 + , 0xe97eed56fa2b483f, 0x6d3f7e01aebd1957, 0xae8f128aca3b3e45, 0x3d41e85ba2afd3a9, 0xe4fe485e4b6d8328, 0x65c49b4c3e98098e, 0xe96a00e054d6e91a, 0x394a2122738cd006, 0x715cca3dffd90785, 0x7bc3dcde15890965, 0x6dcdc47a33a148ac, 0x435db9d6dbe1bd55 + , 0xd74d4d6e0fd89c27, 0x25e727f6a5380553, 0xbe54127ba6c5189a, 0x65c87d3c3e61939c, 0xc34a6d122a809e2e, 0x7de6b787f097eafa, 0xb8f8b6e701758661, 0x10705fbf97042046, 0x1591614e6da2d44f, 0x7c74f26ec6eb070f, 0x9ad98c1a50249c60, 0x6e1bbd44d64b2302 + , 0x937cee76047790f9, 0x5b4ccbc70beaf690, 0x332e79ae75ae0dae, 0x2e6394161d093556, 0x4b378bf68f6849f0, 0x6c419fa0cebba72d, 0x8bb431e1e273f2a4, 0x357cec80bbe024fd, 0x83a6e913962f11a9, 0x7808df02e2523718, 0xb6690b5dabc49e13, 0x6cef23259375972a + , 0xd18ac767b5e551fc, 0x5a0ba1dddb15bd36, 0x6f7923de219e3e1f, 0x3ec23588db9b5cfe, 0xa4fc23d42c83bbe0, 0x21581a00768658cd, 0xa295b6e57110218e, 0x3e7bbab1d15f477f, 0x2266c03d3f0d0635, 0x4174f08a95be03b5, 0xaa1a674abb8cbeb8, 0x6bdf6ba553ae3390 + , 0x8a31f824638545e2, 0x2a9e37a0f0eede53, 0x148a53d8cba69f65, 0x64c587e816d96316, 0x777a028a47e97e93, 0x13728e46befb2e0e, 0x13138b44862fa665, 0x0fca8c38a87775f6, 0xcc44bd580dd067fa, 0x40f2f7642e22d02e, 0xab3ba6db80c2f728, 0x5068aa2e2d25b7f9 + , 0x5a8a842c0a2923ff, 0x67c39e8a1006c196, 0x8f5cb9ff55460a84, 0x2e735c20a419a518, 0x0c6ee3fcbfdc2da4, 0x5bf6ed60a87b92bd, 0x5e4ce130e8e1608f, 0x0932ceb3e50028e8, 0x793cf8a0538cbfb8, 0x4e89e2c018beb7bd, 0xcaaa79642f5060de, 0x542a38a4d13f0016 + , 0xa1b0fd9aac663e55, 0x5158bf1f7b33c0e4, 0x060e82f65a4119fe, 0x32347069a1529fc4, 0x5c96ef69127480d5, 0x409a902134df6ffe, 0xdbe8c392eb6c7013, 0x73f2c48b0e3b4a79, 0xddf5060b937e2dff, 0x1534f901278611d9, 0xf47fe29ae4fd49a7, 0x7a2c0bfe75539f29 + , 0x19e04d1b2b0fe7fb, 0x56381ebd8181b50e, 0x5c8970c249df4ac3, 0x08acaece8ede7685, 0xc44f1a71aca0d20b, 0x623edc8d92e4ac3a, 0x5496a7e5885a0c95, 0x20a9ba37315b116e, 0x3765873809f5b55d, 0x23c44c42ebef2ff5, 0x56a96d921f724573, 0x3217815b72b8a9eb + , 0x2cc1b42f5350a489, 0x31f0b36e85b8c70b, 0x504a5c8c4d2ce34d, 0x1af8ea26b3786eac, 0x69bc5e26d7afd62f, 0x21e399d04247bf9a, 0x6e6d6676a88efb27, 0x476212b9fe9a6fd4, 0x0740fb65284168de, 0x5f7570be65e69408, 0x0166c3279dd81c29, 0x6565489007c4ed6d + , 0xbafb5bd37b5219c9, 0x00251709f2e210f7, 0x0d22639b51c1198b, 0x0f3c0df3be3de811, 0x3552612be3374eef, 0x0834744318ffa0aa, 0xcb9f1c1e3557a00c, 0x20c359f5de8b6614, 0xd319482a34d05268, 0x42165771b46b75d7, 0xca336c22e8d911a6, 0x4d072f70067a47e1 + , 0x9022c6f101555e9e, 0x4c8c7eaf7cc2d697, 0x629810b2d8044817, 0x25110bc01b06c9c1, 0x1bf9c06bf39eaff7, 0x6cc36f151f52b4e8, 0x76b73a6a14b62068, 0x47dcb0dc89db3821, 0xfe9dfeac2f670f41, 0x625b5c93b973c417, 0x5f8c917930133c1a, 0x6bd35f3e0992bb2b + , 0x03b5391a85409e5e, 0x7981d8fd16362767, 0xdb45c80a32a23cb6, 0x67356a7ef48b2dc3, 0x6189236e9f01adaf, 0x07a1e954e5032bd6, 0x53d627199c69727e, 0x25d67e4163cec014, 0x18e7bb6a63a80738, 0x3112be4cb5dcbc74, 0xad9ad6d381643f04, 0x116112cbeabb734d + , 0x32623abe2d66ff07, 0x4d780300822436de, 
0x9bed066c04497808, 0x40db29b39ce86700, 0x6e5e5eb3805602a5, 0x52f227f2b1b9b40d, 0x51c2c4c197a18394, 0x6d8bca423ee270bc, 0xd6e60cfe8fb07f72, 0x7dd66c3970f940c6, 0x66aea7b59a0b17cc, 0x75fcf8b00160d729 + , 0xbedc5ea39b2402b5, 0x0dc3600425feedd5, 0xadc1ddf2cb1b6631, 0x205ee93e3aae976a, 0x7a2cb4e333c98498, 0x7d12eb776d56872c, 0x8e339bc1b41599fe, 0x4600f0a53fac9427, 0x1049d3a372f14304, 0x7b54e020b22db742, 0xd567962272a35739, 0x27a1178b1115f0c4 + , 0x6cfb39d619c35e1b, 0x5cb96fd1a9d9d486, 0xaf45cef7fb4fffea, 0x4a73d7b2ba9321d1, 0x44b46b4a80be86ac, 0x2769b50579e8f734, 0xab5d109e7472f372, 0x2bccfba1cbe995b6, 0xc00026115332f6a3, 0x7acb287da1561c53, 0x21555c608cd90dd9, 0x7731d1b2878dae13 + , 0x32122bf5ec1a0649, 0x592b5fa180ec8467, 0x876be1b5ad9ce66f, 0x484c1cc5bb34819d, 0x08e4cc425b30b06c, 0x2766065f0e4d22ce, 0xd90825644987aeff, 0x3a835fcc7fc456a6, 0xf4d801d2cc806d69, 0x41d767ecca55f839, 0xf2dea9fd01f1e74f, 0x74d01b97462211cb + , 0xe43e280ad29f80cc, 0x5cdf66a69029b231, 0xe8d655a03c862cd9, 0x388e38b58d0e8c79, 0x5d9aaa4848ff83a2, 0x14d6fbee4d6cbe74, 0x0426dcda912109ea, 0x1bb7b9cd75d4b541, 0x3a3c0504b39b8505, 0x35a3c5882b31367a, 0x678793d635a6473a, 0x66abca7e20202034 + , 0x4a90ff1dad300021, 0x18f29036544d2684, 0x2036d39b8f69095d, 0x36490f5645d18cc8, 0x9414d7368ad3562e, 0x7f8108a04558487e, 0x93db0e56d653e40b, 0x03f413ea960537bb, 0x984717b77f7267ef, 0x6c5d9da4a5ee7305, 0x725318dc36060a49, 0x274397f8e79a239e + , 0xbda7965b4095bab0, 0x6292b2505c7866e3, 0x451fb6a0672d6733, 0x37c560f40242a859, 0x151e56eb818f1423, 0x63451986f0c22ee1, 0x9275ff873a5c75e1, 0x178cdc734a32b96a, 0xff7adbb24244aacc, 0x76518aa0dfd96ddc, 0x161c1c8c81071219, 0x0584d44c10a3e6dc + , 0x2727282a09e9acab, 0x1298e49c34514ebd, 0x0323d059ca1c0e6d, 0x6072c8b87dd26bc6, 0x36eca2ab28d36f26, 0x2a977cb5aae4ea2a, 0xf157d43a0b9546a7, 0x04d60af0ed661d29, 0x34bc1080126e4402, 0x7677ef9a21589171, 0xbd13797278f07a40, 0x32c0daf0b57f20ac + , 0xbc83fd1b8366dc2e, 0x6cd07286c4e670ec, 0xf35485a3f339dc8a, 0x6e7e9285f2247e8b, 0xa9d19d3a09943bae, 0x43fa5197eed852a6, 0xf911398a043242fe, 0x4a100dcb1312cbe9, 0xbe2fd86be910a692, 0x614fd829368d7937, 0xdb5a98b1a92d578f, 0x46f1d23e1b0dca7e + , 0x8bf4c6725e813f36, 0x68bc89078129ce91, 0xff56503ae28f5c7f, 0x2b6e0f4e42178ce5, 0xa97cd947ec65895b, 0x7aa90b66280ff6c9, 0xebbaf32df158a0a0, 0x6a748d0ac02bb713, 0xdf79b5d619e83397, 0x16934947f6485b69, 0xe75185521ab32881, 0x20791e276a7460c9 + , 0xd25c403e22c70bc9, 0x0bf079518e66e1d3, 0x45dd5c971d3711de, 0x66bd2c6a30be232c, 0x607829e5b29e53ca, 0x30ed414e71dc08a2, 0x3fd38589ea0f1d39, 0x5a881a121f37fc5c, 0x27b9394368987a4f, 0x321fe45e13afae2d, 0xc6feb75080f33ea0, 0x02166d52f45eebbd + , 0x15026a1b0ccd2fc9, 0x1141be93d5bc3d6d, 0xfd20df606fc676c9, 0x4059e26b00ad78c4, 0x0709b409cec6b505, 0x68f020e8acf478e5, 0x875d77d1f5df0cfc, 0x66eb377735162ff1, 0x860482ab417a32ae, 0x21175f47da213935, 0xa07ff0cda099ecdb, 0x26ae5f177ae2b8e7 + , 0xa9a070ea5120eaf7, 0x2581feeba7383f81, 0x49e0f137f1fa2a7a, 0x7fe93c51cfd1ec62, 0x2d74dbdca7777f7e, 0x562da2ba74e823ff, 0x543b4f8609d77a2e, 0x3a0f65212f234ec8, 0xf842e3fea270ebc6, 0x4524322c6a289e11, 0x80815887aa6a8576, 0x46f49d53c3fe29a3 + , 0xbcc93cedfdb0d388, 0x4db312076ef0ad2b, 0x1f2cd56373654ad9, 0x4c6446970034d15f, 0x34d2cdbfd5d7130c, 0x6198950d03db2ae5, 0x736094b72faf1b1a, 0x1f6ca46a9f2588f7, 0xcba0b03d6259772a, 0x24e5a23d8d6be3a8, 0x7090e340c94f6d6f, 0x287ba27ee54e8466 + , 0x87320c8822d607f0, 0x44fd5802509df171, 0xf35c09860bf6ba4a, 0x6cf53130ef77cc0a, 0xaa81167a00b48ce4, 0x649f4c775b0d8b48, 0x59a25683ee98d33d, 0x651479007d1061a6, 0x155487411f6e16da, 
0x411d036475404bf2, 0xc231f1344162458a, 0x4f36b7633f7dd368 + , 0xa98ddc0a4e7a89a4, 0x55d8a5da6eacd542, 0x5c3fb48b1001ed45, 0x5c7785ccafa702b9, 0xa64369fd216afb79, 0x1f405ef10e940669, 0x755f4831bc327b6f, 0x2bc1b67d71f1882d, 0x8eab15cfed7777d0, 0x517370d580d99326, 0x0811b75701c9db39, 0x234d84cb52f7b621 + , 0x970c4fbddddae49c, 0x3ba8d842475e41e1, 0xb0720f6ad75e7008, 0x275cd5c5184bf345, 0x5eb9833888d3796a, 0x1b3a42dfde11c2f3, 0x946548fe092b5f4d, 0x119917b50f263cc9, 0x622de955a20a3f82, 0x6a552ea3a60c7ff4, 0xc79230138150372a, 0x18083b9518de76a7 + , 0x55fb74dd7d3b5455, 0x523eea9a70ff8334, 0x5994a7335e356271, 0x3bb011f60430f1d2, 0x1ec434cba1d6ea7c, 0x69b632960feb5780, 0x46c50417541ebf07, 0x01470bfbf9d23830, 0xe9551f4c049bc5cc, 0x1c124638f35ee8ed, 0x09ca3a9141e83a38, 0x44daaf3e7411127b + , 0x0e54717b6c2fcd10, 0x518ab46b26d5914b, 0x528ac6c82341e833, 0x2247fa99d41f4672, 0xabe30c65c0f327a2, 0x3ac74e012b77e1b4, 0x35defd694c0e86b3, 0x7c382e10bfe60e4e, 0xf37e382996b8461c, 0x4d47481c53631e1a, 0xac8f167884f7b7b1, 0x5ae1bb6ab1a4c643 + , 0x63eb02590829df80, 0x623126862a793fa1, 0x6e1e242f1ce09807, 0x7bf96130aaecfd2b, 0xedc5e9ea10bff70a, 0x66b548233b94d26e, 0x70c70ee4594d30ab, 0x79b0006c8811353e, 0x4352792c91710c1f, 0x0c7bf15181a9f539, 0xfc995ee769e3779c, 0x44871c6cb9dcedcd + , 0x0d180bbf2c9a046b, 0x5445c598c45d0cd9, 0xdefb32386875fb94, 0x5b0d235355660f35, 0xbe1dea825b3a7973, 0x10658ec4e1bbe147, 0x48af5e87fad77504, 0x55f5d3c94a7dd694, 0xa9a3e7062cad6ba2, 0x36c0a7e3f9e0ea31, 0xc4bd65217010aebc, 0x1d031dfc8b9fb598 + , 0xe3621c104113889e, 0x774b77ee1e6a6477, 0x124c5b8a07785fd7, 0x5a6c0df18188cada, 0xf4adcd545e72d7be, 0x38100fffb66ba966, 0x2100cbe35fe4a4d0, 0x4489be2df052c175, 0xa03a22403b26899f, 0x5ae4a0a0fec13928, 0x89dfbfb802795eaa, 0x34917e9c4ecf2532 + , 0x64b93674c60cbbb3, 0x25c098506334c71d, 0x8a723f66f1ee34e1, 0x3a960adf48f141e4, 0x659f386695e440bb, 0x577a0fbf6e8095e6, 0x8ef419b0f4b25496, 0x044176a30b9e465b, 0x7a98705df2013e6f, 0x77d0b2483aa95ce7, 0x309e917b978effd7, 0x08f1e55bfe942c7f + , 0xfc241629b8d613c8, 0x140f2e35cd68949f, 0x38899f6a3ee4f9fa, 0x7abc8ecdd300f3b5, 0xd3dad23505d23eaf, 0x75e73f09376b2c7c, 0x5644a663b60ec5c4, 0x511ade8afe1eaec9, 0xbb005fe4e1abca89, 0x2838de73b0ca1f6c, 0x800a6658b80d28c8, 0x48aaba61c91641ec + , 0x222759cab704d4e2, 0x106dd3c0ce85beca, 0xa1ce1ce341f69d03, 0x1651b210e8e4ee10, 0x47329a5e7133e136, 0x58c02f47dc9367b9, 0x09dcba56947b02af, 0x435c251178125b48, 0xd56979a3f0cd9315, 0x2f02b0a6422afddb, 0x23920f500731f32d, 0x0ab833238232cb5d + , 0xa7b3d1bfb0bb60db, 0x2342c2a03c6eaec2, 0xac5e6e5a14d5282e, 0x5b9a421ddc42a24b, 0x018506414543e056, 0x6d7c377c084954e6, 0x4f8bf71ed3db1ced, 0x5150dbc15ab10979, 0x00b50a1b373a7fbf, 0x140be5c3d3244705, 0x5005bfe96e5b7911, 0x77cea555bb133f3e + , 0x2ab1e1a9d7a973c6, 0x3897ac98314968d3, 0x9e0f74764b23c9c3, 0x2e5ecbbae41997cd, 0x43e2ea5648f12433, 0x3a515a0e4808e69c, 0x17d36c03c36bb343, 0x44cebd053481ce43, 0x89008656c21b0d76, 0x2f8513fcb9009be6, 0x2e223f90208a0e83, 0x3828c2d4efd36a73 + , 0xbf17d64f89a8527d, 0x59ebb42b9656151d, 0x7d7bc7245c7dc5ef, 0x191b682a0cb695ec, 0x8931172fad9f9add, 0x239b6cbbab2ebdcf, 0x76932f9ca7002dd1, 0x0c140548f858d8b5, 0x6c7adfddcf741ea5, 0x3b39c4b9e2e1a567, 0xc5135a25f87436fe, 0x690d8fecb7dd0ae0 + , 0xd782a618ecda10c2, 0x4f2a84b3134cf832, 0x35a81f71bbc955a4, 0x457f88ed64ae6398, 0xc27eb71c31479985, 0x4ae91808569aab32, 0xa5f2e9785a75eb11, 0x619cb199b837ed36, 0x0e7e5912b9484e40, 0x3b5831e87fdbcaf0, 0x49a2779c2d2b039d, 0x3d4b81e07f49061a + , 0xaa119b0fa222b55c, 0x265c1b11b42fd4e2, 0x6b4d28e519dd7637, 0x3d2da7900de5a4b2, 
0x99b06586b5f21d63, 0x4ce62bd9e6a1ee18, 0xb671e753932f8c92, 0x390b7821d0987834, 0x1adf7c73c3f1fc2f, 0x78c636a8514a7af9, 0xaee3b35fe11e7533, 0x7fbd199278f6ffd7 + , 0x41aabbf4363d77de, 0x1b27fdf18b96bf6a, 0xda264a1dff9a981c, 0x36efc08530c0bf9a, 0x5bd8862a5d830854, 0x23d7c905e656e6cb, 0x4523324c5b64fdcf, 0x36627f376238665f, 0x564f53925c6d5ea4, 0x17c7cc86a1913022, 0xf90db52a543b009b, 0x15192dc91f8b994b + , 0x80bfa3c1a79ec6e2, 0x48fca8ea99772ecc, 0xfee6a3b98c0f1824, 0x46a8c75601b81e22, 0x2cb3c402a8895fcc, 0x1d1dff9c04305ce2, 0xc1aefe78e85971d7, 0x79c6a083ab5a80b2, 0x379c7bca5dbf2518, 0x2419358989b3ca02, 0xc9c42c9cfa5f470e, 0x4481c2ce91b14459 + , 0x6b04dea1ea26deca, 0x26ee3ceee0d0a101, 0xe36cc6bcd8fa4f26, 0x4d14709719764fbd, 0xe0572a706f1fef52, 0x0f75fb69a23f2ec1, 0x32ae4b04a864cf3b, 0x0b6373a91b944773, 0x1a8f2bc20bd088af, 0x586b0d5ace401747, 0xa0e6b094a3c51433, 0x1752a123c268c1c7 + , 0x643c2a93b5770ea1, 0x536cb9d1b71eeb43, 0x6bfb0525d0cc6b3f, 0x1f4dcfeec3adefc3, 0x28a0169dd0bf57f0, 0x1336c9aa20a35449, 0xbbcda068703ad7a1, 0x5e33478283c1e03d, 0xf1997733d18fdaf2, 0x789af507a17bb867, 0x79970c14d5695613, 0x79452342e845256f + , 0x6c12f9367a26a018, 0x11beda1c8f9cdfbe, 0x720e6ddf24b30929, 0x7706e91e3e544755, 0x4460381d3a6c9059, 0x7e01916c3678c424, 0x6024355a61d2bb07, 0x68bae01d79c869e2, 0xf21cbcff285df659, 0x02f7ee6aeb57c933, 0xce0f078c17266467, 0x039b4fdb5170a103 + , 0xd5de0fec61a4ae1b, 0x33d37a152a778695, 0xea64e40e6a10ded9, 0x1f1d394373bdb213, 0xf63598b6ef59fd14, 0x57922adc3ae52283, 0xe39a90e18b76f4a1, 0x27f3dbebd98a9dae, 0x18179dd9c03804b3, 0x511d72c1912e2d73, 0x88e1f6d24b2f3225, 0x56009999cdc2997f + , 0xda6df977b7d82fe4, 0x76f746bba63da226, 0x0b5facfc3bf13bd7, 0x4a31eb04f66f0e18, 0x8ace73d5e7cfe28f, 0x19aa731bc30c20b1, 0xa91979fe73400317, 0x6795ce71a09c7c9f, 0x93d55501933700ba, 0x3850eaf08b1fd14d, 0x450c5abc89edca71, 0x1be5db848bdfa5ef + , 0x77667d3f4fcf082b, 0x673b6e6c4824bc45, 0x6f22c12a5fe0ed6d, 0x006ee6722b5dfed1, 0xb47a13c1468d0c62, 0x40564879a378e6e4, 0x0bc6b553a9d3ab58, 0x21761c79e44dfcfd, 0x66f36ed3eb1050fb, 0x2e67df1312dd01d3, 0x48744c4a68dbf2ad, 0x7844962b6d6e039c + , 0xe07b5675d378b65f, 0x336262aa3d2c1df0, 0x320a5667d78c2e2b, 0x4f668dd96dda5e2a, 0xe21556795c7b8470, 0x3061905b2ef82bb1, 0xaee53211472206b6, 0x1f87377fee0d7a39, 0xdac58c52a3b1a0c7, 0x6e3c4ce04f0d7ffd, 0xfdffec45d4a3990f, 0x4b5340f79e2ae2c2 + , 0x0537c8b7b3d1f332, 0x55292744ae35ee1a, 0x42336d0e6d057f1e, 0x5ac40e9e645cb3d7, 0x848f7b7f845e46c7, 0x74bda86736eff150, 0x891acf622baf4f35, 0x14bcef9cf39667bb, 0x9aa1354d9731b9b3, 0x27e855a19295e59f, 0x1a829a8e10662ed0, 0x3bbc43f9ec4437a7 + , 0x8bfa8b1cb1de5341, 0x3432778068d35549, 0xe3d807da41f25a48, 0x1bb6ee1ce2efe552, 0x08d9bded0bd3affc, 0x290f1c5299a917a8, 0xda8dfd79562f8939, 0x1bf7aae68686211f, 0x2ab6daf9bc860765, 0x7bef6e2f0eb58a0b, 0x8746faab7c439b94, 0x017ea87750bb8bda + , 0xf8dfeb22239c9a7c, 0x35cec0d2887b3a13, 0x68aa94ac601f1606, 0x7470553f8ba61767, 0x37894f91c9eac410, 0x55b22aeb8337f732, 0x53f8d90f29a2fe94, 0x0aec068aec69023a, 0x40506162ad6182ee, 0x6a327ff1ac1e5475, 0x968d7095492df3c8, 0x3f93f46195f67521 + , 0x4983bca28970d546, 0x2716b931296b53c2, 0xf42b013266b6f8b3, 0x76f29b084b6a369f, 0x8e28749222216249, 0x4f2fa1d3a6c1acfd, 0x0ee66697eab8f954, 0x37c33e28fec0cce5, 0x7d0419e2bafd1dd1, 0x01f04d4299b94daa, 0x5ec06abbc1e5c7e6, 0x3a24c66060ed72a9 + , 0x0db764e15f960f26, 0x1d5973d5d59f9c3a, 0xf3dc2608dc6d9149, 0x1d80e0461b72f518, 0x2264dccd49c8b09c, 0x1f03e7a246334d5e, 0x2d6e38871b1fc2ad, 0x418588ae4f284bd3, 0x3efb071bafe1afa2, 0x0799ba0c80bdd8dc, 0xa6b273222dcc4a76, 
0x13859f08ac8a4b23 + , 0x0194acc2663c5acb, 0x459fa55bd0bbedf6, 0x1b055550f06f8cc1, 0x09e5fad46599ea75, 0x6b3916ef772958a3, 0x4aaaa5c18093a431, 0x8e1503e36610f594, 0x620ef55048a263b9, 0x5a28963c8cb8ecbc, 0x6aee46b1b740c15a, 0x67e39606f59cfea9, 0x13a579e3777ca8b1 + , 0x45ad92f61cbb8de3, 0x53068a1a42460eab, 0x9b163546de379578, 0x07bf38a7cecd4860, 0xf84c77031d282de1, 0x402aed6399f78ffc, 0xfb83dd20295f6d45, 0x3702e257340d2ecd, 0xb8db2d8b979b97c8, 0x617526d2a50b0c51, 0xd86f6278313017db, 0x2f35eedec55f9d92 + , 0xeecb69493517973b, 0x7a111a74e0baf09a, 0xb82c6da8ec39f63d, 0x4217076312833746, 0x5d36d11f3dda88d9, 0x7baebcb360f2a887, 0x9829b62d093d6cbb, 0x10f17a2f6edf28fd, 0xfe3efa4353f40626, 0x731ca3065c118e34, 0x6185678827960895, 0x07f906a4f4c6355c + , 0x361d9cd10e657142, 0x2b5f5d452dd861ce, 0xa3e01df05d04b69a, 0x533723bc4cfcc0db, 0x820384afa1bbccb5, 0x4e67e941595d8dfd, 0x0f8da50839e13646, 0x6887a0573a596968, 0xe93dd1df5ace7343, 0x0d4076f28ecf96c8, 0x0ba2f854988074c1, 0x5eb2a314a41a40b6 + , 0x49ff6d27a676b27e, 0x15f7ca40acd5114e, 0xc171f9a750d7da95, 0x3bedbe891f79eb5c, 0x5b643bceb83f74ff, 0x088b1af3aa331a4c, 0xde294c7e0a60c4a9, 0x0a0770fc8120b151, 0xf09b757a0c7c1937, 0x34b797c03efd9c88, 0x051e3edb2c28cc49, 0x66db34ec5ac5122c + , 0x95fde0d3d3dc8cbf, 0x797897c8121818cf, 0x1fd46d197710f89d, 0x533a505803f809c5, 0xb60f1c090c9fd211, 0x4a7c3479af5c9d82, 0x4bfc3ffa4c8cf5a5, 0x6949f4a61306821f, 0xd814c949c67abcdc, 0x419a5e33166863c4, 0x9de646f6bd0895e0, 0x497cc1449a54545a + , 0x69eb31247fe126f2, 0x323c83233967f477, 0x52e0db4d3d78127d, 0x42a0e188e7b9380c, 0x3a6b011c46e34e7e, 0x79f4168aa9a0b4aa, 0x94270a25d708fa4d, 0x2bb28618cbc9cdc8, 0x741e46bb04606819, 0x02790c52fb2ce982, 0x6dbb92d0c6d0af10, 0x32aa96ae061e9412 + , 0x1376700c90d98eaa, 0x4d1dfe650c0a7136, 0xb397f8eef89aff20, 0x4836ac4a041bae37, 0xf37c1076a80a02b8, 0x0d063fa2467b3a37, 0x498f2617b56b7e7b, 0x65ef1194db859a5d, 0xd1fe25d5d28ffcb6, 0x228ee6f49459c083, 0x6b7e82b3b009b15b, 0x713b185ef1fccbfc + , 0x552468f1ff60c298, 0x2b7ba65d02519614, 0x8a86ad90ff0816c2, 0x7bf9249284bd02e5, 0x3008c56e474c2d10, 0x171473b77f804540, 0x15fb79d07bdea766, 0x66ac67c7b9b0951f, 0x34bca15bb6d2f652, 0x13c63dd2687d617b, 0xc515ae237715c19c, 0x0e543c6765fbfef2 + , 0x668c80faf156fb5e, 0x1e2e9e3b3d9962b8, 0x89ebaa264394e113, 0x322add21cf1659cf, 0xf9e6e26733619f8e, 0x723bfc8b792147f0, 0x79aef2837d7e092f, 0x1aa61c59290b5011, 0x9955ae576a499cd3, 0x2c3d6e6a5a1ce0da, 0xb864cfa199a8676b, 0x4961a21f1080285f + , 0x828e184adf9d997b, 0x0c84bda97e7ce725, 0xe6974677094cfcc5, 0x4ec8cd773946105b, 0xa48681bcc95fb5c6, 0x6ade87f8f7a5f269, 0x9b97628fdd39c03d, 0x3bde0ee1f19f1842, 0x4ef8c8fb117c0ca1, 0x769bf8f8d07de9bf, 0xc8f5f435b78a57e5, 0x79987aa861bbcf9c + , 0x7f6c557204b02022, 0x119bd819111c69d1, 0xf0c61ef00b3eb70b, 0x4317f0511bfb7b39, 0x36a2b944e84d608e, 0x1c1a3862da3369cb, 0x37dbf471085f1775, 0x3835751e107419ad, 0x04ab0c84bb07a3fe, 0x63758bfbc7df13a0, 0x15ffd20cb554f23e, 0x1ff11c442b1515b7 + , 0x171377f1bf937186, 0x615efe82b83538f8, 0x321e7cfae352a761, 0x7af02427d7241502, 0x86546e47f2cc559f, 0x65a1d8a017659d75, 0xc95d8aa5b8bfdac9, 0x01e887cb68990623, 0xf1f8ee8c466bcc3d, 0x40ce5e4f2ba3908f, 0xd2b81a3480c16b35, 0x51625d3eabf708cd + , 0x44d770a210105739, 0x7f1de74a022958a0, 0xfbe4c91bd1e8f732, 0x204fbacb13586460, 0x97d79097d62e3cf8, 0x541ad5591934b114, 0xfdfb47919c141909, 0x354926e5244fdecf, 0x6291b0a0e2e994b0, 0x2b9a9a69d3a6c3d1, 0x8189be54302371e7, 0x3645c65df1a881cd + , 0xdf0460f445e3877b, 0x7ea384dc52d0d26e, 0x0c2e5f768d46b6b0, 0x1f6e62daa7c5d4e6, 0xf8b026b33b2343ee, 0x2b7183c8767d372c, 
0xbd45d1b6b6731517, 0x4ddb3d287c470d60, 0x1031dba40263ece2, 0x4e737fa0d659045f, 0x8cbc98d07d09b455, 0x34a35128a2bcb7f5 +]; \ No newline at end of file diff --git a/crypto/src/fourq/ops.rs b/crypto/src/fourq/ops.rs new file mode 100644 index 0000000..afde1a3 --- /dev/null +++ b/crypto/src/fourq/ops.rs @@ -0,0 +1,1335 @@ +#![allow(dead_code)] +#![allow(unused_assignments)] +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::{_subborrow_u64, _addcarry_u64}; + +use core::ptr::copy_nonoverlapping; +use crate::{ + fourq::types::{FelmT, F2elmT, PointPrecomp, PointExtprojPrecomp, PointAffine, PointExtproj}, + fourq::consts::{ + FIXED_BASE_TABLE, + MONTGOMERY_SMALL_R_PRIME_0, + MONTGOMERY_SMALL_R_PRIME_1, + MONTGOMERY_SMALL_R_PRIME_2, + MONTGOMERY_SMALL_R_PRIME_3, + CURVE_ORDER, + CURVE_ORDER_3, + CURVE_ORDER_2, + CURVE_ORDER_1, + CURVE_ORDER_0, + PARAMETER_D_F2ELM, + MONTGOMERY_R_PRIME, + ONE, + C_TAU_1, + C_TAU_DUAL_1, + C_PHI_0, + C_PHI_1, + C_PHI_2, + C_PHI_3, + C_PHI_4, + C_PHI_5, + C_PHI_6, + C_PHI_7, + C_PHI_8, + C_PHI_9, ELL_1, ELL_2, ELL_3, ELL_4, B11, B21, B31, B41, B12, B22, B23, B24, B32, B33, B34, B13, B14, B43, B44, B42, C1, C2, C3, C4, DOUBLE_SCALAR_TABLE, PARAMETER_D, C_PSI_2, C_PSI_1, C_PSI_3, C_PSI_4 + }}; + +#[inline(always)] +fn addcarry_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 { + #[cfg(target_arch = "x86_64")] + unsafe { + _addcarry_u64(c_in, a, b, out) + } + + #[cfg(not(target_arch = "x86_64"))] + { + let c_out = a.overflowing_add(b); + let c_out1 = c_out.0.overflowing_add(if c_in != 0 { 1 } else { 0 }); + + *out = c_out1.0; + + (c_out.1 || c_out1.1) as u8 + } +} + +#[inline(always)] +fn subborrow_u64(b_in: u8, a: u64, b: u64, out: &mut u64) -> u8 { + #[cfg(target_arch = "x86_64")] + unsafe { + _subborrow_u64(b_in, a, b, out) + } + + #[cfg(not(target_arch = "x86_64"))] + { + let b_out = a.overflowing_sub(b); + let b_out1 = b_out.0.overflowing_sub(if b_in != 0 { 1 } else { 0 }); + + *out = b_out1.0; + + (b_out.1 || b_out1.1) as u8 + } +} + + +/// Modular correction, a = a mod (2^127-1) +#[inline(always)] +pub fn mod1271(a: &mut FelmT) { + subborrow_u64(subborrow_u64(0, a[0], 0xFFFFFFFFFFFFFFFF, &mut a[0]), a[1], 0x7FFFFFFFFFFFFFFF, &mut a[1]); + let mask = 0u64.wrapping_sub(a[1] >> 63); + addcarry_u64(addcarry_u64(0, a[0], mask, &mut a[0]), a[1], 0x7FFFFFFFFFFFFFFF & mask, &mut a[1]); +} + +/// Field addition, c = a+b mod (2^127-1) +#[inline(always)] +pub fn fpadd1271(a: FelmT, b: FelmT, c: &mut FelmT) { + addcarry_u64(addcarry_u64(0, a[0], b[0], &mut c[0]), a[1], b[1], &mut c[1]); + addcarry_u64(addcarry_u64(0, c[0], c[1] >> 63, &mut c[0]), c[1] & 0x7FFFFFFFFFFFFFFF, 0, &mut c[1]); +} + +/// Field subtraction, c = a-b mod (2^127-1) +#[inline(always)] +pub fn fpsub1271(a: FelmT, b: FelmT, c: &mut FelmT) { + subborrow_u64(subborrow_u64(0, a[0], b[0], &mut c[0]), a[1], b[1], &mut c[1]); + subborrow_u64(subborrow_u64(0, c[0], c[1] >> 63, &mut c[0]), c[1] & 0x7FFFFFFFFFFFFFFF, 0, &mut c[1]); +} + +/// Field negation, a = -a mod (2^127-1) +#[inline(always)] +pub fn fpneg1271(a: &mut FelmT) { + a[0] = !a[0]; + a[1] = 0x7FFFFFFFFFFFFFFF - a[1]; +} + +#[inline(always)] +pub fn _umul128(a: u64, b: u64, hi: &mut u64) -> u64 { + let r = (a as u128) * (b as u128); + *hi = (r >> 64) as u64; + r as u64 +} + +#[inline(always)] +pub fn __shiftleft128(lo: u64, hi: u64, s: u32) -> u64 { + let s = s % 64; + (((lo as u128 | ((hi as u128) << 64)) << s) >> 64) as u64 +} + +#[inline(always)] +pub fn __shiftright128(lo: u64, hi: u64, s: u32) -> u64 { + let s = s % 64; + ((lo as u128 | 
((hi as u128) << 64)) >> s) as u64
+}
+
+/// Field multiplication, c = a*b mod (2^127-1)
+#[inline(always)]
+pub fn fpmul1271(a: FelmT, b: FelmT, c: &mut FelmT) {
+    let (mut tt1, mut tt2, mut tt3) = ([0u64; 2], [0u64; 2], [0u64; 2]);
+    tt1[0] = _umul128(a[0], b[0], &mut tt3[0]);
+    tt2[0] = _umul128(a[0], b[1], &mut tt2[1]);
+    addcarry_u64(addcarry_u64(0, tt2[0], tt3[0], &mut tt2[0]), tt2[1], 0, &mut tt2[1]);
+    tt3[0] = _umul128(a[1], b[0], &mut tt3[1]);
+    addcarry_u64(addcarry_u64(0, tt2[0], tt3[0], &mut tt2[0]), tt2[1], tt3[1], &mut tt2[1]);
+    tt3[0] = _umul128(a[1], b[1], &mut tt3[1]);
+    tt3[1] = __shiftleft128(tt3[0], tt3[1], 1);
+    addcarry_u64(addcarry_u64(0, __shiftright128(tt2[0], tt2[1], 63), tt3[0] << 1, &mut tt3[0]), tt2[1] >> 63, tt3[1], &mut tt3[1]);
+    addcarry_u64(addcarry_u64(0, tt1[0], tt3[0], &mut tt1[0]), tt2[0] & 0x7FFFFFFFFFFFFFFF, tt3[1], &mut tt1[1]);
+    addcarry_u64(addcarry_u64(0, tt1[0], tt1[1] >> 63, &mut c[0]), tt1[1] & 0x7FFFFFFFFFFFFFFF, 0, &mut c[1]);
+}
+
+/// Field squaring, c = a^2 mod (2^127-1)
+#[inline(always)]
+pub fn fpsqr1271(a: FelmT, c: &mut FelmT) {
+    let (mut tt1, mut tt2, mut tt3) = ([0u64; 2], [0u64; 2], [0u64; 2]);
+    tt1[0] = _umul128(a[0], a[0], &mut tt3[0]);
+    tt2[0] = _umul128(a[0], a[1], &mut tt2[1]);
+    addcarry_u64(addcarry_u64(0, tt2[0], tt3[0], &mut tt3[0]), tt2[1], 0, &mut tt3[1]);
+    addcarry_u64(addcarry_u64(0, tt2[0], tt3[0], &mut tt2[0]), tt2[1], tt3[1], &mut tt2[1]);
+    tt3[0] = _umul128(a[1], a[1], &mut tt3[1]);
+    tt3[1] = __shiftleft128(tt3[0], tt3[1], 1);
+    addcarry_u64(addcarry_u64(0, __shiftright128(tt2[0], tt2[1], 63), tt3[0] << 1, &mut tt3[0]), tt2[1] >> 63, tt3[1], &mut tt3[1]);
+    addcarry_u64(addcarry_u64(0, tt1[0], tt3[0], &mut tt1[0]), tt2[0] & 0x7FFFFFFFFFFFFFFF, tt3[1], &mut tt1[1]);
+    addcarry_u64(addcarry_u64(0, tt1[0], tt1[1] >> 63, &mut c[0]), tt1[1] & 0x7FFFFFFFFFFFFFFF, 0, &mut c[1]);
+}
+
+/// Field exponentiation, af = a^(2^125-1) mod (2^127-1)
+#[inline(always)]
+pub fn fpexp1251(a: FelmT, af: &mut FelmT) {
+    let (mut t1, mut t2, mut t3, mut t4, mut t5) = ([0u64; 2], [0u64; 2], [0u64; 2], [0u64; 2], [0u64; 2]);
+
+    fpsqr1271(a, &mut t2);
+    fpmul1271(a, t2, &mut t2);
+    fpsqr1271(t2, &mut t3);
+    fpsqr1271(t3, &mut t3);
+    fpmul1271(t2, t3, &mut t3);
+    fpsqr1271(t3, &mut t4);
+    fpsqr1271(t4, &mut t4);
+    fpsqr1271(t4, &mut t4);
+    fpsqr1271(t4, &mut t4);
+    fpmul1271(t3, t4, &mut t4);
+    fpsqr1271(t4, &mut t5);
+    for _ in 0..7 {
+        fpsqr1271(t5, &mut t5)
+    }
+    fpmul1271(t4, t5, &mut t5);
+    fpsqr1271(t5, &mut t2);
+    for _ in 0..15 {
+        fpsqr1271(t2, &mut t2);
+    }
+    fpmul1271(t5, t2, &mut t2);
+    fpsqr1271(t2, &mut t1);
+    for _ in 0..31 {
+        fpsqr1271(t1, &mut t1)
+    }
+    fpmul1271(t2, t1, &mut t1);
+    for _ in 0..32 {
+        fpsqr1271(t1, &mut t1)
+    }
+    fpmul1271(t1, t2, &mut t1);
+
+    for _ in 0..16 {
+        fpsqr1271(t1, &mut t1)
+    }
+    fpmul1271(t5, t1, &mut t1);
+
+    for _ in 0..8 {
+        fpsqr1271(t1, &mut t1)
+    }
+    fpmul1271(t4, t1, &mut t1);
+    for _ in 0..4 {
+        fpsqr1271(t1, &mut t1)
+    }
+    fpmul1271(t3, t1, &mut t1);
+    fpsqr1271(t1, &mut t1);
+    fpmul1271(a, t1, af);
+}
+
+/// GF(p^2) division by two, a = a/2 mod p
+#[inline(always)]
+pub fn fp2div1271(a: &mut F2elmT) {
+    let mut mask: u64;
+    let mut temp = [0u64; 2];
+
+    mask = 0u64.wrapping_sub(1 & a[0][0]);
+    addcarry_u64(addcarry_u64(0, a[0][0], mask, &mut temp[0]), a[0][1],
mask >> 1, &mut temp[1]); + a[0][0] = __shiftright128(temp[0], temp[1], 1); + a[0][1] = temp[1] >> 1; + + mask = 0u64.wrapping_sub(1 & a[1][0]); + addcarry_u64(addcarry_u64(0, a[1][0], mask, &mut temp[0]), a[1][1], mask >> 1, &mut temp[1]); + a[1][0] = __shiftright128(temp[0], temp[1], 1); + a[1][1] = temp[1] >> 1; +} + +/// GF(p^2) negation, a = -a in GF((2^127-1)^2) +#[inline(always)] +pub fn fp2neg1271(a: &mut F2elmT) { + fpneg1271(&mut a[0]); + fpneg1271(&mut a[1]); +} + +/// GF(p^2) squaring, c = a^2 in GF((2^127-1)^2) +#[inline(always)] +pub fn fp2sqr1271(a: F2elmT, c: &mut F2elmT) { + let (mut t1, mut t2, mut t3) = ([0u64; 2], [0u64; 2], [0u64; 2]); + + fpadd1271(a[0], a[1], &mut t1); // t1 = a0+a1 + fpsub1271(a[0], a[1], &mut t2); // t2 = a0-a1 + fpmul1271(a[0], a[1], &mut t3); // t3 = a0*a1 + fpmul1271(t1, t2, &mut c[0]); // c0 = (a0+a1)(a0-a1) + fpadd1271(t3, t3, &mut c[1]); // c1 = 2a0*a1 +} + +/// GF(p^2) multiplication, c = a*b in GF((2^127-1)^2) +#[inline(always)] +pub fn fp2mul1271(a: F2elmT, b: F2elmT, c: &mut F2elmT) { + let (mut t1, mut t2, mut t3, mut t4) = ([0u64; 2], [0u64; 2], [0u64; 2], [0u64; 2]); + + fpmul1271(a[0], b[0], &mut t1); // t1 = a0*b0 + fpmul1271(a[1], b[1], &mut t2); // t2 = a1*b1 + fpadd1271(a[0], a[1], &mut t3); // t3 = a0+a1 + fpadd1271(b[0], b[1], &mut t4); // t4 = b0+b1 + fpsub1271(t1, t2, &mut c[0]); // c[0] = a0*b0 - a1*b1 + fpmul1271(t3, t4, &mut t3); // t3 = (a0+a1)*(b0+b1) + fpsub1271(t3, t1, &mut t3); // t3 = (a0+a1)*(b0+b1) - a0*b0 + fpsub1271(t3, t2, &mut c[1]); // c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 +} + +/// GF(p^2) addition, c = a+b in GF((2^127-1)^2) +#[inline(always)] +pub fn fp2add1271(a: F2elmT, b: F2elmT, c: &mut F2elmT) { + fpadd1271(a[0], b[0], &mut c[0]); + fpadd1271(a[1], b[1], &mut c[1]); +} + +/// GF(p^2) subtraction, c = a-b in GF((2^127-1)^2) +#[inline(always)] +pub fn fp2sub1271(a: F2elmT, b: F2elmT, c: &mut F2elmT) { + fpsub1271(a[0], b[0], &mut c[0]); + fpsub1271(a[1], b[1], &mut c[1]); +} + +/// GF(p^2) addition followed by subtraction, c = 2a-b in GF((2^127-1)^2) +#[inline(always)] +pub fn fp2addsub1271(mut a: F2elmT, b: F2elmT, c: &mut F2elmT) { + fp2add1271(a, a, &mut a); + fp2sub1271(a, b, c); +} + +/// Table lookup to extract a point represented as (x+y,y-x,2t) corresponding to extended twisted Edwards coordinates (X:Y:Z:T) with Z=1 +#[inline] +pub fn table_lookup_fixed_base(p: &mut PointPrecomp, digit: u64, sign: u64) { + unsafe { + let digit = digit as isize; + if sign != 0 { + p.xy.copy_from_slice(&(*(FIXED_BASE_TABLE.as_ptr() as *const PointPrecomp).offset(digit)).yx); + p.yx.copy_from_slice(&(*(FIXED_BASE_TABLE.as_ptr() as *const PointPrecomp).offset(digit)).xy); + p.t2[0][0] = !(*(FIXED_BASE_TABLE.as_ptr() as *const PointPrecomp).offset(digit)).t2[0][0]; + p.t2[0][1] = 0x7FFFFFFFFFFFFFFF - (*(FIXED_BASE_TABLE.as_ptr() as *const PointPrecomp).offset(digit)).t2[0][1]; + p.t2[1][0] = !(*(FIXED_BASE_TABLE.as_ptr() as *const PointPrecomp).offset(digit)).t2[1][0]; + p.t2[1][1] = 0x7FFFFFFFFFFFFFFF - (*(FIXED_BASE_TABLE.as_ptr() as *const PointPrecomp).offset(digit)).t2[1][1]; + } else { + p.xy.copy_from_slice(&(*(FIXED_BASE_TABLE.as_ptr() as *const PointPrecomp).offset(digit)).xy); + p.yx.copy_from_slice(&(*(FIXED_BASE_TABLE.as_ptr() as *const PointPrecomp).offset(digit)).yx); + p.t2.copy_from_slice(&(*(FIXED_BASE_TABLE.as_ptr() as *const PointPrecomp).offset(digit)).t2); + } + } +} + +#[inline] +pub fn multiply(a: &[u64], b: &[u64], c: &mut [u64]) { + let (mut u, mut v, mut uv) = (0, 0, 0); + c[0] = 
_umul128(a[0], b[0], &mut u); + u = addcarry_u64(0, _umul128(a[0], b[1], &mut uv), u, &mut c[1]) as u64 + uv; + u = addcarry_u64(0, _umul128(a[0], b[2], &mut uv), u, &mut c[2]) as u64 + uv; + c[4] = addcarry_u64(0, _umul128(a[0], b[3], &mut uv), u, &mut c[3]) as u64 + uv; + + u = addcarry_u64(0, c[1], _umul128(a[1], b[0], &mut uv), &mut c[1]) as u64 + uv; + u = addcarry_u64(0, _umul128(a[1], b[1], &mut uv), u, &mut v) as u64 + uv; + u = addcarry_u64(addcarry_u64(0, c[2], v, &mut c[2]), _umul128(a[1], b[2], &mut uv), u, &mut v) as u64 + uv; + c[5] = addcarry_u64(addcarry_u64(0, c[3], v, &mut c[3]), _umul128(a[1], b[3], &mut uv), u, &mut v) as u64 + uv + addcarry_u64(0, c[4], v, &mut c[4]) as u64; + + u = addcarry_u64(0, c[2], _umul128(a[2], b[0], &mut uv), &mut c[2]) as u64 + uv; + u = addcarry_u64(0, _umul128(a[2], b[1], &mut uv), u, &mut v) as u64 + uv; + u = addcarry_u64(addcarry_u64(0, c[3], v, &mut c[3]), _umul128(a[2], b[2], &mut uv), u, &mut v) as u64 + uv; + c[6] = addcarry_u64(addcarry_u64(0, c[4], v, &mut c[4]), _umul128(a[2], b[3], &mut uv), u, &mut v) as u64 + uv + addcarry_u64(0, c[5], v, &mut c[5]) as u64; + + u = addcarry_u64(0, c[3], _umul128(a[3], b[0], &mut uv), &mut c[3]) as u64 + uv; + u = addcarry_u64(0, _umul128(a[3], b[1], &mut uv), u, &mut v) as u64 + uv; + u = addcarry_u64(addcarry_u64(0, c[4], v, &mut c[4]), _umul128(a[3], b[2], &mut uv), u, &mut v) as u64 + uv; + c[7] = addcarry_u64(addcarry_u64(0, c[5], v, &mut c[5]), _umul128(a[3], b[3], &mut uv), u, &mut v) as u64 + uv + addcarry_u64(0, c[6], v, &mut c[6]) as u64; +} + +/// 256-bit Montgomery multiplication modulo the curve order, mc = ma*mb*r' mod order, where ma,mb,mc in [0, order-1] +/// ma, mb and mc are assumed to be in Montgomery representation +/// The Montgomery constant r' = -r^(-1) mod 2^(log_2(r)) is the global value "Montgomery_rprime", where r is the order +#[inline] +pub fn montgomery_multiply_mod_order(ma: &[u64], mb: &[u64], mc: &mut [u64]) { + let mut p = [0u64; 8]; + let mut q = [0u64; 4]; + let mut temp = [0u64; 8]; + + unsafe { + if mb[0] == 1 && !mb[1] != 0 && !mb[2] != 0 && !mb[3] != 0 { + copy_nonoverlapping(ma.as_ptr(), p.as_mut_ptr(), 4); + } else { + multiply(ma, mb, &mut p); + } + + let (mut u, mut v, mut uv) = (0u64, 0u64, 0u64); + + q[0] = _umul128(p[0], MONTGOMERY_SMALL_R_PRIME_0, &mut u); + u = addcarry_u64(0, _umul128(p[0], MONTGOMERY_SMALL_R_PRIME_1, &mut uv), u, &mut q[1]) as u64 + uv; + u = addcarry_u64(0, _umul128(p[0], MONTGOMERY_SMALL_R_PRIME_2, &mut uv), u, &mut q[2]) as u64 + uv; + addcarry_u64(0, p[0].wrapping_mul(MONTGOMERY_SMALL_R_PRIME_3), u, &mut q[3]); + u = addcarry_u64(0, q[1], _umul128(p[1], MONTGOMERY_SMALL_R_PRIME_0, &mut uv), &mut q[1]) as u64 + uv; + u = addcarry_u64(0, _umul128(p[1], MONTGOMERY_SMALL_R_PRIME_1, &mut uv), u, &mut v) as u64 + uv; + addcarry_u64(addcarry_u64(0, q[2], v, &mut q[2]), p[1].wrapping_mul(MONTGOMERY_SMALL_R_PRIME_2), u, &mut v); + addcarry_u64(0, q[3], v, &mut q[3]); + u = addcarry_u64(0, q[2], _umul128(p[2], MONTGOMERY_SMALL_R_PRIME_0, &mut uv), &mut q[2]) as u64 + uv; + addcarry_u64(0, p[2].wrapping_mul(MONTGOMERY_SMALL_R_PRIME_1), u, &mut v); + addcarry_u64(0, q[3], v, &mut q[3]); + addcarry_u64(0, q[3], p[3].wrapping_mul(MONTGOMERY_SMALL_R_PRIME_0), &mut q[3]); + + multiply(&q, &CURVE_ORDER, &mut temp); // temp = Q * r + + let a = addcarry_u64(addcarry_u64(addcarry_u64(addcarry_u64(addcarry_u64(addcarry_u64(addcarry_u64(addcarry_u64(0, p[0], temp[0], &mut temp[0]), p[1], temp[1], &mut temp[1]), p[2], temp[2], &mut temp[2]), p[3], 
temp[3], &mut temp[3]), p[4], temp[4], &mut temp[4]), p[5], temp[5], &mut temp[5]), p[6], temp[6], &mut temp[6]), p[7], temp[7], &mut temp[7]); + let b = subborrow_u64(subborrow_u64(subborrow_u64(subborrow_u64(0, temp[4], CURVE_ORDER_0, &mut mc[0]), temp[5], CURVE_ORDER_1, &mut mc[1]), temp[6], CURVE_ORDER_2, &mut mc[2]), temp[7], CURVE_ORDER_3, &mut mc[3]); + + // temp not correct after addcarry + if a.wrapping_sub(b) != 0 + { + addcarry_u64(addcarry_u64(addcarry_u64(addcarry_u64(0, mc[0], CURVE_ORDER_0, &mut mc[0]), mc[1], CURVE_ORDER_1, &mut mc[1]), mc[2], CURVE_ORDER_2, &mut mc[2]), mc[3], CURVE_ORDER_3, &mut mc[3]); + } + } +} + +/// Normalize a projective point (X1:Y1:Z1), including full reduction +#[inline] +pub fn eccnorm(p: &mut PointExtproj, q: &mut PointAffine) { + let mut t1 = [[0u64; 2]; 2]; + + fpsqr1271(p.z[0], &mut t1[0]); + fpsqr1271(p.z[1], &mut t1[1]); + fpadd1271(t1[0], t1[1], &mut t1[0]); + fpexp1251(t1[0], &mut t1[1]); + fpsqr1271(t1[1], &mut t1[1]); + fpsqr1271(t1[1], &mut t1[1]); + fpmul1271(t1[0], t1[1], &mut t1[0]); + fpneg1271(&mut p.z[1]); + fpmul1271(p.z[0], t1[0], &mut p.z[0]); + fpmul1271(p.z[1], t1[0], &mut p.z[1]); + + fp2mul1271(p.x, p.z, &mut q.x); // X1 = X1/Z1 + fp2mul1271(p.y, p.z, &mut q.y); // Y1 = Y1/Z1 + mod1271(&mut q.x[0]); + mod1271(&mut q.x[1]); + mod1271(&mut q.y[0]); + mod1271(&mut q.y[1]); +} + +/// Conversion from representation (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT), where T = Ta*Tb +#[inline] +pub fn r1_to_r2(p: &PointExtproj, q: &mut PointExtprojPrecomp) { + fp2add1271(p.ta, p.ta, &mut q.t2); // T = 2*Ta + fp2add1271(p.x, p.y, &mut q.xy); // QX = X+Y + fp2sub1271(p.y, p.x, &mut q.yx); // QY = Y-X + fp2mul1271(q.t2, p.tb, &mut q.t2); // T = 2*T + fp2add1271(p.z, p.z, &mut q.z2); // QZ = 2*Z + fp2mul1271(q.t2, PARAMETER_D_F2ELM, &mut q.t2); // QT = 2d*T +} + +/// Conversion from representation (X,Y,Z,Ta,Tb) to (X+Y,Y-X,Z,T), where T = Ta*Tb +#[inline] +pub fn r1_to_r3(p: &PointExtproj, q: &mut PointExtprojPrecomp) { + fp2add1271(p.x, p.y, &mut q.xy); // XQ = (X1+Y1) + fp2sub1271(p.y, p.x, &mut q.yx); // YQ = (Y1-X1) + fp2mul1271(p.ta, p.tb, &mut q.t2); // TQ = T1 + + unsafe { + copy_nonoverlapping(p.z.as_ptr() as *mut u64, q.z2.as_mut_ptr() as *mut u64, 4) // ZQ = Z1 + } +} + +/// Conversion from representation (X+Y,Y-X,2Z,2dT) to (2X,2Y,2Z,2dT) +#[inline] +pub fn r2_to_r4(p: &PointExtprojPrecomp, q: &mut PointExtproj) { + fp2sub1271(p.xy, p.yx, &mut q.x); // XQ = 2*X1 + fp2add1271(p.xy, p.yx, &mut q.y); // YQ = 2*Y1 + + unsafe { + copy_nonoverlapping(p.z2.as_ptr() as *mut u64, q.z.as_mut_ptr() as *mut u64, 4) // ZQ = Z1 + } +} + +// Point doubling 2P +#[inline] +pub fn eccdouble(p: &mut PointExtproj) { + let (mut t1, mut t2) = ([[0u64; 2]; 2], [[0u64; 2]; 2]); + + fp2sqr1271(p.x, &mut t1); // t1 = X1^2 + fp2sqr1271(p.y, &mut t2); // t2 = Y1^2 + fp2add1271(p.x, p.y, &mut p.x); // t3 = X1+Y1 + fp2add1271(t1, t2, &mut p.tb); // Tbfinal = X1^2+Y1^2 + fp2sub1271(t2, t1, &mut t1); // t1 = Y1^2-X1^2 + fp2sqr1271(p.x, &mut p.ta); // Ta = (X1+Y1)^2 + fp2sqr1271(p.z, &mut t2); // t2 = Z1^2 + fp2sub1271(p.ta, p.tb, &mut p.ta); // Tafinal = 2X1*Y1 = (X1+Y1)^2-(X1^2+Y1^2) + + /*fp2add1271(t2, t2, &mut t2); + fp2sub1271(t2, t1, &mut t2);*/ + fp2addsub1271(t2, t1, &mut t2); + + fp2mul1271(t1, p.tb, &mut p.y); // Yfinal = (X1^2+Y1^2)(Y1^2-X1^2) + fp2mul1271(t2, p.ta, &mut p.x); // Xfinal = 2X1*Y1*[2Z1^2-(Y1^2-X1^2)] + fp2mul1271(t1, t2, &mut p.z); // Zfinal = (Y1^2-X1^2)[2Z1^2-(Y1^2-X1^2)] +} + +/// Basic point addition r = P+Q or r = P+P +#[inline] +pub fn 
eccadd_core(p: &PointExtprojPrecomp, q: &PointExtprojPrecomp, r: &mut PointExtproj) { + let (mut t1, mut t2) = ([[0u64; 2]; 2], [[0u64; 2]; 2]); + + fp2mul1271(p.t2, q.t2, &mut r.z); // Z = 2dT1*T2 + fp2mul1271(p.z2, q.z2, &mut t1); // t1 = 2Z1*Z2 + fp2mul1271(p.xy, q.xy, &mut r.x); // X = (X1+Y1)(X2+Y2) + fp2mul1271(p.yx, q.yx, &mut r.y); // Y = (Y1-X1)(Y2-X2) + fp2sub1271(t1, r.z, &mut t2); // t2 = theta + fp2add1271(t1, r.z, &mut t1); // t1 = alpha + fp2sub1271(r.x, r.y, &mut r.tb); // Tbfinal = beta + fp2add1271(r.x, r.y, &mut r.ta); // Tafinal = omega + fp2mul1271(r.tb, t2, &mut r.x); // Xfinal = beta*theta + fp2mul1271(t1, t2, &mut r.z); // Zfinal = theta*alpha + fp2mul1271(r.ta, t1, &mut r.y); // Yfinal = alpha*omega +} + +/// Complete point addition P = P+Q or P = P+P +#[inline] +pub fn eccadd(q: &PointExtprojPrecomp, p: &mut PointExtproj) { + let mut r = PointExtprojPrecomp::default(); + + r1_to_r3(p, &mut r); + eccadd_core(q, &r, p); +} + +/// Point conversion to representation (X,Y,Z,Ta,Tb) +#[inline] +pub fn point_setup(p: &PointAffine, q: &mut PointExtproj) { + unsafe { + copy_nonoverlapping(p.x.as_ptr(), q.x.as_mut_ptr(), 2); + copy_nonoverlapping(p.y.as_ptr(), q.y.as_mut_ptr(), 2); + copy_nonoverlapping(p.x.as_ptr(), q.ta.as_mut_ptr(), 2); + copy_nonoverlapping(p.y.as_ptr(), q.tb.as_mut_ptr(), 2); + + q.z[0][0] = 1; + q.z[0][1] = 0; + q.z[1][0] = 0; + q.z[1][1] = 0; + } +} + +/// Point validation: check if point lies on the curve +#[inline] +pub fn ecc_point_validate(p: &PointExtproj) -> bool { + let (mut t1, mut t2, mut t3) = ([[0u64; 2]; 2], [[0u64; 2]; 2], [[0u64; 2]; 2]); + + fp2sqr1271(p.y, &mut t1); + fp2sqr1271(p.x, &mut t2); + fp2sub1271(t1, t2, &mut t3); // -x^2 + y^2 + fp2mul1271(t1, t2, &mut t1); // x^2*y^2 + fp2mul1271(t1, PARAMETER_D_F2ELM, &mut t2); // dx^2*y^2 + t1[0][0] = 1; + t1[0][1] = 0; + t1[1][0] = 0; + t1[1][1] = 0; // t1 = 1 + fp2add1271(t2, t1, &mut t2); // 1 + dx^2*y^2 + fp2sub1271(t3, t2, &mut t1); // -x^2 + y^2 - 1 - dx^2*y^2 + + ((t1[0][0] | t1[0][1]) == 0 || ((t1[0][0] + 1) | (t1[0][1] + 1)) == 0) && ((t1[1][0] | t1[1][1]) == 0|| ((t1[1][0] + 1) | (t1[1][1] + 1)) == 0) +} + +/// Mixed point addition P = P+Q or P = P+P +#[inline] +pub fn eccmadd(q: &PointPrecomp, p: &mut PointExtproj) { + let (mut t1, mut t2) = ([[0u64; 2]; 2], [[0u64; 2]; 2]); + + fp2mul1271(p.ta, p.tb, &mut p.ta); // Ta = T1 + fp2add1271(p.z, p.z, &mut t1); // t1 = 2Z1 + fp2mul1271(p.ta, q.t2, &mut p.ta); // Ta = 2dT1*t2 + fp2add1271(p.x, p.y, &mut p.z); // Z = (X1+Y1) + fp2sub1271(p.y, p.x, &mut p.tb); // Tb = (Y1-X1) + fp2sub1271(t1, p.ta, &mut t2); // t2 = theta + fp2add1271(t1, p.ta, &mut t1); // t1 = alpha + fp2mul1271(q.xy, p.z, &mut p.ta); // Ta = (X1+Y1)(x2+y2) + fp2mul1271(q.yx, p.tb, &mut p.x); // X = (Y1-X1)(y2-x2) + fp2mul1271(t1, t2, &mut p.z); // Zfinal = theta*alpha + fp2sub1271(p.ta, p.x, &mut p.tb); // Tbfinal = beta + fp2add1271(p.ta, p.x, &mut p.ta); // Tafinal = omega + fp2mul1271(p.tb, t2, &mut p.x); // Xfinal = beta*theta + fp2mul1271(p.ta, t1, &mut p.y); // Yfinal = alpha*omega +} + +/// Fixed-base scalar multiplication Q = k*G, where G is the generator. FIXED_BASE_TABLE stores v*2^(w-1) = 80 multiples of G. 
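+///
+/// Recoding sketch (derived from the code below; the constants match FourQlib's fixed-base
+/// parameters w = 5, v = 5, so the table holds v*2^(w-1) = 80 points):
+/// 1. k is reduced modulo the curve order (two Montgomery multiplications, by MONTGOMERY_R_PRIME
+///    and then by ONE), forced odd by conditionally adding the order, and shifted right by one.
+/// 2. The result is recoded into 250 digits: digits[0..=49] form the sign row (0 = positive,
+///    all-ones mask = negative) and digits[50..250] hold the value bits.
+/// 3. Each lookup below packs one bit from each of four value rows, i.e. for column i the index is
+///    offset + (((digits[i+200] << 1 | digits[i+150]) << 1 | digits[i+100]) << 1 | digits[i+50]),
+///    with the sign taken from digits[i]; offsets 64, 48, 32, 16 and 0 select the five 16-entry
+///    sub-tables, and one doubling is interleaved after every group of five mixed additions.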
+#[inline] +pub fn ecc_mul_fixed(k: &[u64], q: &mut PointAffine) { + let mut digits = [0u64; 250]; + let mut scalar = [0u64; 4]; + + montgomery_multiply_mod_order(k, &MONTGOMERY_R_PRIME, &mut scalar); + let scalar1 = scalar; + montgomery_multiply_mod_order(&scalar1, &ONE, &mut scalar); + + unsafe { + if scalar[0] & 1 == 0 { + let mut carry = addcarry_u64(0, scalar[0], CURVE_ORDER_0, &mut scalar[0]); + carry = addcarry_u64(carry, scalar[1], CURVE_ORDER_1, &mut scalar[1]); + carry = addcarry_u64(carry, scalar[2], CURVE_ORDER_2, &mut scalar[2]); + addcarry_u64(carry, scalar[3], CURVE_ORDER_3, &mut scalar[3]); + } + + scalar[0] = __shiftright128(scalar[0], scalar[1], 1); + scalar[1] = __shiftright128(scalar[1], scalar[2], 1); + scalar[2] = __shiftright128(scalar[2], scalar[3], 1); + scalar[3] >>= 1; + + for digit in digits.iter_mut().take(49) { + *digit = (scalar[0] & 1).wrapping_sub(1); // Convention for the "sign" row: if scalar_(i+1) = 0 then digit_i = -1 (negative), else if scalar_(i+1) = 1 then digit_i = 0 (positive) + + // Shift scalar to the right by 1 + scalar[0] = __shiftright128(scalar[0], scalar[1], 1); + scalar[1] = __shiftright128(scalar[1], scalar[2], 1); + scalar[2] = __shiftright128(scalar[2], scalar[3], 1); + scalar[3] >>= 1; + } + + for i in 50..250 { + digits[i] = scalar[0] & 1; + + // Shift scalar to the right by 1 + scalar[0] = __shiftright128(scalar[0], scalar[1], 1); + scalar[1] = __shiftright128(scalar[1], scalar[2], 1); + scalar[2] = __shiftright128(scalar[2], scalar[3], 1); + scalar[3] >>= 1; + + let temp = (0u64.wrapping_sub(digits[i - (i / 50) * 50])) & digits[i]; + + scalar[0] += temp; + let mut carry = if scalar[0] != 0 { 0 } else { temp & 1}; + scalar[1] += carry; + carry = if scalar[1] != 0 { 0 } else { carry & 1 }; + scalar[2] += carry; + scalar[3] += if scalar[2] != 0 { 0 } else { carry & 1 }; + } + + let mut r = PointExtproj::default(); + let mut s = PointPrecomp::default(); + + table_lookup_fixed_base(&mut s, 64 + (((((digits[249] << 1) + digits[199]) << 1) + digits[149]) << 1) + digits[99], 0); + // Conversion from representation (x+y,y-x,2dt) to (X,Y,Z,Ta,Tb) + fp2sub1271(s.xy, s.yx, &mut r.x); // 2*x1 + fp2add1271(s.xy, s.yx, &mut r.y); // 2*y1 + fp2div1271(&mut r.x); // XQ = x1 + fp2div1271(&mut r.y); // YQ = y1 + r.z[0][0] = 1; + r.z[0][1] = 0; + r.z[1][0] = 0; + r.z[1][1] = 0; // ZQ = 1 + copy_nonoverlapping(r.x.as_ptr(), r.ta.as_mut_ptr(), 2); + copy_nonoverlapping(r.y.as_ptr(), r.tb.as_mut_ptr(), 2); + + + table_lookup_fixed_base(&mut s, 48 + (((((digits[239] << 1) + digits[189]) << 1) + digits[139]) << 1) + digits[89], digits[39]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 32 + (((((digits[229] << 1) + digits[179]) << 1) + digits[129]) << 1) + digits[79], digits[29]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 16 + (((((digits[219] << 1) + digits[169]) << 1) + digits[119]) << 1) + digits[69], digits[19]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, (((((digits[209] << 1) + digits[159]) << 1) + digits[109]) << 1) + digits[59], digits[9]); + eccmadd(&s, &mut r); + + eccdouble(&mut r); + table_lookup_fixed_base(&mut s, 64 + (((((digits[248] << 1) + digits[198]) << 1) + digits[148]) << 1) + digits[98], digits[48]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 48 + (((((digits[238] << 1) + digits[188]) << 1) + digits[138]) << 1) + digits[88], digits[38]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 32 + (((((digits[228] << 1) + digits[178]) << 1) + digits[128]) << 1) + digits[78], 
digits[28]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 16 + (((((digits[218] << 1) + digits[168]) << 1) + digits[118]) << 1) + digits[68], digits[18]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, (((((digits[208] << 1) + digits[158]) << 1) + digits[108]) << 1) + digits[58], digits[8]); + eccmadd(&s, &mut r); + + eccdouble(&mut r); + table_lookup_fixed_base(&mut s, 64 + (((((digits[247] << 1) + digits[197]) << 1) + digits[147]) << 1) + digits[97], digits[47]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 48 + (((((digits[237] << 1) + digits[187]) << 1) + digits[137]) << 1) + digits[87], digits[37]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 32 + (((((digits[227] << 1) + digits[177]) << 1) + digits[127]) << 1) + digits[77], digits[27]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 16 + (((((digits[217] << 1) + digits[167]) << 1) + digits[117]) << 1) + digits[67], digits[17]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, (((((digits[207] << 1) + digits[157]) << 1) + digits[107]) << 1) + digits[57], digits[7]); + eccmadd(&s, &mut r); + + eccdouble(&mut r); + table_lookup_fixed_base(&mut s, 64 + (((((digits[246] << 1) + digits[196]) << 1) + digits[146]) << 1) + digits[96], digits[46]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 48 + (((((digits[236] << 1) + digits[186]) << 1) + digits[136]) << 1) + digits[86], digits[36]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 32 + (((((digits[226] << 1) + digits[176]) << 1) + digits[126]) << 1) + digits[76], digits[26]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 16 + (((((digits[216] << 1) + digits[166]) << 1) + digits[116]) << 1) + digits[66], digits[16]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, (((((digits[206] << 1) + digits[156]) << 1) + digits[106]) << 1) + digits[56], digits[6]); + eccmadd(&s, &mut r); + + eccdouble(&mut r); + table_lookup_fixed_base(&mut s, 64 + (((((digits[245] << 1) + digits[195]) << 1) + digits[145]) << 1) + digits[95], digits[45]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 48 + (((((digits[235] << 1) + digits[185]) << 1) + digits[135]) << 1) + digits[85], digits[35]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 32 + (((((digits[225] << 1) + digits[175]) << 1) + digits[125]) << 1) + digits[75], digits[25]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 16 + (((((digits[215] << 1) + digits[165]) << 1) + digits[115]) << 1) + digits[65], digits[15]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, (((((digits[205] << 1) + digits[155]) << 1) + digits[105]) << 1) + digits[55], digits[5]); + eccmadd(&s, &mut r); + + eccdouble(&mut r); + table_lookup_fixed_base(&mut s, 64 + (((((digits[244] << 1) + digits[194]) << 1) + digits[144]) << 1) + digits[94], digits[44]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 48 + (((((digits[234] << 1) + digits[184]) << 1) + digits[134]) << 1) + digits[84], digits[34]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 32 + (((((digits[224] << 1) + digits[174]) << 1) + digits[124]) << 1) + digits[74], digits[24]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 16 + (((((digits[214] << 1) + digits[164]) << 1) + digits[114]) << 1) + digits[64], digits[14]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, (((((digits[204] << 1) + digits[154]) << 1) + digits[104]) << 1) + digits[54], digits[4]); + eccmadd(&s, &mut r); + + eccdouble(&mut r); + table_lookup_fixed_base(&mut s, 64 + 
(((((digits[243] << 1) + digits[193]) << 1) + digits[143]) << 1) + digits[93], digits[43]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 48 + (((((digits[233] << 1) + digits[183]) << 1) + digits[133]) << 1) + digits[83], digits[33]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 32 + (((((digits[223] << 1) + digits[173]) << 1) + digits[123]) << 1) + digits[73], digits[23]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 16 + (((((digits[213] << 1) + digits[163]) << 1) + digits[113]) << 1) + digits[63], digits[13]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, (((((digits[203] << 1) + digits[153]) << 1) + digits[103]) << 1) + digits[53], digits[3]); + eccmadd(&s, &mut r); + + eccdouble(&mut r); + table_lookup_fixed_base(&mut s, 64 + (((((digits[242] << 1) + digits[192]) << 1) + digits[142]) << 1) + digits[92], digits[42]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 48 + (((((digits[232] << 1) + digits[182]) << 1) + digits[132]) << 1) + digits[82], digits[32]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 32 + (((((digits[222] << 1) + digits[172]) << 1) + digits[122]) << 1) + digits[72], digits[22]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 16 + (((((digits[212] << 1) + digits[162]) << 1) + digits[112]) << 1) + digits[62], digits[12]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, (((((digits[202] << 1) + digits[152]) << 1) + digits[102]) << 1) + digits[52], digits[2]); + eccmadd(&s, &mut r); + + eccdouble(&mut r); + table_lookup_fixed_base(&mut s, 64 + (((((digits[241] << 1) + digits[191]) << 1) + digits[141]) << 1) + digits[91], digits[41]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 48 + (((((digits[231] << 1) + digits[181]) << 1) + digits[131]) << 1) + digits[81], digits[31]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 32 + (((((digits[221] << 1) + digits[171]) << 1) + digits[121]) << 1) + digits[71], digits[21]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 16 + (((((digits[211] << 1) + digits[161]) << 1) + digits[111]) << 1) + digits[61], digits[11]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, (((((digits[201] << 1) + digits[151]) << 1) + digits[101]) << 1) + digits[51], digits[1]); + eccmadd(&s, &mut r); + + eccdouble(&mut r); + table_lookup_fixed_base(&mut s, 64 + (((((digits[240] << 1) + digits[190]) << 1) + digits[140]) << 1) + digits[90], digits[40]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 48 + (((((digits[230] << 1) + digits[180]) << 1) + digits[130]) << 1) + digits[80], digits[30]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 32 + (((((digits[220] << 1) + digits[170]) << 1) + digits[120]) << 1) + digits[70], digits[20]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, 16 + (((((digits[210] << 1) + digits[160]) << 1) + digits[110]) << 1) + digits[60], digits[10]); + eccmadd(&s, &mut r); + table_lookup_fixed_base(&mut s, (((((digits[200] << 1) + digits[150]) << 1) + digits[100]) << 1) + digits[50], digits[0]); + eccmadd(&s, &mut r); + + eccnorm(&mut r, q); + } +} + +pub const fn f2elm_from_array(a: [u64; 4]) -> [[u64; 2]; 2] { + [[a[0], a[1]], [a[2], a[3]]] +} + +/// Apply tau_dual mapping to a point, P = tau_dual(P) +#[inline] +pub fn ecc_tau(p: &mut PointExtproj) { + let (mut t0, mut t1) = ([[0u64; 2]; 2], [[0u64; 2]; 2]); + + fp2sqr1271(p.x, &mut t0); // t0 = X1^2 + fp2sqr1271(p.y, &mut t1); // t1 = Y1^2 + fp2mul1271(p.x, p.y, &mut p.x); // X = X1*Y1 + fp2sqr1271(p.z, &mut p.y); // Y = 
Z1^2 + fp2add1271(t0, t1, &mut p.z); // Z = X1^2+Y1^2 + fp2sub1271(t1, t0, &mut t0); // t0 = Y1^2-X1^2 + fp2add1271(p.y, p.y, &mut p.y); // Y = 2*Z1^2 + fp2mul1271(p.x, t0, &mut p.x); // X = X1*Y1*(Y1^2-X1^2) + fp2sub1271(p.y, t0, &mut p.y); // Y = 2*Z1^2-(Y1^2-X1^2) + fp2mul1271(p.x, f2elm_from_array(C_TAU_1), &mut p.x); // Xfinal = X*ctau1 + fp2mul1271(p.y, p.z, &mut p.y); // Yfinal = Y*Z + fp2mul1271(p.z, t0, &mut p.z); // Zfinal = t0*Z +} + +/// Apply tau_dual mapping to a point, P = tau_dual(P) +#[inline] +pub fn ecc_tau_dual(p: &mut PointExtproj) { + let (mut t0, mut t1) = ([[0u64; 2]; 2], [[0u64; 2]; 2]); + + fp2sqr1271(p.x, &mut t0); // t0 = X1^2 + fp2sqr1271(p.z, &mut p.ta); // Ta = Z1^2 + fp2sqr1271(p.y, &mut t1); // t1 = Y1^2 + fp2add1271(p.ta, p.ta, &mut p.z); // Z = 2*Z1^2 + fp2sub1271(t1, t0, &mut p.ta); // Tafinal = Y1^2-X1^2 + fp2add1271(t0, t1, &mut t0); // t0 = X1^2+Y1^2 + fp2mul1271(p.x, p.y, &mut p.x); // X = X1*Y1 + fp2sub1271(p.z, p.ta, &mut p.z); // Z = 2*Z1^2-(Y1^2-X1^2) + fp2mul1271(p.x, f2elm_from_array(C_TAU_DUAL_1), &mut p.tb); // Tbfinal = ctaudual1*X1*X1 + fp2mul1271(p.z, p.ta, &mut p.y); // Yfinal = Z*Tafinal + fp2mul1271(p.tb, t0, &mut p.x); // Xfinal = Tbfinal*t0 + fp2mul1271(p.z, t0, &mut p.z); // Zfinal = Z*t0 +} + +/// Apply delta_phi_delta mapping to a point, P = delta(phi_W(delta_inv(P))), +/// where phi_W is the endomorphism on the Weierstrass form +#[inline] +pub fn ecc_delphidel(p: &mut PointExtproj) { + let (mut t0, mut t1, mut t2, mut t3, mut t4, mut t5, mut t6) = ([[0u64; 2]; 2], [[0u64; 2]; 2], [[0u64; 2]; 2], [[0u64; 2]; 2], [[0u64; 2]; 2], [[0u64; 2]; 2], [[0u64; 2]; 2]); + + fp2sqr1271(p.z, &mut t4); // t4 = Z1^2 + fp2mul1271(p.y, p.z, &mut t3); // t3 = Y1*Z1 + fp2mul1271(t4, f2elm_from_array(C_PHI_4), &mut t0); // t0 = cphi4*t4 + fp2sqr1271(p.y, &mut t2); // t2 = Y1^2 + fp2add1271(t0, t2, &mut t0); // t0 = t0+t2 + fp2mul1271(t3, f2elm_from_array(C_PHI_3), &mut t1); // t1 = cphi3*t3 + fp2sub1271(t0, t1, &mut t5); // t5 = t0-t1 + fp2add1271(t0, t1, &mut t0); // t0 = t0+t1 + fp2mul1271(t0, p.z, &mut t0); // t0 = t0*Z1 + fp2mul1271(t3, f2elm_from_array(C_PHI_1), &mut t1); // t1 = cphi1*t3 + fp2mul1271(t0, t5, &mut t0); // t0 = t0*t5 + fp2mul1271(t4, f2elm_from_array(C_PHI_2), &mut t5); // t5 = cphi2*t4 + fp2add1271(t2, t5, &mut t5); // t5 = t2+t5 + fp2sub1271(t1, t5, &mut t6); // t6 = t1-t5 + fp2add1271(t1, t5, &mut t1); // t1 = t1+t5 + fp2mul1271(t6, t1, &mut t6); // t6 = t1*t6 + fp2mul1271(t6, f2elm_from_array(C_PHI_0), &mut t6); // t6 = cphi0*t6 + fp2mul1271(p.x, t6, &mut p.x); // X = X1*t6 + fp2sqr1271(t2, &mut t6); // t6 = t2^2 + fp2sqr1271(t3, &mut t2); // t2 = t3^2 + fp2sqr1271(t4, &mut t3); // t3 = t4^2 + fp2mul1271(t2, f2elm_from_array(C_PHI_8), &mut t1); // t1 = cphi8*t2 + fp2mul1271(t3, f2elm_from_array(C_PHI_9), &mut t5); // t5 = cphi9*t3 + fp2add1271(t1, t6, &mut t1); // t1 = t1+t6 + fp2mul1271(t2, f2elm_from_array(C_PHI_6), &mut t2); // t2 = cphi6*t2 + fp2mul1271(t3, f2elm_from_array(C_PHI_7), &mut t3); // t3 = cphi7*t3 + fp2add1271(t1, t5, &mut t1); // t1 = t1+t5 + fp2add1271(t2, t3, &mut t2); // t2 = t2+t3 + fp2mul1271(t1, p.y, &mut t1); // t1 = Y1*t1 + fp2add1271(t6, t2, &mut p.y); // Y = t6+t2 + fp2mul1271(p.x, t1, &mut p.x); // X = X*t1 + fp2mul1271(p.y, f2elm_from_array(C_PHI_5), &mut p.y); // Y = cphi5*Y + fpneg1271(&mut p.x[1]); // Xfinal = X^p + fp2mul1271(p.y, p.z, &mut p.y); // Y = Y*Z1 + fp2mul1271(t0, t1, &mut p.z); // Z = t0*t1 + fp2mul1271(p.y, t0, &mut p.y); // Y = Y*t0 + fpneg1271(&mut p.z[1]); // Zfinal = Z^p + 
fpneg1271(&mut p.y[1]); // Yfinal = Y^p +} + + +/// Apply delta_psi_delta mapping to a point, P = delta(psi_W(delta_inv(P))), +/// where psi_W is the endomorphism on the Weierstrass form +#[inline] +pub fn ecc_delpsidel(p: &mut PointExtproj) { + let (mut t0, mut t1, mut t2) = ([[0u64; 2]; 2], [[0u64; 2]; 2], [[0u64; 2]; 2]); + + fpneg1271(&mut p.x[1]); // X = X1^p + fpneg1271(&mut p.z[1]); // Z = Z1^p + fpneg1271(&mut p.y[1]); // Y = Y1^p + fp2sqr1271(p.z, &mut t2); // t2 = Z1^p^2 + fp2sqr1271(p.x, &mut t0); // t0 = X1^p^2 + fp2mul1271(p.x, t2, &mut p.x); // X = X1^p*Z1^p^2 + fp2mul1271(t2, f2elm_from_array(C_PSI_2), &mut p.z); // Z = cpsi2*Z1^p^2 + fp2mul1271(t2, f2elm_from_array(C_PSI_3), &mut t1); // t1 = cpsi3*Z1^p^2 + fp2mul1271(t2, f2elm_from_array(C_PSI_4), &mut t2); // t2 = cpsi4*Z1^p^2 + fp2add1271(t0, p.z, &mut p.z); // Z = X1^p^2 + cpsi2*Z1^p^2 + fp2add1271(t0, t2, &mut t2); // t2 = X1^p^2 + cpsi4*Z1^p^2 + fp2add1271(t0, t1, &mut t1); // t1 = X1^p^2 + cpsi3*Z1^p^2 + fp2neg1271(&mut t2); // t2 = -(X1^p^2 + cpsi4*Z1^p^2) + fp2mul1271(p.z, p.y, &mut p.z); // Z = Y1^p*(X1^p^2 + cpsi2*Z1^p^2) + fp2mul1271(p.x, t2, &mut p.x); // X = -X1^p*Z1^p^2*(X1^p^2 + cpsi4*Z1^p^2) + fp2mul1271(t1, p.z, &mut p.y); // Yfinal = t1*Z + fp2mul1271(p.x, f2elm_from_array(C_PSI_1), &mut p.x); // Xfinal = cpsi1*X + fp2mul1271(p.z, t2, &mut p.z); // Zfinal = Z*t2 +} + +/// Apply psi mapping to a point, P = psi(P) +#[inline] +pub fn ecc_psi(p: &mut PointExtproj) { + ecc_tau(p); + ecc_delpsidel(p); + ecc_tau_dual(p); +} + +#[inline] +pub fn ecc_phi(p: &mut PointExtproj) { + ecc_tau(p); + ecc_delphidel(p); + ecc_tau_dual(p); +} + +#[inline] +pub fn eccneg_extproj_precomp(p: &PointExtprojPrecomp, q: &mut PointExtprojPrecomp) { + q.t2.copy_from_slice(&p.t2); + q.yx.copy_from_slice(&p.xy); + q.xy.copy_from_slice(&p.yx); + q.z2.copy_from_slice(&p.z2); + fp2neg1271(&mut q.t2); +} + +#[inline] +pub fn eccneg_precomp(p: &PointPrecomp, q: &mut PointPrecomp) { + q.t2 = p.t2; + q.yx = p.xy; + q.xy = p.yx; + fp2neg1271(&mut q.t2); +} + +#[inline] +pub fn mul_truncate(s: &[u64], c: &[u64]) -> u64 { + let (mut t0, mut t1, mut t2, mut t3) = (0u64, 0u64, 0u64, 0u64); + let (mut t4, mut t5, mut t6, mut t7) = (0u64, 0u64, 0u64, 0u64); + let (mut t8, mut t9, mut t10, mut t11) = (0u64, 0u64, 0u64, 0u64); + let (mut t12, mut t13, mut t14, mut t15) = (0u64, 0u64, 0u64, 0u64); + let mut t16 = 0u64; + + let (mut high00, mut low10, mut high10, mut low01) = (0u64, 0u64, 0u64, 0u64); + let (mut high01, mut low20, mut high20, mut low02) = (0u64, 0u64, 0u64, 0u64); + let (mut high02, mut low11, mut high11, mut low03) = (0u64, 0u64, 0u64, 0u64); + let (mut high03, mut low30, mut high30, mut low12) = (0u64, 0u64, 0u64, 0u64); + let mut high12 = 0u64; + let mut high21 = 0u64; + + _umul128(s[0], c[0], &mut high00); + low10 = _umul128(s[1], c[0], &mut high10); + addcarry_u64(addcarry_u64(0, high00, low10, &mut t0), high10, 0, &mut t1); + low01 = _umul128(s[0], c[1], &mut high01); + t2 = addcarry_u64(addcarry_u64(0, t0, low01, &mut t0), t1, high01, &mut t3) as u64; + low20 = _umul128(s[2], c[0], &mut high20); + addcarry_u64(addcarry_u64(0, t3, low20, &mut t4), t2, high20, &mut t5); + low02 = _umul128(s[0], c[2], &mut high02); + t6 = addcarry_u64(addcarry_u64(0, t4, low02, &mut t7), t5, high02, &mut t8) as u64; + low11 = _umul128(s[1], c[1], &mut high11); + t9 = addcarry_u64(addcarry_u64(0, t7, low11, &mut t0), t8, high11, &mut t10) as u64; + low03 = _umul128(s[0], c[3], &mut high03); + addcarry_u64(addcarry_u64(0, t10, low03, &mut t11), t6 + 
t9, high03, &mut t12); + low30 = _umul128(s[3], c[0], &mut high30); + addcarry_u64(addcarry_u64(0, t11, low30, &mut t13), t12, high30, &mut t14); + low12 = _umul128(s[1], c[2], &mut high12); + addcarry_u64(addcarry_u64(0, t13, low12, &mut t15), t14, high12, &mut t16); + + addcarry_u64(0, t15, _umul128(s[2], c[1], &mut high21), &mut t0) as u64 + t16 + high21 + s[1] * c[3] + s[2] * c[2] + s[3] * c[1] +} + + +/// Scalar decomposition for the variable-base scalar multiplication +#[inline] +pub fn decompose(k: &[u64], scalars: &mut [u64]) { + let a1 = mul_truncate(k, &ELL_1); + let a2 = mul_truncate(k, &ELL_2); + let a3 = mul_truncate(k, &ELL_3); + let a4 = mul_truncate(k, &ELL_4); + + scalars[0] = a1 * B11 + a2 * B21 + a3 * B31 + a4 * B41 + C1 + k[0]; + scalars[1] = a1 * B12 + a2 * B22 + a3 * B32 + a4 * B42 + C2; + scalars[2] = a1 * B13 + a2 * B23 + a3 * B33 + a4 * B43 + C3; + scalars[3] = a1 * B14 + a2 * B24 + a3 * B34 + a4 * B44 + C4; + + if scalars[0] & 1 == 0 { + scalars[0] -= B41; + scalars[1] -= B42; + scalars[2] -= B43; + scalars[3] -= B44; + } +} + +/// Computes wNAF recoding of a scalar, where digits are in set {0,+-1,+-3,...,+-(2^(w-1)-1)} +#[inline] +pub fn w_naf_recode(mut scalar: u64, w: u64, digits: &mut [i8]) { + let val1 = (1 << (w - 1)) - 1; + let val2 = 1 << w; + + let mask = val2 as u64 - 1; + let mut index = 0; + + while scalar != 0 { + let mut digit = (scalar & 1) as i32; + + if digit == 0 { + scalar >>= 1; + digits[index] = 0; + } else { + digit = (scalar & mask) as i32; + scalar >>= w; + + if digit > val1 { + digit -= val2; + } + + if digit < 0 { + scalar += 1; + } + + digits[index] = digit as i8; + if scalar != 0 { + for _ in 0..(w-1) { + index += 1; + digits[index] = 0; + } + } + } + + index += 1; + } +} + +/// Generation of the precomputation table used internally by the double scalar multiplication function ecc_mul_double() +#[inline] +pub fn ecc_precomp_double(p: &mut PointExtproj, table: &mut [PointExtprojPrecomp]) { + let mut q = PointExtproj::default(); + let mut pp = PointExtprojPrecomp::default(); + + r1_to_r2(p, &mut table[0]); + eccdouble(p); + r1_to_r3(p, &mut pp); + + eccadd_core(&table[0], &pp, &mut q); + r1_to_r2(&q, &mut table[1]); + + eccadd_core(&table[1], &pp, &mut q); + r1_to_r2(&q, &mut table[2]); + + eccadd_core(&table[2], &pp, &mut q); + r1_to_r2(&q, &mut table[3]); +} + +/// Double scalar multiplication R = k*G + l*Q, where the G is the generator +/// Uses DOUBLE_SCALAR_TABLE, which contains multiples of G, Phi(G), Psi(G) and Phi(Psi(G)) +/// The function uses wNAF with interleaving. 
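+///
+/// Interleaving sketch (derived from the code below): both k and l are split into four sub-scalars
+/// by `decompose`. The k sub-scalars are recoded with window width 8 and indexed into the
+/// precomputed DOUBLE_SCALAR_TABLE (four groups of 64 points, one group per sub-scalar), while the
+/// l sub-scalars are recoded with window width 4 and use the runtime tables built by
+/// `ecc_precomp_double` from Q, phi(Q), psi(Q) and psi(phi(Q)). The main loop performs one doubling
+/// per digit position and conditionally adds (or, for negative wNAF digits, subtracts via the
+/// negated precomputed point) one entry from each of the eight tables.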
+#[inline] +pub fn ecc_mul_double(k: &mut [u64], l: &mut [u64], q: &mut PointAffine) -> bool { + let (mut digits_k1, mut digits_k2, mut digits_k3, mut digits_k4) = ([0i8; 65], [0i8; 65], [0i8; 65], [0i8; 65]); + let (mut digits_l1, mut digits_l2, mut digits_l3, mut digits_l4) = ([0i8; 65], [0i8; 65], [0i8; 65], [0i8; 65]); + let mut v = PointPrecomp::default(); + let (mut q1, mut q2, mut q3, mut q4, mut t) = (PointExtproj::default(), PointExtproj::default(), PointExtproj::default(), PointExtproj::default(), PointExtproj::default()); + let mut u = PointExtprojPrecomp::default(); + let (mut q_table1, mut q_table2, mut q_table3, mut q_table4) = ([PointExtprojPrecomp::default(); 4], [PointExtprojPrecomp::default(); 4], [PointExtprojPrecomp::default(); 4], [PointExtprojPrecomp::default(); 4]); + let mut k_scalars = [0u64; 4]; + let mut l_scalars = [0u64; 4]; + + point_setup(q, &mut q1); + + if !ecc_point_validate(&q1) { + return false; + } + + q2 = q1; + ecc_phi(&mut q2); + q3 = q1; + ecc_psi(&mut q3); + q4 = q2; + ecc_psi(&mut q4); + + decompose(k, &mut k_scalars); + decompose(l, &mut l_scalars); + w_naf_recode(k_scalars[0], 8, &mut digits_k1); // Scalar recoding + w_naf_recode(k_scalars[1], 8, &mut digits_k2); + w_naf_recode(k_scalars[2], 8, &mut digits_k3); + w_naf_recode(k_scalars[3], 8, &mut digits_k4); + w_naf_recode(l_scalars[0], 4, &mut digits_l1); + w_naf_recode(l_scalars[1], 4, &mut digits_l2); + w_naf_recode(l_scalars[2], 4, &mut digits_l3); + w_naf_recode(l_scalars[3], 4, &mut digits_l4); + + ecc_precomp_double(&mut q1, &mut q_table1); + ecc_precomp_double(&mut q2, &mut q_table2); + ecc_precomp_double(&mut q3, &mut q_table3); + ecc_precomp_double(&mut q4, &mut q_table4); + + t.x[0][0] = 0; t.x[0][1] = 0; t.x[1][0] = 0; t.x[1][1] = 0; // Initialize T as the neutral point (0:1:1) + t.y[0][0] = 1; t.y[0][1] = 0; t.y[1][0] = 0; t.y[1][1] = 0; + t.z[0][0] = 1; t.z[0][1] = 0; t.z[1][0] = 0; t.z[1][1] = 0; + + for i in (0..=64).rev() { + eccdouble(&mut t); + + if digits_l1[i] < 0 { + eccneg_extproj_precomp(&q_table1[((-digits_l1[i]) >> 1) as usize], &mut u); + eccadd(&u, &mut t); + } + else if digits_l1[i] > 0 { + eccadd(&q_table1[((digits_l1[i]) >> 1) as usize], &mut t); + } + + if digits_l2[i] < 0 { + eccneg_extproj_precomp(&q_table2[((-digits_l2[i]) >> 1) as usize], &mut u); + eccadd(&u, &mut t); + } + else if digits_l2[i] > 0 { + eccadd(&q_table2[((digits_l2[i]) >> 1) as usize], &mut t); + } + + if digits_l3[i] < 0 { + eccneg_extproj_precomp(&q_table3[((-digits_l3[i]) >> 1) as usize], &mut u); + eccadd(&u, &mut t); + } + else if digits_l3[i] > 0 { + eccadd(&q_table3[((digits_l3[i]) >> 1) as usize], &mut t); + } + + if digits_l4[i] < 0 { + eccneg_extproj_precomp(&q_table4[((-digits_l4[i]) >> 1) as usize], &mut u); + eccadd(&u, &mut t); + } + else if digits_l4[i] > 0 { + eccadd(&q_table4[((digits_l4[i]) >> 1) as usize], &mut t); + } + + + unsafe { + + if digits_k1[i] < 0 { + eccneg_precomp(&*(DOUBLE_SCALAR_TABLE.as_ptr() as *const PointPrecomp).offset(((-digits_k1[i]) >> 1) as isize), &mut v); + eccmadd(&v, &mut t); + } else if digits_k1[i] > 0 { + eccmadd(&*(DOUBLE_SCALAR_TABLE.as_ptr() as *const PointPrecomp).offset(((digits_k1[i]) >> 1) as isize), &mut t); + } + + if digits_k2[i] < 0 { + eccneg_precomp(&*(DOUBLE_SCALAR_TABLE.as_ptr() as *const PointPrecomp).offset(64 + ((-digits_k2[i]) >> 1) as isize), &mut v); + eccmadd(&v, &mut t); + } else if digits_k2[i] > 0 { + eccmadd(&*(DOUBLE_SCALAR_TABLE.as_ptr() as *const PointPrecomp).offset(64 + ((digits_k2[i]) >> 1) as isize), &mut t); + 
} + + if digits_k3[i] < 0 { + eccneg_precomp(&*(DOUBLE_SCALAR_TABLE.as_ptr() as *const PointPrecomp).offset(2 * 64 + ((-digits_k3[i]) >> 1) as isize), &mut v); + eccmadd(&v, &mut t); + } else if digits_k3[i] > 0 { + eccmadd(&*(DOUBLE_SCALAR_TABLE.as_ptr() as *const PointPrecomp).offset(2* 64 + ((digits_k3[i]) >> 1) as isize), &mut t); + } + + if digits_k4[i] < 0 { + eccneg_precomp(&*(DOUBLE_SCALAR_TABLE.as_ptr() as *const PointPrecomp).offset(3 * 64 + ((-digits_k4[i]) >> 1) as isize), &mut v); + eccmadd(&v, &mut t); + } else if digits_k4[i] > 0 { + eccmadd(&*(DOUBLE_SCALAR_TABLE.as_ptr() as *const PointPrecomp).offset(3 * 64 + ((digits_k4[i]) >> 1) as isize ), &mut t); + } + } + } + + eccnorm(&mut t, q); + + true +} + +/// Generation of the precomputation table used by the variable-base scalar multiplication ecc_mul() +#[inline] +pub fn ecc_precomp(p: &mut PointExtproj, t: &mut [PointExtprojPrecomp]) { + let (mut q, mut r, mut s) = (PointExtprojPrecomp::default(), PointExtprojPrecomp::default(), PointExtprojPrecomp::default()); + let mut pp = *p; + + ecc_phi(&mut pp); + r1_to_r3(&pp, &mut q); + + ecc_psi(&mut pp); + r1_to_r3(&pp, &mut s); + + r1_to_r2(p, &mut t[0]); + + ecc_psi(p); + r1_to_r3(p, &mut r); + + eccadd_core(&t[0], &q, &mut pp); // T[1] = P+Q using the representations (X,Y,Z,Ta,Tb) <- (X+Y,Y-X,2Z,2dT) + (X+Y,Y-X,Z,T) + r1_to_r2(&pp, &mut t[1]); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT) + eccadd_core(&t[0], &r, &mut pp); // T[2] = P+r + r1_to_r2(&pp, &mut t[2]); + eccadd_core(&t[1], &r, &mut pp); // T[3] = P+Q+r + r1_to_r2(&pp, &mut t[3]); + eccadd_core(&t[0], &s, &mut pp); // T[4] = P+S + r1_to_r2(&pp, &mut t[4]); + eccadd_core(&t[1], &s, &mut pp); // T[5] = P+Q+S + r1_to_r2(&pp, &mut t[5]); + eccadd_core(&t[2], &s, &mut pp); // T[6] = P+r+S + r1_to_r2(&pp, &mut t[6]); + eccadd_core(&t[3], &s, &mut pp); // T[7] = P+Q+r+S + r1_to_r2(&pp, &mut t[7]); +} + +/// Co-factor clearing +#[inline] +pub fn cofactor_clearing(r: &mut PointExtproj) { + let mut q = PointExtprojPrecomp::default(); + + r1_to_r2(r, &mut q); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT) + eccdouble(r); // P = 2*P using representations (X,Y,Z,Ta,Tb) <- 2*(X,Y,Z) + eccadd(&q, r); // P = P+Q using representations (X,Y,Z,Ta,Tb) <- (X,Y,Z,Ta,Tb) + (X+Y,Y-X,2Z,2dT) + eccdouble(r); + eccdouble(r); + eccdouble(r); + eccdouble(r); + eccadd(&q, r); + eccdouble(r); + eccdouble(r); + eccdouble(r); +} + +#[inline] +pub fn ecc_mul(p: &mut PointAffine, k: &[u64], q: &mut PointAffine) -> bool { + let mut r = PointExtproj::default(); + let mut table = [[PointExtprojPrecomp::default(); 8]; 2]; + let mut scalars = [0u64; 4]; + let mut digits = [0u64; 64]; + let mut sign_masks = [0u64; 64]; + + point_setup(p, &mut r); + + if !ecc_point_validate(&r) { + return false; + } + + decompose(k, &mut scalars); + + cofactor_clearing(&mut r); + + + for i in 0..64 { + scalars[0] >>= 1; + let bit0 = scalars[0] & 1; + sign_masks[i] = bit0; + + digits[i] = scalars[1] & 1; + scalars[1] = (scalars[1] >> 1) + ((bit0 | digits[i]) ^ bit0); + + let mut bit = scalars[2] & 1; + scalars[2] = (scalars[2] >> 1) + ((bit0 | bit) ^ bit0); + digits[i] += bit << 1; + + bit = scalars[3] & 1; + scalars[3] = (scalars[3] >> 1) + ((bit0 | bit) ^ bit0); + digits[i] += bit << 2; + } + + ecc_precomp(&mut r, &mut table[1]); + + for i in 0..8 { + table[0][i].xy = table[1][i].yx; + table[0][i].yx = table[1][i].xy; + table[0][i].t2 = table[1][i].t2; + table[0][i].z2 = table[1][i].z2; + fp2neg1271(&mut table[0][i].t2); + } + + r2_to_r4(&table[1][(scalars[1] 
+ (scalars[2] << 1) + (scalars[3] << 2)) as usize], &mut r); + + + for i in (0..64).rev() { + eccdouble(&mut r); + eccadd(&table[sign_masks[i] as usize][digits[i] as usize], &mut r); + } + eccnorm(&mut r, q); + + true +} + + +/// Encode point P +#[inline] +pub fn encode(p: &mut PointAffine, pencoded: &mut [u8]) { + let temp1 = (p.x[1][1] & 0x4000000000000000) << 1; + let temp2 = (p.x[0][1] & 0x4000000000000000) << 1; + + unsafe { + copy_nonoverlapping(p.y.as_ptr() as *const u8, pencoded.as_mut_ptr(), 32); + + if p.x[0][0] == 0 && p.x[0][1] == 0 { + let bytes = temp1.to_le_bytes(); + for i in 0..8 { + pencoded[3*8 + i] |= bytes[i]; + } + } else { + let bytes = temp2.to_le_bytes(); + for i in 0..8 { + pencoded[3*8 + i] |= bytes[i]; + } + } + } +} + + +#[inline] +pub fn decode(pencoded: &[u8], p: &mut PointAffine) -> bool { + let (mut r, mut t, mut t0, mut t1, mut t2, mut t3, mut t4) = ([0u64; 2], [0u64; 2], [0u64; 2], [0u64; 2], [0u64; 2], [0u64; 2], [0u64; 2]); + let (mut u, mut v) = ([[0u64; 2]; 2], [[0u64; 2]; 2]); + let mut r_l = PointExtproj::default(); + + unsafe { + copy_nonoverlapping(pencoded.as_ptr(), p.y.as_mut_ptr() as *mut u8, 32); + p.y[1][1] &= 0x7FFFFFFFFFFFFFFF; + + fp2sqr1271(p.y, &mut u); + fp2mul1271(u, f2elm_from_array(PARAMETER_D), &mut v); + fp2sub1271(u, f2elm_from_array(ONE), &mut u); + fp2add1271(v, f2elm_from_array(ONE), &mut v); + + fpsqr1271(v[0], &mut t0); // t0 = v0^2 + fpsqr1271(v[1], &mut t1); // t1 = v1^2 + fpadd1271(t0, t1, &mut t0); // t0 = t0+t1 + fpmul1271(u[0], v[0], &mut t1); // t1 = u0*v0 + fpmul1271(u[1], v[1], &mut t2); // t2 = u1*v1 + fpadd1271(t1, t2, &mut t1); // t1 = t1+t2 + fpmul1271(u[1], v[0], &mut t2); // t2 = u1*v0 + fpmul1271(u[0], v[1], &mut t3); // t3 = u0*v1 + fpsub1271(t2, t3, &mut t2); // t2 = t2-t3 + fpsqr1271(t1, &mut t3); // t3 = t1^2 + fpsqr1271(t2, &mut t4); // t4 = t2^2 + fpadd1271(t3, t4, &mut t3); // t3 = t3+t4 + + for _ in 0..125 { + fpsqr1271(t3, &mut t3); + } + + fpadd1271(t1, t3, &mut t); + mod1271(&mut t); + + if t[0] == 0 && t[1] == 0 { + fpsub1271(t, t, &mut t); + } + + fpadd1271(t, t, &mut t); // t = 2*t + fpsqr1271(t0, &mut t3); // t3 = t0^2 + fpmul1271(t0, t3, &mut t3); // t3 = t3*t0 + fpmul1271(t, t3, &mut t3); // t3 = t3*t + fpexp1251(t3, &mut r); // r = t3^(2^125-1) + fpmul1271(t0, r, &mut t3); // t3 = t0*r + fpmul1271(t, t3, &mut p.x[0]); // x0 = t*t3 + fpsqr1271(p.x[0], &mut t1); + fpmul1271(t0, t1, &mut t1); // t1 = t0*x0^2 + + let mut temp = [0u64; 2]; + let mask = 0 - (1 & p.x[0][0]); + addcarry_u64(addcarry_u64(0, p.x[0][0], mask, &mut temp[0]), p.x[0][1], mask >> 1, &mut temp[1]); + p.x[0][0] = __shiftright128(temp[0], temp[1], 1); + p.x[0][1] = temp[1] >> 1; + + fpmul1271(t2, t3, &mut p.x[1]); + + fpsub1271(t, t1, &mut t); + mod1271(&mut t); + + if t[0] != 0 || t[1] != 0 { + t0[0] = p.x[0][0]; + t0[1] = p.x[0][1]; + p.x[0][0] = p.x[1][0]; + p.x[0][1] = p.x[1][1]; + p.x[1][0] = t0[0]; + p.x[1][1] = t0[1]; + } + + mod1271(&mut p.x[0]); + if pencoded[31] >> 7 != (p.x[if p.x[0][0] == 0 && p.x[0][1] == 0 { 1 } else { 0 }][1] >> 62) as u8 { + fp2neg1271(&mut p.x); + } + + /*let (a, b) = (p.x[0], p.x[1]); + p.x[1] = a; + p.x[0] = b;*/ + + point_setup(p, &mut r_l); + + if !ecc_point_validate(&r_l) { + fpneg1271(&mut r_l.x[1]); + p.x[1][0] = r_l.x[1][0]; + p.x[1][1] = r_l.x[1][1]; + + if !ecc_point_validate(&r_l) { + return false; + } + } + } + + true +} \ No newline at end of file diff --git a/crypto/src/fourq/types.rs b/crypto/src/fourq/types.rs new file mode 100644 index 0000000..6ac6c6d --- /dev/null +++ 
b/crypto/src/fourq/types.rs @@ -0,0 +1,47 @@
+#![allow(dead_code)]
+/// Datatype for representing 128-bit field elements
+pub type FelmT = [u64; 2];
+
+/// Datatype for representing quadratic extension field elements
+pub type F2elmT = [FelmT; 2];
+
+/// Point representation in affine coordinates
+#[derive(Debug, Clone, Copy, Default)]
+#[repr(C)]
+pub struct PointAffine {
+    pub x: F2elmT,
+    pub y: F2elmT
+}
+
+
+#[derive(Debug, Clone, Copy, Default)]
+#[repr(C)]
+pub struct PointExtproj {
+    pub x: F2elmT,
+    pub y: F2elmT,
+    pub z: F2elmT,
+    pub ta: F2elmT,
+    pub tb: F2elmT
+}
+
+
+/// Point representation in extended coordinates (for precomputed points)
+#[derive(Debug, Clone, Copy, Default)]
+#[repr(C)]
+pub struct PointExtprojPrecomp {
+    pub xy: F2elmT,
+    pub yx: F2elmT,
+    pub z2: F2elmT,
+    pub t2: F2elmT
+}
+
+/// Point representation in extended affine coordinates (for precomputed points)
+#[derive(Debug, Clone, Copy, Default)]
+#[repr(C)]
+pub struct PointPrecomp {
+    pub xy: F2elmT,
+    pub yx: F2elmT,
+    pub t2: F2elmT
+}
+
+pub type PointPrecompT = [PointPrecomp; 1];
\ No newline at end of file
diff --git a/crypto/src/lib.rs b/crypto/src/lib.rs
index 0001f6d..851b859 100644
--- a/crypto/src/lib.rs
+++ b/crypto/src/lib.rs
@@ -1,38 +1,156 @@
-extern crate libc;
-extern crate core;
+#![feature(ascii_char)]
+#![feature(ascii_char_variants)]
+
+mod fourq;
+
+const A_LOWERCASE_ASCII: u8 = 97u8;
+
 //#[cfg(feature = "hash")]
 pub mod hash {
     use sodiumoxide::hex::encode;
-    extern {
-        fn KangarooTwelve(input: *const u8, inputByteLen: u32, output: *mut u8, outputByteLen: u32);
-    }
+    use tiny_keccak::{Hasher, KangarooTwelve};
+
     pub fn k12(input: &str) -> String {
-        let mut output: [u8; 32] = [0; 32];
-        unsafe { KangarooTwelve(input.as_ptr(), input.len() as u32, output.as_mut_ptr(), 32); }
-        let val = encode(output);
+        let ret_val = k12_bytes(&input.as_bytes().to_vec());
+        let val = encode(ret_val);
         return val;
     }
     pub fn k12_bytes(input: &Vec<u8>) -> Vec<u8> {
-        let mut output: [u8; 32] = [0; 32];
-        unsafe { KangarooTwelve(input.as_ptr(), input.len() as u32, output.as_mut_ptr(), 32); }
-        return output.to_vec();
+        let mut digest = [0; 32];
+        let mut kangaroo = KangarooTwelve::new(b"");
+        kangaroo.update(input.as_slice());
+        kangaroo.finalize(&mut digest);
+        return Vec::from(digest);
     }
-
     #[cfg(test)]
     pub mod kangaroo12_tests {
-        use crate::hash::{k12, k12_bytes};
+        use crate::hash::k12;
         #[test]
         fn hash_a_value() {
            let value = k12("inputText");
            assert_eq!(value, "2459b095c4d5b1759a14f5e4924f26a813c020979fab5ef2cad7321af37808d3".to_string())
         }
+    }
+}
+
+
+pub mod qubic_identities {
+    use crate::{A_LOWERCASE_ASCII, hash};
+    use hash::k12_bytes;
+    use crate::fourq::ops::{ecc_mul_fixed, encode};
+    use crate::fourq::types::{PointAffine};
+
+    // fn getPublicKey(privateKey: *const u8, publicKey: *mut u8);
+    // fn getIdentity(publicKey: *const u8, identity: *const u8, isLowerCase: bool);
+
+    pub fn get_subseed(seed: &str) -> Result<Vec<u8>, String> {
+        let mut seed_bytes: [u8; 55] = [0; 55];
+        if seed.len() != 55 {
+            return Err(String::from("Invalid Seed Length!"))
+        }
+        for (index, el) in &mut seed.chars().enumerate() {
+            if !el.is_alphabetic() {
+                return Err(String::from("Invalid Seed!"));
+            }
+            seed_bytes[index] = el.to_ascii_lowercase() as u8 - A_LOWERCASE_ASCII;
+
+        }
+        Ok(k12_bytes(&seed_bytes.to_vec()))
+    }
+    pub fn get_private_key(subseed: &Vec<u8>) -> Vec<u8> {
+        k12_bytes(subseed)
+    }
+    /*
+    pub fn get_public_key(sk: &Vec<u8>) -> Vec<u8> {
+        println!("Got : {:?}", &sk);
+        let mut p = PointAffine::default();
+        let private_key =
sk.as_slice().chunks_exact(8).map(|c| u64::from_le_bytes(c.try_into().unwrap())).collect::<Vec<u64>>();
+        println!("{:?}", &private_key);
+        ecc_mul_fixed(&private_key, &mut p);
+        let mut pk: [u8; 60] = [0; 60];
+        encode(&mut p, &mut pk);
+        pk.to_vec()
+    }
+    */
+    pub fn get_public_key(private_key: &Vec<u8>) -> [u8; 32] {
+        let mut ret_val: [u8; 32] = [0; 32];
+        let mut p = PointAffine::default();
+        let private_key = private_key.chunks_exact(8).map(|c| u64::from_le_bytes(c.try_into().unwrap())).collect::<Vec<u64>>();
+        ecc_mul_fixed(&private_key, &mut p);
+        encode(&mut p, &mut ret_val);
+        ret_val
+    }
+
+
+    pub fn get_identity(public_key: &[u8; 32]) -> String {
+        let mut identity = [0u8; 60];
+        for i in 0..4 {
+            let mut public_key_fragment = u64::from_le_bytes(public_key[i << 3..(i << 3) + 8].try_into().unwrap());
+            for j in 0..14 {
+                identity[i * 14 + j] = (public_key_fragment % 26) as u8 + b'A';
+                public_key_fragment /= 26;
+            }
+        }
+        let mut identity_bytes_checksum = [0u8; 3];
+        let bytes: Vec<u8> = k12_bytes(&public_key.to_vec());
+        identity_bytes_checksum[0] = bytes[0];
+        identity_bytes_checksum[1] = bytes[1];
+        identity_bytes_checksum[2] = bytes[2];
+        let mut identity_bytes_checksum = identity_bytes_checksum[0] as u64 | (identity_bytes_checksum[1] as u64) << 8 | (identity_bytes_checksum[2] as u64) << 16;
+        identity_bytes_checksum &= 0x3FFFF;
+        for i in 0..4 {
+            identity[56 + i] = (identity_bytes_checksum % 26) as u8 + b'A';
+            identity_bytes_checksum /= 26;
+        }
+
+        String::from_utf8(identity.to_vec()).unwrap()
+    }
+
+    #[cfg(test)]
+    pub mod qubic_identity_primitive_tests {
+        use crate::qubic_identities::{get_identity, get_private_key, get_public_key, get_subseed};
+        #[test]
+        fn get_a_subseed() {
+            let seed = "lcehvbvddggkjfnokduyjuiyvkklrvrmsaozwbvjlzvgvfipqpnkkuf";
+            let subseed = get_subseed(seed).unwrap();
+            let encoded = sodiumoxide::hex::encode(subseed);
+            assert_eq!(encoded, "d3420abb5f3e0527b588b361fa0a513335833af8b4a4aae23a2958195c3209dc".to_string())
+        }
+        #[test]
+        fn get_a_private_key() {
+            let seed = "lcehvbvddggkjfnokduyjuiyvkklrvrmsaozwbvjlzvgvfipqpnkkuf";
+            let subseed = get_subseed(seed).unwrap();
+            let private_key = get_private_key(&subseed);
+            let encoded = sodiumoxide::hex::encode(private_key);
+            assert_eq!(encoded, "11531fcea5e11a4a384e211165ff8bcf458595b32c5374ec76cfa1b1da102238".to_string())
+        }
+        #[test]
+        fn get_a_public_key() {
+            let seed = "lcehvbvddggkjfnokduyjuiyvkklrvrmsaozwbvjlzvgvfipqpnkkuf";
+            let subseed = get_subseed(seed).unwrap();
+            let private_key = get_private_key(&subseed);
+            let public_key = get_public_key(&private_key);
+            let encoded = sodiumoxide::hex::encode(public_key);
+            assert_eq!(encoded, "aa873e4cfd37e4bf528a2aa01eecef36547c99caaabd1bbdf7253a65b041771a".to_string())
+        }
+        #[test]
+        fn get_an_identity() {
+            let seed = "lcehvbvddggkjfnokduyjuiyvkklrvrmsaozwbvjlzvgvfipqpnkkuf";
+            let subseed = get_subseed(seed).unwrap();
+            let private_key = get_private_key(&subseed);
+            let public_key = get_public_key(&private_key);
+            let identity = get_identity(&public_key);
+            assert_eq!(identity, "EPYWDREDNLHXOFYVGQUKPHJGOMPBSLDDGZDPKVQUMFXAIQYMZGEHPZTAAWON".to_string())
+        }
+    }
+}
+
+
 //#[cfg(feature = "random")]
 pub mod random {
     use sodiumoxide::randombytes::randombytes;
diff --git a/ffi-deps/FourQlib/FourQ_32bit/FourQ.h b/ffi-deps/FourQlib/FourQ_32bit/FourQ.h
deleted file mode 100644
index 4ae7f85..0000000
--- a/ffi-deps/FourQlib/FourQ_32bit/FourQ.h
+++ /dev/null
@@ -1,173 +0,0 @@
-/***********************************************************************************
-* FourQlib: a
high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: main header file -* -* This code is based on the paper "FourQ: four-dimensional decompositions on a -* Q-curve over the Mersenne prime" by Craig Costello and Patrick Longa, in Advances -* in Cryptology - ASIACRYPT, 2015. -* Preprint available at http://eprint.iacr.org/2015/565. -************************************************************************************/ - -#ifndef __FOURQ_H__ -#define __FOURQ_H__ - - -// For C++ -#ifdef __cplusplus -extern "C" { -#endif - - -#include -#include -#include - - -// Definition of operating system - -#define OS_WIN 1 -#define OS_LINUX 2 - -#if defined(__WINDOWS__) // Microsoft Windows OS - #define OS_TARGET OS_WIN -#elif defined(__LINUX__) // Linux OS - #define OS_TARGET OS_LINUX -#else - #error -- "Unsupported OS" -#endif - - -// Definition of compiler - -#define COMPILER_VC 1 -#define COMPILER_GCC 2 -#define COMPILER_CLANG 3 - -#if defined(_MSC_VER) // Microsoft Visual C compiler - #define COMPILER COMPILER_VC -#elif defined(__GNUC__) // GNU GCC compiler - #define COMPILER COMPILER_GCC -#elif defined(__clang__) // Clang compiler - #define COMPILER COMPILER_CLANG -#else - #error -- "Unsupported COMPILER" -#endif - - -// Definition of the targeted architecture and basic data types - -#define TARGET_x86 1 -#define TARGET_ARM 2 - -#if defined(_X86_) - #define TARGET TARGET_x86 - #define RADIX 32 - typedef uint32_t digit_t; // Unsigned 32-bit digit - typedef int32_t sdigit_t; // Signed 32-bit digit - #define NWORDS_FIELD 4 - #define NWORDS_ORDER 8 -#elif defined(_ARM_) - #define TARGET TARGET_ARM - #define RADIX 32 - typedef uint32_t digit_t; // Unsigned 32-bit digit - typedef int32_t sdigit_t; // Signed 32-bit digit - #define NWORDS_FIELD 4 - #define NWORDS_ORDER 8 -#else - #error -- "Unsupported ARCHITECTURE" -#endif - - -// Constants - -#define RADIX64 64 -#define NWORDS64_FIELD 2 // Number of 64-bit words of a field element -#define NWORDS64_ORDER 4 // Number of 64-bit words of an element in Z_r - - -// Definition of complementary cryptographic functions - -#define RandomBytesFunction random_bytes -//#define CryptoHashFunction KangarooTwelve // Use SHA-512 by default - -// Cache memory support - -#if defined(_NO_CACHE_MEM_) - #define NO_CACHE_MEM -#endif - - -// Basic parameters for variable-base scalar multiplication (without using endomorphisms) -#define W_VARBASE 5 -#define NBITS_ORDER_PLUS_ONE 246+1 - - -// Basic parameters for fixed-base scalar multiplication -#define W_FIXEDBASE 5 // Memory requirement: 7.5KB (storage for 80 points). -#define V_FIXEDBASE 5 - -// Basic parameters for double scalar multiplication -#define WP_DOUBLEBASE 8 // Memory requirement: 24KB (storage for 256 points). -#define WQ_DOUBLEBASE 4 - - -// FourQ's basic element definitions and point representations - -typedef digit_t felm_t[NWORDS_FIELD]; // Datatype for representing 128-bit field elements -typedef felm_t f2elm_t[2]; // Datatype for representing quadratic extension field elements - -typedef struct { f2elm_t x; f2elm_t y; } point_affine; // Point representation in affine coordinates. 
-typedef point_affine point_t[1]; - - -// FourQ's element and point definitions using a redundant representation - -#define VWORDS_FIELD 5 // Number of 32-bit words representing a field element - -typedef uint32_t velm_t[VWORDS_FIELD]; // Datatype for representing 128-bit field elements -typedef uint32_t v2elm_t[2*VWORDS_FIELD]; // Datatype for representing quadratic extension field elements - -typedef struct { v2elm_t x; v2elm_t y; } vpoint_affine; // Point representation in affine coordinates. -typedef vpoint_affine vpoint_t[1]; - - -// Definitions of the error-handling type and error codes - -typedef enum { - ECCRYPTO_ERROR, // 0x00 - ECCRYPTO_SUCCESS, // 0x01 - ECCRYPTO_ERROR_DURING_TEST, // 0x02 - ECCRYPTO_ERROR_UNKNOWN, // 0x03 - ECCRYPTO_ERROR_NOT_IMPLEMENTED, // 0x04 - ECCRYPTO_ERROR_NO_MEMORY, // 0x05 - ECCRYPTO_ERROR_INVALID_PARAMETER, // 0x06 - ECCRYPTO_ERROR_SHARED_KEY, // 0x07 - ECCRYPTO_ERROR_SIGNATURE_VERIFICATION, // 0x08 - ECCRYPTO_ERROR_END_OF_LIST -} ECCRYPTO_STATUS; - -#define ECCRYPTO_STATUS_TYPE_SIZE (ECCRYPTO_ERROR_END_OF_LIST) - - -// Error message definitions - -#define ECCRYPTO_MSG_ERROR "ECCRYPTO_ERROR" -#define ECCRYPTO_MSG_SUCCESS "ECCRYPTO_SUCCESS" -#define ECCRYPTO_MSG_ERROR_DURING_TEST "ECCRYPTO_ERROR_DURING_TEST" -#define ECCRYPTO_MSG_ERROR_UNKNOWN "ECCRYPTO_ERROR_UNKNOWN" -#define ECCRYPTO_MSG_ERROR_NOT_IMPLEMENTED "ECCRYPTO_ERROR_NOT_IMPLEMENTED" -#define ECCRYPTO_MSG_ERROR_NO_MEMORY "ECCRYPTO_ERROR_NO_MEMORY" -#define ECCRYPTO_MSG_ERROR_INVALID_PARAMETER "ECCRYPTO_ERROR_INVALID_PARAMETER" -#define ECCRYPTO_MSG_ERROR_SHARED_KEY "ECCRYPTO_ERROR_SHARED_KEY" -#define ECCRYPTO_MSG_ERROR_SIGNATURE_VERIFICATION "ECCRYPTO_ERROR_SIGNATURE_VERIFICATION" - - -#ifdef __cplusplus -} -#endif - - -#endif diff --git a/ffi-deps/FourQlib/FourQ_32bit/FourQ_api.h b/ffi-deps/FourQlib/FourQ_32bit/FourQ_api.h deleted file mode 100644 index de9bc63..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/FourQ_api.h +++ /dev/null @@ -1,115 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: API header file -* -* This code is based on the paper "FourQ: four-dimensional decompositions on a -* Q-curve over the Mersenne prime" by Craig Costello and Patrick Longa, in Advances -* in Cryptology - ASIACRYPT, 2015. -* Preprint available at http://eprint.iacr.org/2015/565. -************************************************************************************/ - -#ifndef __FOURQ_API_H__ -#define __FOURQ_API_H__ - - -// For C++ -#ifdef __cplusplus -extern "C" { -#endif - - -#include "FourQ.h" - - -/**************** Public ECC API ****************/ - -// Set generator G = (x,y) -void eccset(point_t G); - -// Variable-base scalar multiplication Q = k*P -bool ecc_mul(point_t P, digit_t* k, point_t Q, bool clear_cofactor); - -// Fixed-base scalar multiplication Q = k*G, where G is the generator -bool ecc_mul_fixed(digit_t* k, point_t Q); - -// Double scalar multiplication R = k*G + l*Q, where G is the generator -bool ecc_mul_double(digit_t* k, point_t Q, digit_t* l, point_t R); - - -/**************** Public API for SchnorrQ ****************/ - -// SchnorrQ public key generation -// It produces a public key PublicKey, which is the encoding of P = s*G, where G is the generator and -// s is the output of hashing SecretKey and taking the least significant 32 bytes of the result. 
-// Input: 32-byte SecretKey -// Output: 32-byte PublicKey -ECCRYPTO_STATUS SchnorrQ_KeyGeneration(const unsigned char* SecretKey, unsigned char* PublicKey); - -// SchnorrQ keypair generation -// It produces a private key SecretKey and computes the public key PublicKey, which is the encoding of P = s*G, -// where G is the generator and s is the output of hashing SecretKey and taking the least significant 32 bytes of the result. -// Outputs: 32-byte SecretKey and 32-byte PublicKey -ECCRYPTO_STATUS SchnorrQ_FullKeyGeneration(unsigned char* SecretKey, unsigned char* PublicKey); - -// SchnorrQ signature generation -// It produces the signature Signature of a message Message of size SizeMessage in bytes -// Inputs: 32-byte SecretKey, 32-byte PublicKey, and Message of size SizeMessage in bytes -// Output: 64-byte Signature -ECCRYPTO_STATUS SchnorrQ_Sign(const unsigned char* SecretKey, const unsigned char* PublicKey, const unsigned char* Message, const unsigned int SizeMessage, unsigned char* Signature); - -// SchnorrQ signature verification -// It verifies the signature Signature of a message Message of size SizeMessage in bytes -// Inputs: 32-byte PublicKey, 64-byte Signature, and Message of size SizeMessage in bytes -// Output: true (valid signature) or false (invalid signature) -ECCRYPTO_STATUS SchnorrQ_Verify(const unsigned char* PublicKey, const unsigned char* Message, const unsigned int SizeMessage, const unsigned char* Signature, unsigned int* valid); - - -/**************** Public API for co-factor ECDH key exchange with compressed, 32-byte public keys ****************/ - -// Compressed public key generation for key exchange -// It produces a public key PublicKey, which is the encoding of P = SecretKey*G (G is the generator). -// Input: 32-byte SecretKey -// Output: 32-byte PublicKey -ECCRYPTO_STATUS CompressedPublicKeyGeneration(const unsigned char* SecretKey, unsigned char* PublicKey); - -// Keypair generation for key exchange. Public key is compressed to 32 bytes -// It produces a private key SecretKey and a public key PublicKey, which is the encoding of P = SecretKey*G (G is the generator). -// Outputs: 32-byte SecretKey and 32-byte PublicKey -ECCRYPTO_STATUS CompressedKeyGeneration(unsigned char* SecretKey, unsigned char* PublicKey); - -// Secret agreement computation for key exchange using a compressed, 32-byte public key -// The output is the y-coordinate of SecretKey*A, where A is the decoding of the public key PublicKey. -// Inputs: 32-byte SecretKey and 32-byte PublicKey -// Output: 32-byte SharedSecret -ECCRYPTO_STATUS CompressedSecretAgreement(const unsigned char* SecretKey, const unsigned char* PublicKey, unsigned char* SharedSecret); - - -/**************** Public API for co-factor ECDH key exchange with uncompressed, 64-byte public keys ****************/ - -// Public key generation for key exchange -// It produces the public key PublicKey = SecretKey*G, where G is the generator. -// Input: 32-byte SecretKey -// Output: 64-byte PublicKey -ECCRYPTO_STATUS PublicKeyGeneration(const unsigned char* SecretKey, unsigned char* PublicKey); - -// Keypair generation for key exchange -// It produces a private key SecretKey and computes the public key PublicKey = SecretKey*G, where G is the generator. -// Outputs: 32-byte SecretKey and 64-byte PublicKey -ECCRYPTO_STATUS KeyGeneration(unsigned char* SecretKey, unsigned char* PublicKey); - -// Secret agreement computation for key exchange -// The output is the y-coordinate of SecretKey*PublicKey. 
-// Inputs: 32-byte SecretKey and 64-byte PublicKey -// Output: 32-byte SharedSecret -ECCRYPTO_STATUS SecretAgreement(const unsigned char* SecretKey, const unsigned char* PublicKey, unsigned char* SharedSecret); - - -#ifdef __cplusplus -} -#endif - - -#endif diff --git a/ffi-deps/FourQlib/FourQ_32bit/FourQ_internal.h b/ffi-deps/FourQlib/FourQ_32bit/FourQ_internal.h deleted file mode 100644 index 6f79048..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/FourQ_internal.h +++ /dev/null @@ -1,354 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: internal header file -* -* This code is based on the paper "FourQ: four-dimensional decompositions on a -* Q-curve over the Mersenne prime" by Craig Costello and Patrick Longa, in Advances -* in Cryptology - ASIACRYPT, 2015. -* Preprint available at http://eprint.iacr.org/2015/565. -************************************************************************************/ - -#ifndef __FOURQ_INTERNAL_H__ -#define __FOURQ_INTERNAL_H__ - - -// For C++ -#ifdef __cplusplus -extern "C" { -#endif - - -#include "FourQ_api.h" - - -// Extended datatype definition - -typedef uint64_t uint128_t[2]; - - -// Basic parameters for variable-base scalar multiplication (without using endomorphisms) -#define NPOINTS_VARBASE (1 << (W_VARBASE-2)) -#define t_VARBASE ((NBITS_ORDER_PLUS_ONE+W_VARBASE-2)/(W_VARBASE-1)) - - -// Basic parameters for fixed-base scalar multiplication -#define E_FIXEDBASE (NBITS_ORDER_PLUS_ONE + W_FIXEDBASE*V_FIXEDBASE - 1)/(W_FIXEDBASE*V_FIXEDBASE) -#define D_FIXEDBASE E_FIXEDBASE*V_FIXEDBASE -#define L_FIXEDBASE D_FIXEDBASE*W_FIXEDBASE -#define NPOINTS_FIXEDBASE V_FIXEDBASE*(1 << (W_FIXEDBASE-1)) -#define VPOINTS_FIXEDBASE (1 << (W_FIXEDBASE-1)) -#if (NBITS_ORDER_PLUS_ONE-L_FIXEDBASE == 0) // This parameter selection is not supported - #error -- "Unsupported parameter selection for fixed-base scalar multiplication" -#endif - - -// Basic parameters for double scalar multiplication -#define NPOINTS_DOUBLEMUL_WP (1 << (WP_DOUBLEBASE-2)) -#define NPOINTS_DOUBLEMUL_WQ (1 << (WQ_DOUBLEBASE-2)) - - -// FourQ's point representations - -typedef struct { f2elm_t x; f2elm_t y; f2elm_t z; f2elm_t ta; f2elm_t tb; } point_extproj; // Point representation in extended coordinates. -typedef point_extproj point_extproj_t[1]; -typedef struct { f2elm_t xy; f2elm_t yx; f2elm_t z2; f2elm_t t2; } point_extproj_precomp; // Point representation in extended coordinates (for precomputed points). -typedef point_extproj_precomp point_extproj_precomp_t[1]; -typedef struct { f2elm_t xy; f2elm_t yx; f2elm_t t2; } point_precomp; // Point representation in extended affine coordinates (for precomputed points). -typedef point_precomp point_precomp_t[1]; - - -// FourQ's point formats using a redundant representation - -typedef struct { v2elm_t x; v2elm_t y; v2elm_t z; v2elm_t ta; v2elm_t tb; } vpoint_extproj; // Point representation in extended coordinates. -typedef vpoint_extproj vpoint_extproj_t[1]; -typedef struct { v2elm_t xy; v2elm_t yx; v2elm_t z2; v2elm_t t2; } vpoint_extproj_precomp; // Point representation in extended coordinates (for precomputed points). -typedef vpoint_extproj_precomp vpoint_extproj_precomp_t[1]; -typedef struct { v2elm_t xy; v2elm_t yx; v2elm_t t2; } vpoint_precomp; // Point representation in extended affine coordinates (for precomputed points). 
-typedef vpoint_precomp vpoint_precomp_t[1]; - - -/********************** Constant-time unsigned comparisons ***********************/ - -// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise - -static __inline unsigned int is_digit_nonzero_ct(digit_t x) -{ // Is x != 0? - return (unsigned int)((x | (0-x)) >> (RADIX-1)); -} - -static __inline unsigned int is_digit_zero_ct(digit_t x) -{ // Is x = 0? - return (unsigned int)(1 ^ is_digit_nonzero_ct(x)); -} - -static __inline unsigned int is_digit_lessthan_ct(digit_t x, digit_t y) -{ // Is x < y? - return (unsigned int)((x ^ ((x ^ y) | ((x - y) ^ y))) >> (RADIX-1)); -} - - -/********************** Macros for digit operations **********************/ - -// Digit multiplication -#define MUL(multiplier, multiplicand, hi, lo) \ - digit_x_digit((multiplier), (multiplicand), &(lo)); - -// Digit addition with carry -#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ - { digit_t tempReg = (addend1) + (digit_t)(carryIn); \ - (sumOut) = (addend2) + tempReg; \ - (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg)); } - -// Digit subtraction with borrow -#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ - { digit_t tempReg = (minuend) - (subtrahend); \ - unsigned int borrowReg = (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) & is_digit_zero_ct(tempReg))); \ - (differenceOut) = tempReg - (digit_t)(borrowIn); \ - (borrowOut) = borrowReg; } - -// Shift right with flexible datatype -#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (DigitSize - (shift))); - -// 64x64-bit multiplication -#define MUL128(multiplier, multiplicand, product) \ - mp_mul((digit_t*)&(multiplier), (digit_t*)&(multiplicand), (digit_t*)&(product), NWORDS_FIELD/2); - -// 128-bit addition, inputs < 2^127 -#define ADD128(addend1, addend2, addition) \ - mp_add((digit_t*)(addend1), (digit_t*)(addend2), (digit_t*)(addition), NWORDS_FIELD); - -// 128-bit addition with output carry -#define ADC128(addend1, addend2, carry, addition) \ - (carry) = mp_add((digit_t*)(addend1), (digit_t*)(addend2), (digit_t*)(addition), NWORDS_FIELD); - - -/**************** Function prototypes ****************/ - -/************* Arithmetic functions modulo the curve order **************/ - -// Converting to Montgomery representation -void to_Montgomery(const digit_t* ma, digit_t* c); - -// Converting from Montgomery to standard representation -void from_Montgomery(const digit_t* a, digit_t* mc); - -// 256-bit Montgomery multiplication modulo the curve order -void Montgomery_multiply_mod_order(const digit_t* ma, const digit_t* mb, digit_t* mc); - -// Addition modulo the curve order, c = a+b mod order -void add_mod_order(const digit_t* a, const digit_t* b, digit_t* c); - -// Subtraction modulo the curve order, c = a-b mod order -void subtract_mod_order(const digit_t* a, const digit_t* b, digit_t* c); - -// Reduction modulo the order using Montgomery arithmetic internally -void modulo_order(digit_t* a, digit_t* c); - -/************* Multiprecision functions **************/ - -// Check if multiprecision element is zero -bool is_zero_ct(digit_t* a, unsigned int nwords); - -// Multiprecision addition, c = a+b. 
Returns the carry bit -unsigned int mp_add(digit_t* a, digit_t* b, digit_t* c, unsigned int nwords); - -// Schoolbook multiprecision multiply, c = a*b -void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); - -// Multiprecision subtraction, c = a-b. Returns the borrow bit -unsigned int subtract(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); - -// Clear "nwords" integer-size digits from memory -extern void clear_words(void* mem, unsigned int nwords); - -/************ Field arithmetic functions *************/ - -// Field negation, a = -a mod p -void vneg1271(velm_t a); - -// Incomplete modular correction, a = a mod p -void vmod1271_incomplete(velm_t a, velm_t c); - -// Modular correction, a = a mod p -void vmod1271(velm_t a, velm_t c); - -// Field addition, c = a+b mod p -void vadd1271(velm_t a, velm_t b, velm_t c); - -// Field subtraction, c = a-b mod p -void vsub1271(velm_t a, velm_t b, velm_t c); - -// Field division by two, c = a/2 mod p -void vdiv1271(uint32_t* a); - -// Field squaring, c = a^2 mod p -void vsqr1271(velm_t a, velm_t c); - -// Field multiplication, c = a*b mod p -void vmul1271(velm_t a, velm_t b, velm_t c); - -// Field inversion, af = a^-1 = a^(p-2) mod p -void vinv1271(velm_t a); - -// Exponentiation over GF(p), af = a^(125-1) -void vexp1251(velm_t a, velm_t af); - -// Conversion functions -void from_std_to_ext(f2elm_t a, v2elm_t c); -void from_ext_to_std(v2elm_t a, f2elm_t c); - -/************ Quadratic extension field arithmetic functions *************/ - -// Zeroing a quadratic extension field element, a=0 -void v2zero1271(v2elm_t a); - -// Copy quadratic extension field element, c = a -void fp2copy1271(f2elm_t a, f2elm_t c); -void v2copy1271(v2elm_t a, v2elm_t c); - -// Quadratic extension field negation, a = -a in GF((2^127-1)^2) -void v2neg1271(v2elm_t a); - -// Quadratic extension field addition, c = a+b in GF((2^127-1)^2) -void v2add1271(v2elm_t a, v2elm_t b, v2elm_t c); - -// Quadratic extension field subtraction, c = a-b in GF((2^127-1)^2) -void v2sub1271(v2elm_t a, v2elm_t b, v2elm_t c); - -// Quadratic extension field addition followed by subtraction over GF(2^127-1) -void v2dblsub1271(v2elm_t a, v2elm_t b, v2elm_t c); - -// Quadratic extension field division by two, c = a/2 mod p -void v2div1271(uint32_t* a); - -// Incomplete modular correction over GF(p^2) -void v2mod1271_incomplete(v2elm_t a, v2elm_t c); - -// Modular correction over GF(p^2) -void v2mod1271(v2elm_t a, v2elm_t c); - -// Quadratic extension field multiplication, c = a*b in GF((2^127-1)^2) -void v2mul1271(v2elm_t a, v2elm_t b, v2elm_t c); - -// Quadratic extension field squaring, c = a^2 in GF((2^127-1)^2) -void v2sqr1271(v2elm_t a, v2elm_t c); - -// Quadratic extension field multiplication in GF((2^127-1)^2) -void v2mul1271(v2elm_t a, v2elm_t b, v2elm_t c); - -// Quadratic extension field inversion, af = a^-1 = a^(p-2) in GF((2^127-1)^2) -void v2inv1271(v2elm_t a); - -/************ Curve and recoding functions *************/ - -// Normalize projective twisted Edwards point Q = (X,Y,Z) -> P = (x,y) -void eccnorm(vpoint_extproj_t P, vpoint_t Q); - -// Conversion from representation (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT), where T = Ta*Tb -void R1_to_R2(vpoint_extproj_t P, vpoint_extproj_precomp_t Q); - -// Conversion from representation (X,Y,Z,Ta,Tb) to (X+Y,Y-X,Z,T), where T = Ta*Tb -void R1_to_R3(vpoint_extproj_t P, vpoint_extproj_precomp_t Q); - -// Conversion from representation (X+Y,Y-X,2Z,2dT) to (2X,2Y,2Z,2dT) -void 
R2_to_R4(vpoint_extproj_precomp_t P, vpoint_extproj_t Q); - -// Point doubling 2P -void eccdouble(vpoint_extproj_t P); - -// Complete point addition P = P+Q or P = P+P -void eccadd(vpoint_extproj_precomp_t Q, vpoint_extproj_t P); -void eccadd_core(vpoint_extproj_precomp_t P, vpoint_extproj_precomp_t Q, vpoint_extproj_t R); - -// Psi mapping of a point, P = psi(P) -void ecc_psi(vpoint_extproj_t P); - -// Phi mapping of a point, P = phi(P) -void ecc_phi(vpoint_extproj_t P); - -// Scalar decomposition -void decompose(uint64_t* k, uint64_t* scalars); - -// Recoding sub-scalars for use in the variable-base scalar multiplication -void recode(uint64_t* scalars, unsigned int* digits, unsigned int* sign_masks); - -// Convert scalar to odd if even using the prime subgroup order r -void conversion_to_odd(digit_t* k, digit_t* k_odd); - -// Co-factor clearing -void cofactor_clearing(vpoint_extproj_t P); - -// Reduction modulo the order using Montgomery arithmetic -void modulo_order(digit_t* a, digit_t* c); - -// Precomputation function -void ecc_precomp(vpoint_extproj_t P, vpoint_extproj_precomp_t *T); - -// Constant-time table lookup to extract an extended twisted Edwards point (X+Y:Y-X:2Z:2T) from the precomputed table -void table_lookup_1x8(vpoint_extproj_precomp_t* table, vpoint_extproj_precomp_t P, unsigned int digit, unsigned int sign_mask); - -// Modular correction of input coordinates and conversion to representation (X,Y,Z,Ta,Tb) -void point_setup(point_t P, vpoint_extproj_t Q); - -// Point validation: check if point lies on the curve -bool ecc_point_validate(vpoint_extproj_t P); - -// Output error/success message for a given ECCRYPTO_STATUS -const char* FourQ_get_error_message(ECCRYPTO_STATUS Status); - -// Constant-time table lookup to extract a point represented as (x+y,y-x,2t) -void table_lookup_fixed_base(vpoint_precomp_t* table, vpoint_precomp_t P, unsigned int digit, unsigned int sign); - -// Computes the modified LSB-set representation of scalar -void mLSB_set_recode(uint64_t* scalar, unsigned int *digits); - -// Generation of the precomputation table used internally by the double scalar multiplication function ecc_mul_double() -void ecc_precomp_double(vpoint_extproj_t P, vpoint_extproj_precomp_t* Table, unsigned int npoints); - -// Computes wNAF recoding of a scalar -void wNAF_recode(uint64_t scalar, unsigned int w, int* digits); - -// Encode point P -void encode(point_t P, unsigned char* Pencoded); - -// Decode point P -ECCRYPTO_STATUS decode(const unsigned char* Pencoded, point_t P); - - -/************ Functions based on macros *************/ - -// Copy extended projective point Q = (X:Y:Z:Ta:Tb) to P -#define ecccopy(Q, P); v2copy1271((Q)->x, (P)->x); \ - v2copy1271((Q)->y, (P)->y); \ - v2copy1271((Q)->z, (P)->z); \ - v2copy1271((Q)->ta, (P)->ta); \ - v2copy1271((Q)->tb, (P)->tb); - -// Copy extended projective point Q = (X+Y,Y-X,2Z,2dT) to P -#define ecccopy_precomp(Q, P); v2copy1271((Q)->xy, (P)->xy); \ - v2copy1271((Q)->yx, (P)->yx); \ - v2copy1271((Q)->z2, (P)->z2); \ - v2copy1271((Q)->t2, (P)->t2); - -// Copy extended affine point Q = (x+y,y-x,2dt) to P -#define ecccopy_precomp_fixed_base(Q, P); v2copy1271((Q)->xy, (P)->xy); \ - v2copy1271((Q)->yx, (P)->yx); \ - v2copy1271((Q)->t2, (P)->t2); - -// Vectorize extended projective point Q = (X:Y:Z:Ta:Tb) -#define point_from_std_to_ext(Q, P); from_std_to_ext(Q->x, P->x); \ - from_std_to_ext(Q->y, P->y); \ - from_std_to_ext(Q->z, P->z); \ - from_std_to_ext(Q->ta, P->ta); \ - from_std_to_ext(Q->tb, P->tb); - -#ifdef __cplusplus -} 
-#endif - - -#endif diff --git a/ffi-deps/FourQlib/FourQ_32bit/FourQ_params.h b/ffi-deps/FourQlib/FourQ_32bit/FourQ_params.h deleted file mode 100644 index 958c61b..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/FourQ_params.h +++ /dev/null @@ -1,35 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: FourQ's curve parameters -* -* This code is based on the paper "FourQ: four-dimensional decompositions on a -* Q-curve over the Mersenne prime" by Craig Costello and Patrick Longa, in Advances -* in Cryptology - ASIACRYPT, 2015. -* Preprint available at http://eprint.iacr.org/2015/565. -************************************************************************************/ - -#ifndef __FOURQ_PARAMS_H__ -#define __FOURQ_PARAMS_H__ - -#include "FourQ_internal.h" - - -// Encoding of field elements, elements over Z_r and elements over GF(p^2): -// ----------------------------------------------------------------------- -// Elements over GF(p) and Z_r are encoded with the least significant digit located in the leftmost position (i.e., little endian format). -// Elements (a+b*i) over GF(p^2), where a and b are defined over GF(p), are encoded as a||b, with a in the least significant position. -// Parameter "d" is encoded using the redundant representation 23|26|26|26|26-bit for each field element in a+b*i, where the 23-bit digit -// is the most significant digit. Digits are stored in 32-bit words. - -static const uint32_t PARAMETER_d[10] = { 0x00000142, 0x00000000, 0x000E4000, 0x00000000, 0x00000000, 0x01FC0C8D, 0x0085223C, 0x020FCB38, 0x0211995F, 0x005E472F }; -static const uint64_t GENERATOR_x[4] = { 0x286592AD7B3833AA, 0x1A3472237C2FB305, 0x96869FB360AC77F6, 0x1E1F553F2878AA9C }; -static const uint64_t GENERATOR_y[4] = { 0xB924A2462BCBB287, 0x0E3FEE9BA120785A, 0x49A7C344844C8B5C, 0x6E1C4AF8630E0242 }; -static const uint64_t curve_order[4] = { 0x2FB2540EC7768CE7, 0xDFBD004DFE0F7999, 0xF05397829CBC14E5, 0x0029CBC14E5E0A72 }; -static const uint64_t Montgomery_Rprime[4] = { 0xC81DB8795FF3D621, 0x173EA5AAEA6B387D, 0x3D01B7C72136F61C, 0x0006A5F16AC8F9D3 }; -static const uint64_t Montgomery_rprime[4] = { 0xE12FE5F079BC3929, 0xD75E78B8D1FCDCF3, 0xBCE409ED76B5DB21, 0xF32702FDAFC1C074 }; - - -#endif \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_32bit/FourQ_tables.h b/ffi-deps/FourQlib/FourQ_32bit/FourQ_tables.h deleted file mode 100644 index 29dd62f..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/FourQ_tables.h +++ /dev/null @@ -1,369 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: precomputation tables -************************************************************************************/ - -#ifndef __TABLES_H__ -#define __TABLES_H__ - -#include - - -// The table below was generated using window width W = 5 and table parameter V = 5 (see http://eprint.iacr.org/2013/158). -// Number of point entries = 5 * 2^4 = 80 points, where each point (x,y) is represented using coordinates (x+y,y-x,2*d*t). -// Each coordinate (in GF(p^2)) is encoded using the redundant representation 23|26|26|26|26-bit for each field element -// in a+b*i, where the 23-bit digit is the most significant digit. 
Digits are stored in 32-bit words. -// Table size = 80 * 3 * 320 = 9.375KB - -static const uint32_t FIXED_BASE_TABLE[2400] = { -0x303E631, 0x28D3CE9, 0x2B5FE18, 0x2FC7540, 0x287460, 0xF90353, 0x398BDF9, 0x2CDEE02, 0xDE2E1A, 0xC3BA0, 0x937EDC, 0x3C3E62C, 0x55590B, 0x1E093C3, 0x740B7C, 0x3A01366, 0x48E448, 0x17A5B32, 0x2E4EA55, 0x4FFCF5, 0x3DA42BB, 0x2BF32EA, 0x17C6297, 0xDD55B2, 0x5948D1, 0x330684C, 0x2268E4C, 0x1F27A81, 0x1C828D0, 0xCAF2B, -0x1188787, 0x2E8063F, 0x12A83A8, 0x2062374, 0x554612, 0x9F9B77, 0xF30CF0, 0x2049B0B, 0x3E2E134, 0x1BAEEA, 0x1FAF900, 0x1096984, 0x3789006, 0x4B869B, 0x18F7CD, 0x2556066, 0x33C2684, 0x36EB6DC, 0x3BAB39E, 0x448E05, 0x12829B0, 0x3CCDA27, 0x3DB4BF2, 0x32CA55E, 0x6D911D, 0x3C03EE, 0xD4F6F7, 0x25019F2, 0x16EAFB, 0x6C543, -0x1AFA125, 0x3D15C4F, 0x364D2EA, 0x1619228, 0x729630, 0x1313EF2, 0x1DDC7E7, 0x306961B, 0x3E2FF8A, 0x4F41C7, 0x19790AC, 0x188EB96, 0xA42408, 0x21910CC, 0x4D3385, 0xDD7487, 0x1A59927, 0x2114FC5, 0x3839663, 0x74DF72, 0x13C110B, 0x18C964, 0x30D8C9A, 0x457F92, 0x76BD4, 0x2BFF3D9, 0x679975, 0x3A08766, 0x51F3B8, 0x249240, -0x965A73, 0x16E5852, 0x3706D69, 0x28A20A7, 0x28AAC8, 0x1F7A57B, 0x7014CA, 0x200E41F, 0x3A27C0F, 0x441CA9, 0x3F7241E, 0x28E2AE2, 0x37F4E1A, 0x2BF20CA, 0x58F28C, 0x25400C6, 0x2BE2E3E, 0x63ECAD, 0x18A13A, 0x34B6D1, 0x2F15097, 0x2632ACE, 0x1C49F54, 0x24005F6, 0x6DBE77, 0x48E8BC, 0x3DDAA05, 0x200663B, 0x16FC8EB, 0x637192, -0x1ED81FA, 0xB1C86F, 0x16ABC5E, 0x3BB5A01, 0x4EDE70, 0x975810, 0x3366D6D, 0x1AA88F3, 0x64BC2A, 0x4752FD, 0x3734414, 0x1E53AC7, 0x662318, 0x34B211A, 0x11DDF7, 0x2B1A34E, 0xEC1BDC, 0x2770261, 0x2CDEAC1, 0x465575, 0x2638D2B, 0x26117E0, 0x18CE40B, 0x141E40A, 0x488940, 0x1B4F131, 0x2E4E940, 0x32FBBED, 0x34A226, 0x4F356, -0x15B278B, 0x3761D44, 0x58A457, 0x395364A, 0x56F25E, 0x20C977E, 0x1307373, 0x99692D, 0x1061F5D, 0x78FCA, 0x2CC22A1, 0x2CB7B5D, 0x2BF63BB, 0x214F770, 0x117B28, 0x1C2BAA2, 0xDD9F2E, 0xA8F43F, 0x9783AA, 0x73079E, 0x215796D, 0x1E64AD6, 0x2F92017, 0x520367, 0x2E7772, 0x3372916, 0x2620D59, 0x176DBE0, 0x1ADEBDD, 0x258F17, -0x168391B, 0xCE3F58, 0x13A4308, 0x17E69CD, 0x728592, 0x3E85114, 0x303F413, 0x2B51862, 0x39D08EB, 0x53259E, 0x33B1A68, 0xC6A12, 0x31FCFE0, 0x1987E81, 0x1A4F1D, 0x1C928A7, 0x35505A3, 0x2A6A2DD, 0x6B7C65, 0x60185C, 0x6DA9B4, 0x25C5F7, 0x3C03498, 0x12D373F, 0x606209, 0x1FA7E8D, 0x3A9BE81, 0x8A0A41, 0x1BC9DC5, 0x4A4FE0, -0x29E80F0, 0x94EA7B, 0x17337BB, 0x22F32C4, 0x419A92, 0x29A039E, 0x8EF99A, 0x14BB843, 0x2BA5CA0, 0x1B2D1, 0x3051231, 0x2216134, 0x1DD7A75, 0x83325E, 0x54DF1E, 0x22F22DD, 0x241BF8F, 0x1A0F91D, 0x25D7F7C, 0x4E36E9, 0x3747634, 0x21C5D1A, 0x3433D81, 0x2EBBB84, 0x3E5E31, 0x173BAA3, 0x2A025E5, 0x37C6E4D, 0x365F3F9, 0x4B852A, -0x418B9E, 0x2CD19C5, 0x2F88E08, 0x26CBF9B, 0x283D71, 0x5C180B, 0xE74B79, 0x15C8B73, 0x3BC474B, 0x75ACFC, 0x561876, 0x1DDEA3, 0xC4B8F4, 0x1029C4D, 0xC54AC, 0x2BAEE08, 0x38A1F59, 0x8BFB92, 0x1401A86, 0x6F357E, 0x11E6971, 0xCC67B, 0x3D40C59, 0x3BE476B, 0x747C45, 0x12F816E, 0x21AA47, 0x2F7DDE4, 0x3B6CECF, 0x5DCB27, -0x21DD2EE, 0x3711B3E, 0x249843F, 0x27DC259, 0x51551F, 0x1ED9BDC, 0x14D3DD8, 0x307FB54, 0x2CC1CFE, 0x453455, 0x3CAB70B, 0x1DCF8E0, 0x34DFF24, 0x179D633, 0x679BE2, 0x3EEE29, 0x1FB7CA5, 0xF66DA1, 0x2E35B70, 0x3DC9E5, 0x13FB75B, 0x1432E90, 0x137F56A, 0x16F16A1, 0x1E6531, 0x2C7213, 0x24090A0, 0x252E5FF, 0x323D094, 0x73C9D8, -0x3198C8F, 0xDEE18C, 0xF333C6, 0x2CFB505, 0x534F84, 0x2DD6902, 0x4F9CB7, 0x33AFAD3, 0x250737C, 0x5ED57E, 0x257306E, 0x3F8074A, 0x13F95A6, 0x3A8D11C, 0x73B63D, 0x31C2BF3, 0x2DC15C3, 0x18DB39C, 0x3927C68, 0x2DF8C6, 0x37797E4, 0x2F0D259, 
0x1062661, 0x32EFAFA, 0x501AE7, 0x959643, 0xAA237A, 0x1D515B5, 0xA04700, 0x37275, -0x1BB928, 0x315E88C, 0xFC6010, 0x1C55555, 0x378B31, 0x1A02CF1, 0xFE908A, 0x1B29F88, 0x2E1435F, 0x5F0047, 0x402DAA, 0x11EB8CA, 0x22BC4D2, 0x9EA142, 0xD0306, 0xA443F5, 0x19766A2, 0x3089B4E, 0x1ACB5B6, 0x6EC968, 0x2A1D987, 0xB823F, 0x3600DE2, 0x34FCA35, 0x5C64E1, 0x261BFB7, 0x345EFBD, 0x334C157, 0xDB75D4, 0x56392D, -0x119155C, 0x11E362F, 0x2AFDE25, 0xB4F4CD, 0x146D4F, 0x14E15E8, 0x2F802FE, 0x10C09BF, 0x269AB7C, 0x2B185A, 0x152AB7B, 0x149ECFB, 0x1E80926, 0x51CC40, 0x67997E, 0x7CC541, 0x3D3FD25, 0x1A7AB58, 0x31FEB26, 0x36F800, 0x456D9, 0xB8C90, 0x297BD03, 0xD902F1, 0x4372E4, 0x350D898, 0x132AED1, 0x3CBF914, 0x3EB2688, 0x75D25A, -0x255CBFE, 0x1645D9D, 0xB07794, 0x1985FF1, 0x74DB21, 0x166D0C9, 0x1EC8909, 0x3B23705, 0x1642105, 0x1D543B, 0x428783, 0xA02D11, 0x3EF19C, 0x3F62DB3, 0x352309, 0x68AE72, 0xCF5AB0, 0x19F4378, 0x688F00, 0x4EC067, 0x3D05BB5, 0x20DB868, 0x2FE39D9, 0x37FC893, 0x44FE1A, 0x17EFB4A, 0x1AF0F39, 0x150A29, 0x1A0D685, 0x2EFEC8, -0x3194BCA, 0x67027E, 0x12062FE, 0x34E54F3, 0x18CC07, 0x1C0B9E0, 0x3FC85F2, 0x3CEE5BD, 0x159606A, 0x671AA7, 0xF7D1A2, 0x30CEB8A, 0x23FEE1C, 0x124DE82, 0x1B6F25, 0x1BABB83, 0xF4599, 0x1211515, 0xDB5622, 0x74B956, 0x296F869, 0x25EB93A, 0x10BDBDB, 0x627894, 0x1507CE, 0x2B93B13, 0x393553E, 0x929796, 0x17C1658, 0x6A8176, -0x1A833ED, 0x1014879, 0x36442E9, 0xCAC917, 0x3BDEA5, 0x3D52693, 0x1DA5D7F, 0xED6BEA, 0x122E9B5, 0x64B948, 0xE33EC7, 0x14B4065, 0x1F259DB, 0x369558E, 0x71CF65, 0x35A8138, 0x1CEC7F6, 0x1C22EDE, 0xEDB18B, 0x12E4D1, 0x1185517, 0x26C3099, 0x3DD09D1, 0x3495ED7, 0x77A011, 0x2CD84E4, 0x37172AB, 0x12D11FE, 0x5478D2, 0x46844E, -0x4220DF, 0x8E8C64, 0x30997A4, 0x31D1FA3, 0x5B3165, 0x335E22E, 0x197BAB7, 0x195B1C6, 0x15BF115, 0x7802B5, 0x15BD2DE, 0xB7BD00, 0x26AD85A, 0x21E55C5, 0x17F2AB, 0x60C1E5, 0x3DB4D48, 0xE6A19C, 0x346F902, 0x122A7A, 0x5BF766, 0x26EF219, 0x1ACF5B7, 0x275342B, 0x20FB00, 0x3A28538, 0x9A89CA, 0xBDE975, 0x3A43508, 0x7041B4, -0xDAB057, 0x44CD6, 0x2433B3, 0x19BC349, 0x3D398B, 0x3CD4708, 0x1666680, 0x9691A, 0x27364E, 0x1EAE24, 0x36108C, 0x35AD813, 0x196866D, 0xDE2420, 0x171308, 0x125DCFD, 0x2B5A45C, 0x31CA57C, 0x2E25C13, 0x34B06C, 0x335ABF2, 0x3FA39C7, 0xDB9DCA, 0x663551, 0x698331, 0x3AF075B, 0x1D9D990, 0x2B6D628, 0x3955674, 0x200950, -0xDD9074, 0x18FF0FB, 0x13A2D4F, 0x1C50AB0, 0x747331, 0x1319356, 0xC201, 0x38BA96B, 0x380910D, 0x2C20FF, 0x26AC01A, 0x25446B4, 0x3CF6488, 0x86665F, 0x4EE327, 0x30BF8EA, 0x3F9B9C3, 0x26D715F, 0x2D29B42, 0x6B617F, 0x12F7324, 0x1B72871, 0x277491, 0x3C2191A, 0x3C8269, 0x37E04B, 0x843132, 0x142CC24, 0x13E4569, 0x4E480B, -0x27FE35E, 0x3BCEC26, 0x4B2C5F, 0x37913F6, 0x31A501, 0xA407B9, 0xA79250, 0x25AB79F, 0xF29733, 0xBA7E0, 0x274D8EA, 0x22C8162, 0x1ADCA7A, 0x204389B, 0x46F4C7, 0xA1128A, 0x1C6B3A5, 0x2811461, 0x95256, 0x44DB55, 0x3F18D5C, 0x2278692, 0x6B27F8, 0xA90FCC, 0x4D4F17, 0x31CFFAD, 0x265D9AE, 0x1ACA33A, 0x164934E, 0x625477, -0x225136E, 0x148C3B0, 0x211DD85, 0x341E364, 0x1C544D, 0x23F63BA, 0x3A65A7D, 0x138212F, 0x3725277, 0x69AF1, 0x3E5C256, 0x2F3D033, 0x3595305, 0x2492EEE, 0x63AE90, 0x3B7DE06, 0x425DE4, 0x43EE45, 0x3CE7F00, 0x9780C, 0x3798871, 0x2BE39FA, 0x3A37827, 0x1B1DD5D, 0x3ACE8A, 0x2E055BA, 0x3C184CC, 0x292B79D, 0x1EAAB3A, 0x561DC0, -0x20D2673, 0x108B660, 0x13487E4, 0x20F82BD, 0x6B85DF, 0x2D8526B, 0x146B077, 0x2BD1F1, 0x39455C5, 0x35EAD8, 0x3C79DD4, 0x1BBDB0C, 0x3DE46DA, 0x1349653, 0x5F2EA0, 0x27D5FA, 0xDF4330, 0x331B910, 0x401EC0, 0x53B540, 0x14BA5A3, 0x3C6628F, 0x195810, 0x966EA5, 0x4463BD, 0x2ACF9E, 0x2252B3B, 0x9CE32B, 
0x187590D, 0x787117, -0x11FE211, 0x2B86CFD, 0x2463253, 0xFD4D6D, 0x409E4B, 0x25E49DE, 0xDB4436, 0x336A3A2, 0xA7084, 0x19D2B1, 0x2AADD90, 0x17D010D, 0xB19283, 0x541464, 0x942A3, 0x2828279, 0x244C61D, 0x1F9CC18, 0x30F2A38, 0x3AFE96, 0x1230693, 0x2006D24, 0x10919F1, 0x2E1897F, 0x39E28D, 0x1D44C8E, 0x2D40D57, 0x280D9FA, 0x369CA6C, 0x145155, -0x14D76D5, 0x33E0407, 0x2C68D3C, 0x6A30AD, 0x5A0FAA, 0x354EA8A, 0x19B2132, 0x7733CC, 0x38FD59B, 0x51052C, 0x1AE9FF5, 0x3853799, 0x30243BE, 0x2280733, 0x758611, 0x16BF15E, 0x39E4722, 0x13D6089, 0xB49876, 0x35FF02, 0x37D1CF9, 0x3384F63, 0x38F6CD3, 0x23E56DE, 0x4F1DE9, 0x2DC41B4, 0x1A28918, 0x1D2951E, 0xF08FDB, 0x61AD9E, -0xEBDB51, 0x3FA9920, 0x2CF3584, 0x1CFE77, 0x5D52FE, 0x1F336D5, 0x3920FAB, 0x296A9AF, 0x32602C5, 0x1DFA03, 0x3819A19, 0x1CF51FF, 0xED55F, 0x174D844, 0x697BF5, 0x2399419, 0x1201111, 0x259FDED, 0x119FF38, 0x618C94, 0x8EF50C, 0x25FFC7C, 0x33C0F25, 0x2E63764, 0x7C935, 0xDED5F6, 0x1632F1E, 0x148FBB7, 0x1ABC4C, 0x1E9A0D, -0x34CD4DB, 0x338515F, 0x994879, 0x28658B5, 0x28396C, 0x17605C3, 0x2568F71, 0x27B1F50, 0xF69314, 0x1E570F, 0x935787, 0x1A68E41, 0x1CCE2AF, 0x1DBF740, 0x591EE3, 0xBC8633, 0x2D637E2, 0x15D1F77, 0x146CBCE, 0x5464D6, 0x29A86AE, 0x3025AC7, 0x2B28AFB, 0x12FD96D, 0x6CE2DF, 0x29D3E08, 0x2A0A34B, 0x26D03B3, 0x7634B5, 0x638201, -0x13D193D, 0x21F2993, 0x16AF949, 0x34EDB5, 0x50DDF7, 0xAD15A9, 0x37D9F32, 0x1F928D5, 0x262F16C, 0x392080, 0x3D5A4FB, 0x27D4837, 0x3BA9CE9, 0x21EE1A9, 0x323BBC, 0x3C789, 0x3E22A36, 0x55E13, 0x2F7C880, 0x56FFDC, 0x324C72D, 0x3C368C6, 0xC163AF, 0x1991830, 0x700115, 0x2C46A13, 0x1EDFD4E, 0x3BC276F, 0x3FADC1D, 0x1C069B, -0x14A3C36, 0x1C64F28, 0x1E3D8F4, 0x12BC223, 0x6D73E3, 0x17D6634, 0x2CAF64C, 0x1EC8634, 0xE2DDFC, 0x5B4047, 0xA1CB1D, 0x3EAEDC7, 0x2A46F34, 0x32B2951, 0x54ABB, 0xEDA660, 0x372B742, 0x1EF1E8C, 0x2FE562F, 0x6971AB, 0x1DB4EAF, 0xE155F7, 0x367C413, 0x216C1E6, 0x1E1585, 0x3A26068, 0x1C11D33, 0x2469D22, 0xCACF3A, 0x53B36D, -0x543D08, 0x148F05B, 0x314E011, 0xB17DCC, 0x4668E9, 0x117ACD1, 0x3CFAF90, 0x131ABAE, 0x6A9C4E, 0x4037D, 0x390C68D, 0x463938, 0x157568E, 0x1569130, 0x6B80CD, 0x329C032, 0x1FAA295, 0x1AB2730, 0x17FBBA6, 0x5CC547, 0x14AC75, 0x1439093, 0x1CD8344, 0x1F976CF, 0x3F0915, 0x588052, 0x38AC6D4, 0x2F8F62C, 0x3153994, 0x27A899, -0x3C2A5ED, 0x340E7CB, 0x3EEC0AC, 0x3775604, 0x4B4044, 0xA75958, 0x34627A4, 0x33A1C04, 0x2F39C74, 0x242551, 0x1F87F05, 0x2BE5AD4, 0x1815D95, 0x83E027, 0x29888, 0x39483C5, 0x3D97DCE, 0xB43B27, 0x117D0D9, 0x2EF607, 0x34FB8B9, 0x1AF724F, 0x3B8ACB6, 0x2D21D5A, 0x2B86C9, 0x39867A1, 0x2F6B826, 0x29AAF8E, 0x6139BF, 0x441E70, -0xCC1289, 0x94C0CC, 0x2992FDC, 0x16A2D35, 0x47D8D6, 0xAE74BE, 0xEDBE8C, 0x19D58C0, 0xF30EF6, 0x1CA869, 0x18F2A6, 0x3AC5444, 0x3FF5699, 0x19367D3, 0x3DA047, 0x13AAB59, 0x1C810CF, 0x17FF361, 0x184B2E1, 0x2FA911, 0x2F1CAC, 0x15F6843, 0x619A40, 0x2E4CE97, 0x48A219, 0x15F0BC5, 0x1040834, 0x1DAD423, 0x1411A82, 0x73F889, -0x27421B8, 0x354C4B1, 0x222E1BA, 0x6CDA39, 0x419477, 0x4E0DFF, 0x1C69E79, 0x31C28CC, 0x391519F, 0x4B4564, 0x3E52093, 0x27C5AAB, 0x3EA9775, 0x3735D7E, 0x391B71, 0x34EF798, 0x301A52A, 0x130D2A1, 0x1515110, 0x23087, 0x3AA1AA1, 0x2B87FF3, 0x21E74B7, 0xCB5CCD, 0x64E26F, 0x10B54B9, 0x11C0E2F, 0x2E158DA, 0x1BFABB3, 0x148CFA, -0x1C2CC3D, 0x1B5351E, 0x3DE375, 0x2A34C55, 0x25D44E, 0x2BB2C43, 0x322FBC9, 0x337FD82, 0xCF49F, 0x2C2047, 0x3DAD260, 0xCF660D, 0x3B7D5BD, 0x5E8CE0, 0x779431, 0x2A583F2, 0x1C75A5F, 0x1BF2120, 0x9CA889, 0x3C7C41, 0x1F03AD3, 0x2EF7EBC, 0x370492E, 0x31A2D8, 0x5D6103, 0x2EE75D1, 0x38A14EE, 0x3636CA6, 0xD1DC68, 0x12404B, -0x2667E4F, 0xF118C9, 
0x23B0BE1, 0x245BE6F, 0x2BD261, 0x1C80F89, 0xFE32EB, 0xB4886E, 0x2285E53, 0x74520D, 0x24CF97E, 0x171D140, 0x2B021E1, 0x78794F, 0x5CEE74, 0x399CDA1, 0x223782B, 0x62C8D0, 0x2587302, 0x625812, 0x1C0E934, 0xD0DCC8, 0x37A4431, 0x1A31093, 0x60BBC7, 0x10E143, 0x1C7EFCF, 0x15B8ABA, 0x27C52A4, 0x37B8EA, -0x213FF, 0x1BB1971, 0x45C8D9, 0xA3FDDE, 0x74A088, 0x2F607A3, 0x2DC6536, 0x706BED, 0x1C45872, 0x17E866, 0x24059CF, 0x2E63814, 0x2F09ACE, 0x3125245, 0x68552A, 0x31B3C47, 0xA5C6EB, 0x1B704CD, 0x2F37788, 0x68442E, 0x2867595, 0xA6E303, 0x247A196, 0x34D8D7E, 0x6A6955, 0x2E393AD, 0x2D1783C, 0x10C46FA, 0x3D3F3C1, 0x66DD3E, -0x1DC7DF1, 0x2DEAF3F, 0xF9CBB0, 0x1D46C2F, 0x14EB5B, 0x2FD411D, 0x1E7E728, 0x200A1CF, 0x1CFFFC1, 0x5C496F, 0x1426D70, 0x1236155, 0x18A9496, 0x1A88C8B, 0x46C101, 0x19B6D4F, 0x3F6E1C3, 0x3633B57, 0x1482776, 0x609EB6, 0x2DC82C5, 0x3E459AF, 0x2F27E70, 0x2D722BE, 0x772FB5, 0x227A20, 0xA5366C, 0x3C8D79A, 0x10444B6, 0x7F75B1, -0x2070020, 0x71FE23, 0x17B198D, 0x2AB1208, 0x5953D0, 0x15F33FF, 0x94FAF8, 0xC45E28, 0x3047985, 0x267D1D, 0x19E2246, 0x13D42AE, 0x1495BE6, 0x1720BF9, 0x4EAAAB, 0x260BED0, 0x356B01F, 0x1B4D927, 0x1B79AA8, 0x67D378, 0x36E2945, 0x258AFC3, 0x2A9BA71, 0x1128D7B, 0x63D938, 0x26640AC, 0x8A31F4, 0x35A1B34, 0x34BACA3, 0x169C38, -0x39DC22, 0x25CACCD, 0x1E144B7, 0x15F36A8, 0x714784, 0x1D58C7, 0x1B8497B, 0x14F1522, 0x25B9E3F, 0x669D87, 0x244A07F, 0x3746AAA, 0x3ECF750, 0x2D56BAE, 0x327C62, 0x14FBD25, 0x2E3A56D, 0x364E006, 0x17407AC, 0x2AB3F9, 0xD5E196, 0x3942030, 0x2725FCB, 0x35D695F, 0x2A1B9B, 0x139B5D, 0x2CAD9D6, 0x3A731D2, 0x2BE126D, 0x751CF4, -0x37D00A5, 0x29F4B8C, 0x24BC164, 0x2932E0E, 0xCEE3, 0x1BE28F9, 0x2380D9B, 0x502349, 0x245234A, 0x53D89, 0x2B0FB69, 0x1975932, 0xE74016, 0x88235A, 0x4A9913, 0x13F915D, 0x12EF52F, 0x22BBBA4, 0x3E45C84, 0x1D34B0, 0x1E8A9F3, 0x5371CA, 0xB605D1, 0xF754CD, 0x8E7A4, 0x2F0F3E8, 0x2E3A48C, 0xF6228D, 0x6E0099, 0x5CB7BE, -0x2508F23, 0xB1E1E0, 0x21B29AF, 0x333DF8E, 0x336AE7, 0x1D194BE, 0xB53B8B, 0x28727FE, 0x6CAE29, 0x573D2E, 0x3B2EA36, 0xBA8CD8, 0x243333, 0xDD6C7D, 0x200BC1, 0x2B3260D, 0x11F2301, 0x1C5E65C, 0x3294E65, 0x42021F, 0x3B19946, 0x39B3D26, 0x2D2C2F7, 0x1A88C65, 0x311FBA, 0x3E0D70D, 0x304ED8A, 0x30DCC30, 0x509C46, 0x61EEAC, -0x17D35D4, 0x9A6596, 0x21D7885, 0x32F9AF4, 0x70169B, 0x129A510, 0x6CB42B, 0x394DA0F, 0x51CB06, 0x2ADE53, 0x9873E7, 0xC83706, 0x1B3811E, 0x253A173, 0x2D2A17, 0x2F621E3, 0x23114E9, 0x36F7A0A, 0x2D51497, 0x4B06D5, 0x48EC1C, 0x2459A46, 0x730F42, 0x155755E, 0x1D4216, 0x10A5B4E, 0x18369CA, 0x165FF8C, 0xE687CD, 0x66DD9F, -0x3B474A0, 0x30A764D, 0x1EE455A, 0x1E928DA, 0x429196, 0x2E6BC89, 0x36B2A84, 0x310918, 0x2401947, 0x3D46E8, 0xE90B1, 0x1550C28, 0xA39AF0, 0x3D5B697, 0x16F62B, 0x8C51E6, 0x870CD2, 0x3CD51A0, 0x37D8FEE, 0xD64DA, 0x1BFEA7C, 0x23B3967, 0x203A091, 0x35F75D0, 0x3B3319, 0x39FA90C, 0x21516E2, 0x19E61D8, 0x3724231, 0x13B792, -0x3D0979, 0x364E6A7, 0x31270A2, 0x2FADD2F, 0x321A5D, 0x26D8470, 0x251FFFD, 0x3C995E5, 0x32BF49A, 0x22EC9E, 0x3B10536, 0x1F2A0A4, 0x1846DE1, 0x3015677, 0x593F56, 0x1485023, 0x20DCCDD, 0xB1C114, 0x3F7DD12, 0x23C6B0, 0x227804F, 0x2411162, 0x2A70377, 0x2589C9B, 0x573E91, 0x235AC70, 0x6C93CC, 0x385E35E, 0x8127D3, 0x51BA08, -0xD47194, 0x1246058, 0x21DD4BC, 0x78CC23, 0x5D29A2, 0x26E4E33, 0x16252CF, 0x10FB7E1, 0xBD3CC7, 0x50DBBD, 0x1A9C9DE, 0x122F48D, 0x710EF2, 0x3486D8D, 0x3418AD, 0x2363BD2, 0x1C8CE94, 0x99F96C, 0x2B66823, 0x7C8414, 0xE6C05C, 0x2B2D295, 0x179ABC6, 0x68464D, 0x572902, 0x308FA1C, 0x1F0F802, 0x2A73062, 0x3E785F0, 0x3D0B4F, -0x190317F, 0x1FA1D56, 0x3E48D50, 0x2CEF1EF, 0x75B27B, 0xE651F5, 
0x203CB1B, 0x2E4644A, 0x3F06DC1, 0x7B9795, 0x1F9A640, 0x1D6F7BF, 0x3AA89DE, 0x2AEAFF, 0x75ADE5, 0x30889A0, 0x2AC45A1, 0x2C6CCE0, 0x3F37567, 0x6F3DDC, 0xDE0674, 0x1A0CD62, 0x20736E3, 0xA7A8A2, 0x291D11, 0x884D61, 0x2394E19, 0xD76F8B, 0x3C6BA15, 0x706EF8, -0x3753069, 0x2A31960, 0x3966137, 0x731883, 0x1E45F, 0x376C7BA, 0x387FE0B, 0x1C54E28, 0x2B38FA2, 0x36D29E, 0x30B49CB, 0x1E7C55F, 0x3937833, 0x278AEB2, 0x65E9C3, 0x30CDA3E, 0x8F1141, 0x2DE59B3, 0xC6ADF8, 0x16E02F, 0x35122B7, 0x33CD198, 0x1E0953B, 0x1F5B0B, 0x1FD7E2, 0x21E57AC, 0x3D7E53A, 0x34DF3A5, 0x3A343C2, 0xCBA06, -0x3A9C392, 0x11030E1, 0x180070B, 0x10F7B87, 0x1E7DC1, 0x239912B, 0x22E9B5C, 0x2D14549, 0x2805C60, 0x332870, 0x32D794C, 0xC1BF19, 0x2B4E6BE, 0x44916C, 0x2C2CE2, 0x12BA79F, 0x2DC8B23, 0x327A109, 0x3EA717D, 0x268520, 0xFE78EE, 0x2CC0149, 0x2698515, 0x3B32EBA, 0x736201, 0x210BF78, 0x22B044C, 0x1843460, 0x2467110, 0x32D8FD, -0xB8F2D8, 0x15F86C1, 0x3459C95, 0x3DE6371, 0x775437, 0x3A417F5, 0x3D5616, 0x1304120, 0x17CF9EB, 0x2E00EC, 0x3446288, 0x1CF57CA, 0x36E4FC8, 0x24D8921, 0x32270A, 0x789B22, 0x1A91F02, 0x16BEC64, 0x3FA4958, 0x2370D9, 0x1E671C4, 0x2BE8D86, 0x1FB2430, 0x33AB07D, 0x156468, 0x2417635, 0x137B0BC, 0x258F3B8, 0x2405F03, 0x31140E, -0x37443FA, 0x17E2332, 0x24AC5C8, 0x1759361, 0xDA75F, 0x1B0FB84, 0x3FD121C, 0x36C3295, 0x2EB4CC, 0x1B79E1, 0x12977B3, 0x3E650B7, 0x1A23FFD, 0x2343E68, 0x4C1B19, 0x112864E, 0x1E28930, 0x8D0BA7, 0x25FC9B5, 0x74F668, 0x27AB611, 0x1701BA1, 0x38B93FD, 0xCFFDE1, 0x4B98CE, 0x1CB9165, 0x2D3739D, 0x1163F7D, 0x28267B1, 0x11665A, -0x27118B9, 0x263C5AB, 0x109A2A4, 0x36ED3AD, 0x265EC3, 0xCE2C86, 0x108C19A, 0x33853DA, 0x2AACB8C, 0x36E62B, 0x1AB4478, 0x1F529E, 0x875995, 0x3F1047A, 0x25BFB2, 0x3022CE1, 0x307B24C, 0x466D7A, 0x2B80CF4, 0x23D341, 0x162BC00, 0x16D197A, 0x3180D29, 0x6E8B5C, 0x23D021, 0x3922D4D, 0x3335EBF, 0x1514A03, 0x12378C0, 0x1E7671, -0x94EFAB, 0x6ACDAA, 0x1E58C24, 0x3CD0FF0, 0x1C9FC2, 0x3D27A87, 0x2E5958A, 0x3015CA3, 0x8A1775, 0x53623E, 0x219420, 0x1047C07, 0x2F9557, 0x1DC25AD, 0x192655, 0x323592F, 0x4B6506, 0x1C06D33, 0x2870F14, 0x30A9A9, 0x26E8F79, 0x26C2CFA, 0x3CB23D8, 0x77171D, 0x7EAB75, 0x2DF6E36, 0x26C61B9, 0x1694C0A, 0x160FC55, 0x4F844D, -0x393DFFA, 0x608C9, 0x54D419, 0x2DAF4E, 0x2ADD44, 0x1DF6E32, 0x1203C4C, 0x2417D55, 0x211AB8F, 0x318CE3, 0x20984F4, 0x1418B46, 0x39F6056, 0x18F4AFF, 0x6EBAEC, 0x1E79790, 0x1D7F9CA, 0x24E7770, 0x1189322, 0xDD943, 0x3A9B062, 0x3C478B7, 0x3364BF8, 0x957BA2, 0x1B17D8, 0x32C6344, 0x85433D, 0x28DC62C, 0x2034207, 0x281068, -0x3EC91C1, 0x12BC377, 0x1F2EF4A, 0x1B25DF8, 0x1A8F0E, 0xB9316F, 0x1E8E9CE, 0x22EC72A, 0x1CA3138, 0x323716, 0x3A4AF3B, 0x1A4196, 0x1367C14, 0x922447, 0x81514, 0xB6C337, 0x352BEAA, 0xE751B, 0x26D44D0, 0x50E77A, 0x2822548, 0x146CA, 0x1CFC46C, 0x390697B, 0x24886, 0x1CAC17F, 0x2C3BE90, 0x1675A06, 0x2C06E71, 0x336A30, -0x2097626, 0x2CB0432, 0x98174F, 0x2A923A4, 0x2B204C, 0x1A17B74, 0xB254AE, 0x303B690, 0x2DAE488, 0x39C2E9, 0x597419, 0x5AE6CF, 0x1F76B92, 0x993C5, 0x6D9293, 0x2AE1427, 0xA5F57B, 0x13077B1, 0x2B7F86F, 0xF0744, 0x3E6A89D, 0x15F8996, 0x2C1033B, 0x394CCD5, 0x282FA2, 0x1DC772C, 0xE65718, 0xFF53A0, 0x17A24F7, 0x4F5D8F, -0x48023F, 0x3249FBF, 0x5AE4BF, 0x1075A9A, 0x596F22, 0x29B8A2, 0xF82BFB, 0x38423CB, 0x344194, 0x31018E, 0x575626, 0x3FA51, 0x20112FD, 0x1C1260B, 0x1241D8, 0x26781A7, 0x3559993, 0x22CE970, 0x2743CF7, 0x1B05F4, 0x3717E66, 0x13FF7D8, 0xE98A99, 0x29DAEA2, 0x416374, 0x3F1579A, 0x20B3B54, 0x2A4B8B0, 0x3EAD74A, 0x56781D, -0x65B318, 0x77BF46, 0x1F7815, 0x2103582, 0x64669B, 0x338E14E, 0x1BD2ED7, 0xB40E43, 0x1042A5D, 0x43D438, 
0x266BE02, 0xB3ACF5, 0x28E583, 0x2786B86, 0x6347D, 0x1E3EA86, 0x251C6CE, 0x1AE4697, 0x1872D2D, 0x2CF2CF, 0x25A6EE4, 0x2CA7AB6, 0x1768B7A, 0x2CA6B29, 0x12E75C, 0x30D1FFC, 0x2C44274, 0xE5AE65, 0x335ACC8, 0x71F9BE, -0x247EED9, 0xA24099, 0x58FDC8, 0x143295, 0x31D62D, 0x254C1E5, 0x2EFD48E, 0x1491EA2, 0x2E78FB9, 0x602BF0, 0x380AD4B, 0x29CD888, 0x1F5825A, 0xC9C77D, 0x2B6B1E, 0x2A0470D, 0x17BF61A, 0x354BDBC, 0x92E310, 0x5353C, 0x1E06BCA, 0x1F6CF3D, 0x25F7A3C, 0x23CAD3A, 0x288A1C, 0x2CE7CA1, 0x14BD784, 0x3C7DD61, 0x2D0D99C, 0x59D4C1, -0x21FCB95, 0x955394, 0x14401E0, 0x3C902B, 0x66D398, 0x1A4D9D1, 0x5BDACE, 0x2F62ABF, 0x72A530, 0x7FEA35, 0x9163BA, 0xADBCCE, 0x15EA3D6, 0x13CB9E2, 0xFC6B4, 0x1A7C669, 0x19100F3, 0x3193D5C, 0x243980, 0x2E4099, 0x1FA4C2F, 0x303EBC5, 0x36399B5, 0x276384A, 0x46295C, 0x25B331B, 0x12B758E, 0x18795CE, 0x1CDB131, 0x5FA7BD, -0x3D1AEF2, 0xD1C511, 0x173947B, 0x7088C9, 0x28004C, 0x23C5299, 0x210DF66, 0x27E8D58, 0x604B34, 0x2AB19C, 0x37802, 0x1C03DA0, 0x2B363AE, 0x2002D08, 0x1AD163, 0x2BC2139, 0x1FBCDBE, 0x3D1D45B, 0x3FC8772, 0x44BCDE, 0x1E11C7B, 0x1B6885C, 0xC1441C, 0x1E7DF13, 0x2C35EE, 0x1C13849, 0xA509D6, 0x50A485, 0x1BC42F8, 0x6492D2, -0x50F0D9, 0x153A623, 0x181BA6F, 0x2DB7588, 0x6A2DB2, 0x25E57A3, 0x2601ACA, 0x3D20F7D, 0x36CEE94, 0x57526B, 0x100E650, 0x39B2C7D, 0x334517C, 0x2C10BE3, 0x5D841, 0x18DE970, 0x29B1A, 0xB8EAA8, 0x166AF8B, 0x4F4B5, 0x19CE49D, 0x37F5C9, 0x3935C05, 0x2F7E536, 0x213839, 0x1306204, 0x496195, 0x305AB37, 0x22C67E5, 0x7D323B, -0x2B6C618, 0x1140AC5, 0x195B26D, 0x1A6AA25, 0x797170, 0x1B41872, 0x1F038DB, 0x21D0F86, 0x361B1DB, 0x13D601, 0x31B7383, 0x322C34C, 0x752DF, 0x3CF90BA, 0x185472, 0x310EBA0, 0x344F9CA, 0x1B9505B, 0x1D243DE, 0x519A38, 0x3AD2500, 0x2706CB4, 0x37338D0, 0x174B3C3, 0x45DA4, 0x62426C, 0x60655A, 0x1B09640, 0x3D66FF8, 0x728D57, -0x1A2C585, 0x267E1E3, 0xE10F9A, 0x798771, 0x4FC483, 0x394FE0, 0x180B315, 0x28AE6DC, 0x1AD9FA7, 0x48456, 0x293809B, 0x33C11D2, 0x22BBC5F, 0xE963CF, 0x71C0C2, 0x2FE6C43, 0x3EAF8D, 0xA7B40, 0xFABB93, 0x614C2F, 0xE1C6C1, 0x2A6034, 0x1DCC761, 0x1F22221, 0x1CE819, 0x2C2CB2B, 0x38727CE, 0x341EEAD, 0x1EEBCBC, 0x471AD0, -0x301121B, 0x2A0DF1A, 0xB59D67, 0xA07D67, 0x2A8E64, 0x33262CA, 0x1C0790B, 0x58052E, 0x1F7B394, 0x19E0A2, 0x3BDA6AC, 0x24705E9, 0x31F2B56, 0xC46E77, 0x43484C, 0x3AE49EA, 0x555526, 0x3AE5A68, 0x3768897, 0x43A2C5, 0x1700EEF, 0x3A64ABB, 0x856FA5, 0x1588E46, 0x58911F, 0x8C4DA9, 0x2E06879, 0x3BD648, 0x32F743, 0x66E6E3, -0x29D266B, 0x2882705, 0x3685F3B, 0x188C111, 0x20F7A8, 0x20C3D2E, 0x2D6AA86, 0x11F1D1B, 0x210F444, 0x366C29, 0x1CC9013, 0x1E2D90B, 0x23FB06C, 0x1938427, 0x27484A, 0x677464, 0x3AB2F29, 0xCC18F8, 0x6C7709, 0xB6CB3, 0x18F0FA0, 0x27213E2, 0x32A2DF6, 0x1D12483, 0x2DD426, 0x14C5C69, 0x4A865F, 0x18D6C09, 0x1FDAA63, 0x489ADE, -0x3203E28, 0x1DA7D1F, 0x2C97458, 0x8FF016, 0x124F41, 0x16D7D67, 0x24DBD2B, 0xFF83BB, 0x3FB53C0, 0x330954, 0x2F90EAF, 0x3994011, 0x1EBDC2C, 0x18B53E7, 0x7BF947, 0x6DFB83, 0xC5CA96, 0x3D962E9, 0x18AD044, 0x3C7A60, 0x7F0D93, 0x2C0E5C5, 0x10345DD, 0x1BF1A0E, 0x8E359, 0x3639BFA, 0x399FFD9, 0x232374, 0x846622, 0x19021C, -0x1D55EFA, 0x3D7413F, 0x3AA3002, 0x381E786, 0x5B4C6, 0x174C84C, 0x1E3A8EB, 0x277DE56, 0x209A962, 0x1C42F7, 0x2CAFACD, 0x119A2F4, 0x261CE05, 0x34FB790, 0x237668, 0xAEBB32, 0x11A98DD, 0xEF4EDF, 0x164C73C, 0x31EC8C, 0x35B5825, 0x30BA570, 0x1603955, 0x3A9A096, 0x27D8B0, 0x27D1EC0, 0x225DB90, 0x3D13B7A, 0x301C54A, 0x6B6CC5, -0xB244CD, 0x223C328, 0x189703D, 0x26A33FB, 0x1CAE, 0x293A7FD, 0x12CE87D, 0x2C96A84, 0x32DEAF7, 0x676C9A, 0xCDBD33, 0x2DAF578, 0x19B8631, 0x3703377, 0x29F289, 
0x2FB2EFF, 0x1F55D4D, 0x3059094, 0x19438FA, 0x1EB2CE, 0x28BF405, 0x26CE903, 0x2E8D213, 0x3ECE7D2, 0x4165ED, 0x270D6A6, 0x7B699E, 0x4F6E06, 0xCA3242, 0x2E3CC0, -0x56063F, 0x20CA7B4, 0x33D51EF, 0x33927A2, 0x6D4D01, 0x256D34B, 0x324BC5, 0x29E0011, 0x1385C20, 0x6DAD1C, 0xB477BE, 0x315B164, 0x2935584, 0x2B5497, 0x597E5F, 0x2B7D36D, 0x2099363, 0x29E600, 0x1526672, 0x3F5867, 0x1546A1D, 0x3AA277D, 0x3643D7, 0x1F668FD, 0x41754F, 0x2F879A7, 0x1E5A08B, 0x28723B0, 0x2566098, 0x1AB277, -0x3EF7923, 0x1F27C2E, 0x2097DC3, 0x63B0D6, 0x256EC8, 0x1DD5846, 0xB69702, 0x145C4A7, 0x187B72B, 0x51DF6C, 0x35CE819, 0x93F373, 0x9F1AEF, 0x2566B9A, 0xBA6BB, 0x17B3A9E, 0x1EF5969, 0xE4AE66, 0x2468A2A, 0x71FFD5, 0x3A7FADF, 0xC97E94, 0xF1306C, 0x261ACB7, 0x6667F2, 0x12A09E4, 0x1D469B5, 0x74A3EF, 0x10902E3, 0x517A10, -0x8183F5, 0x3377F63, 0x7F1D08, 0x31C6E05, 0x59237C, 0x395933, 0x253F462, 0x2F67FFF, 0x164B443, 0x538ACC, 0x3F0EB1D, 0x738E1B, 0x3E65AC5, 0x2045317, 0x69D42B, 0x1BFE8B9, 0x3B68E65, 0x2341A17, 0x23F7C16, 0x5DC6D9, 0xE8BE78, 0x134DD7, 0x3CAF23, 0x87B312, 0x31B585, 0x380222A, 0xB60501, 0x15C904D, 0xAF8FB6, 0x3DC18B, -0x2D70D2B, 0x3858E7C, 0x38D0A48, 0x29AF03C, 0x4FFD54, 0x2B7143B, 0xF196E9, 0x3A9D8AE, 0x7E45E3, 0x482EB4, 0x3AD4F1D, 0x2E2D3A1, 0x2B40240, 0x1080167, 0x6D8532, 0x275132, 0x17DDF91, 0x2349C13, 0x1A836AB, 0x626107, 0x42765C, 0x5AF6CE, 0x3D92353, 0x2E8EA2F, 0x246165, 0x1177A75, 0x35DD2F5, 0x23B61C2, 0x264591F, 0x45A2F, -0x14671A8, 0x2D648D6, 0xDB5ED3, 0x3696B35, 0x514FA, 0xA0F50F, 0x25FF0D6, 0x2A91E82, 0x373CA54, 0x7CD2BA, 0x29AC266, 0x1144C3A, 0x9790DA, 0xF07853, 0x26A0D4, 0x293A390, 0xADCA7F, 0x3CCBBB6, 0x2BFDFD9, 0x360357, 0x7C59E8, 0x120D747, 0x70B3AD, 0x3F61BE9, 0x570DAF, 0x3A3BD96, 0x12F9A63, 0x2FFCD7C, 0x3B0AB5D, 0x17E4BD, -0x2D7DC51, 0x393AA7A, 0x118E43C, 0x3835929, 0x58BA7A, 0x680555, 0x1331F99, 0x30B8E01, 0x14F3898, 0x3ABC9, 0x399BE57, 0x218831D, 0x2952A31, 0x28BA136, 0x2B258F, 0x23B2FFD, 0x23F7141, 0x2556DD8, 0x35E68EA, 0x17371D, 0x981299, 0x2E0DD5E, 0x2B2D927, 0x407642, 0x554552, 0xE54BD0, 0xC18863, 0x2235B45, 0x197FF6D, 0x59109B, -0x180A515, 0xF83128, 0x4F4866, 0x3907198, 0x41467F, 0xB73878, 0x306A937, 0x3517AE2, 0xB1B01, 0x19D3CB, 0x26745F1, 0x11F25FA, 0x43CAA1, 0x68C72B, 0x70DAC7, 0x2F87DFA, 0x4FB09A, 0x3B25B92, 0x3A7D839, 0x67F228, 0x38DF3DF, 0x166E33D, 0x2FA3BFB, 0x24A9095, 0x366877, 0x3177544, 0x39707C8, 0x1C90E1B, 0x6D912C, 0x786A9E, -0x3CA7F5B, 0xBD1CE4, 0x2B59417, 0x2D2E2AB, 0x62AE5B, 0x2631B6F, 0x2710C7E, 0xC6DBCD, 0x2CBB727, 0x1FBE20, 0xEE085, 0x360A7EF, 0x1D595FD, 0x56B717, 0x241DD3, 0x1F7DBB6, 0x1A23589, 0x12D4B4B, 0x3FB96FB, 0x595A82, 0x11880D, 0x14EB833, 0x2F76696, 0x3FBF100, 0x2B9E85, 0x3770A81, 0x1256D41, 0x731BB2, 0x15D7ECD, 0x5D20C, -0x2485AB2, 0x3988D0D, 0x3EBB9D9, 0x25997CF, 0x27012A, 0xC04FF7, 0x33FBD21, 0x3FBE586, 0x3303AA, 0x44A586, 0x3532E80, 0x3F9B8BC, 0x20FE6FB, 0x2EABCC8, 0x5ABEA, 0x3809678, 0x3487CB2, 0x3ED21BE, 0x4ADFAB, 0x2AA621, 0x210190B, 0x220DF3D, 0x18F8E29, 0x2F3BE45, 0x1EC8FB, 0xF96F6B, 0x3A6CEA9, 0xEF31EF, 0xA81A2C, 0x6A3B84, -0x3F827F7, 0x352DF35, 0x1C9792D, 0x2EFCFF4, 0x605175, 0x1F6D98, 0x2ED9067, 0x3310139, 0x3A65E36, 0x3A3AB2, 0x1D5DD0B, 0x2565072, 0x3CE0C5C, 0x18096CA, 0x34C6C7, 0x1BB8126, 0x1045692, 0x36190D4, 0x3AC476B, 0x7622CB, 0x24EF5AD, 0x2FFE4C5, 0x345D785, 0x1F4D572, 0x719164, 0xCE6E5C, 0x3C95713, 0x2E3E117, 0x22C3A6A, 0x581B44, -0x2225414, 0x13CF2CD, 0x68554A, 0x314E6F1, 0x790180, 0x3C6B96F, 0x19010ED, 0x10B470, 0x16CE8B0, 0x43CCCF, 0x14C3731, 0x2FCEBF0, 0x25741DF, 0xC6555C, 0x1C368F, 0x1070B5A, 0x30BB4ED, 0x3F6000B, 0x3758ECD, 0x332D8, 
0x9BD8F0, 0x12C6423, 0x39CE074, 0x18A36B2, 0x2D258E, 0x1C61E14, 0x16D2F6E, 0x2996BBA, 0xA568D3, 0xBCA12, -0x32175D4, 0x321319B, 0x3E78059, 0x10E1E42, 0x1A3BED, 0x1ABB5B0, 0x253D5DF, 0x33E4DF3, 0x398ECF0, 0x304777, 0x2C5A6A, 0xA753FA, 0x37A459A, 0x347A239, 0x72E421, 0x3312959, 0x188C0C4, 0x3BB869E, 0x2B633CA, 0x2DA03A, 0x3ECB0B6, 0x2361823, 0x3A6285, 0x39E9009, 0x343099, 0x29A8F63, 0x26D9D74, 0x1886BA2, 0x293DBC5, 0x3D2028, -0x2047D0A, 0x238B4A1, 0xE20F06, 0x175B1DC, 0x14999B, 0x385DA79, 0x1D29648, 0x1CD9D18, 0x14B0568, 0x78AEB5, 0x3E9C06E, 0x3733308, 0x1745482, 0x27ED52D, 0x7B18A1, 0x29A7A06, 0x3225BBF, 0x1C13036, 0x383461C, 0x2F2C2C, 0x25492C7, 0x366E7B5, 0x10773B2, 0x3942067, 0x649C7, 0x365E3CB, 0x2D9BA9E, 0x303FCDA, 0x10312AB, 0x49B15B }; - - -// The table below consists of four mini-tables each generated using window width W = 8. -// Number of point entries = 4 * 2^6 = 256 points, where each point (x,y) is represented using coordinates (x+y,y-x,2*d*t). -// Each coordinate (in GF(p^2)) is encoded using the redundant representation 23|26|26|26|26-bit for each field element -// in a+b*i, where the 23-bit digit is the most significant digit. Digits are stored in 32-bit words. -// Table size = 256 * 3 * 320 = 30KB - -static const uint32_t DOUBLE_SCALAR_TABLE[7680] = { -0x303E631, 0x28D3CE9, 0x2B5FE18, 0x2FC7540, 0x287460, 0xF90353, 0x398BDF9, 0x2CDEE02, 0xDE2E1A, 0xC3BA0, 0x937EDC, 0x3C3E62C, 0x55590B, 0x1E093C3, 0x740B7C, 0x3A01366, 0x48E448, 0x17A5B32, 0x2E4EA55, 0x4FFCF5, 0x3DA42BB, 0x2BF32EA, 0x17C6297, 0xDD55B2, 0x5948D1, 0x330684C, 0x2268E4C, 0x1F27A81, 0x1C828D0, 0xCAF2B, -0x3CF68C4, 0x1D5AC56, 0x26BA892, 0x1F26294, 0x5742F7, 0x1F89F9B, 0x296877A, 0x10F7340, 0x2BB9D7, 0x14EF68, 0x43A41F, 0x1DC3850, 0x3C9584E, 0x445B0C, 0x212C4, 0x24DC0E2, 0x1E479B7, 0x1D2835B, 0x214635, 0x5949DF, 0xED10DB, 0x38481D1, 0x11D36A0, 0x33A110E, 0x5A5183, 0x2FDBA50, 0x62C562, 0x20886F6, 0x1F91C38, 0x2CE203, -0x264BA3C, 0x27E8526, 0x4511F4, 0x354659C, 0x5F9876, 0x355586B, 0x41415B, 0x1726030, 0x155F63F, 0x20F1A, 0x6D86C8, 0x32C5D6C, 0x390DF4, 0x2F9FF96, 0x694FBC, 0x16A1B67, 0xCA529D, 0x38EC793, 0x2492D63, 0x9DBE9, 0x1F197B6, 0x3D100F3, 0xBA7590, 0x27F21E8, 0x1C0796, 0x2252367, 0x191DDC4, 0x3096C49, 0x1808BC6, 0x550897, -0x12E1775, 0x28D8791, 0x38FBEFD, 0x332B320, 0x7A0A0C, 0x3E5DC5F, 0x39E4702, 0xB93B07, 0x2D0632F, 0x24D9B6, 0x2117E03, 0x25C3CF1, 0x1595497, 0x1632E5B, 0x3986A1, 0x292612B, 0x161B39, 0x39D68F8, 0x369F937, 0x305CAF, 0x52914A, 0x70B819, 0x3079C1A, 0x303AD60, 0x7EF989, 0x364B099, 0x196C7DC, 0x1C6B3A7, 0x1B560A6, 0x4FEE23, -0x30E8419, 0x4257DD, 0x3C096F8, 0x1ADCE5A, 0x53BBD8, 0x32B4210, 0xAE9C9A, 0x78B2B7, 0x74A407, 0x625DDA, 0xCD2B3E, 0x16F1EC6, 0x13320FF, 0x231CD63, 0x55659, 0xFF50D7, 0x4917C8, 0x297E099, 0x397A466, 0xE7F58, 0x3758756, 0x1584DCE, 0x198A5A0, 0x24FE1F0, 0x6447BC, 0x34C7520, 0xC18130, 0x1BFAF92, 0x951D70, 0x6B2144, -0x27F9497, 0x3798B5A, 0x93CE93, 0x2761BD1, 0x212945, 0x64CFE4, 0xE531F1, 0x21B3456, 0x3FB0FD2, 0x612434, 0x4261F3, 0x247B77D, 0x23FF1ED, 0x153E782, 0xC6D38, 0xA7E4E3, 0x3454C62, 0x110CD3F, 0x2F7285A, 0x24691F, 0x3625C9D, 0x1D19735, 0x19F4BE9, 0x34DCFDD, 0x2AA61C, 0x326D62B, 0x355D8E8, 0x20BA824, 0x2794368, 0x1A0AE3, -0x2E4C3B9, 0x321207, 0x18C632D, 0x427374, 0x6C3687, 0x3BF95DA, 0x1C5C50B, 0x29C6E52, 0x7ED4B3, 0x67BFA4, 0xA01474, 0x135A822, 0x26264E2, 0x2B8FED, 0x49A6CA, 0x103191E, 0x3E3EAA4, 0x3062D67, 0x3D6A9B4, 0x674888, 0x2E85A99, 0x1CE0930, 0x33144BA, 0x34634D6, 0x406B2F, 0x2728AC1, 0x21EC6FA, 0x160EA70, 0x88C5EC, 0x11D2F2, -0x223A469, 0x11B801F, 0x31A2F89, 
0x3EAEF38, 0x22A196, 0x1216BA, 0x27B86F7, 0x2324530, 0x254E09E, 0x240FE9, 0x3AEB5C7, 0x32E26D8, 0x31B0F9F, 0x127B45A, 0x603B81, 0x2CF61FB, 0x61DB00, 0x148BB1F, 0x2BD84BE, 0x4A5E32, 0x29A8813, 0x246BB79, 0x3AA5FC4, 0x244DB94, 0x1AD937, 0x1E6C123, 0x1436C75, 0x12CA5DA, 0x3DFF8B0, 0x2F4014, -0x285C3F5, 0x19E475D, 0x1A9BE4F, 0x870DD1, 0x4C2185, 0x555F97, 0x1486BE6, 0x1A7B0C0, 0xA54F2B, 0x1462A1, 0x2452C1B, 0xAAD8F5, 0x3B870BB, 0xC7B262, 0x5783C5, 0x5DBC9C, 0x37BD4D8, 0x2719737, 0x2E4C3A1, 0x49F982, 0x35211E3, 0x59E432, 0x299E75B, 0x1D336EB, 0x45AD65, 0x2EC9851, 0xADCB7F, 0xC88106, 0x27311A, 0x45029A, -0x3F3A1F2, 0x90379D, 0x10EA532, 0x7AAA6B, 0x17BD29, 0x2F8A0A0, 0xB5FBF0, 0x378EE0A, 0x148085, 0x3A7412, 0x2CC90DF, 0x3EC25DA, 0x31F3B0D, 0x2DA269E, 0x7FD603, 0x300D6C6, 0x95E732, 0x9A3115, 0xED8C61, 0x634074, 0x20CDA01, 0x291CA43, 0x2F0BEBA, 0x29B54FF, 0x143265, 0x181E75A, 0x975BF6, 0x2F2A453, 0x330E561, 0xE9780, -0x28628DD, 0x1A3481E, 0x2841A4F, 0x2D4624F, 0x50D230, 0x3B504B6, 0x35DA692, 0x292EF3B, 0x18E5A4, 0x55975C, 0x3B8756F, 0x9EE897, 0x31FD077, 0x33E3B5C, 0x7FF86, 0x35A1F0, 0x1FE9033, 0x18FCEF5, 0x1C21D08, 0x70753A, 0x2B973C, 0x255388D, 0x1F86615, 0x23C6967, 0x5AA9D6, 0x1E44468, 0x3A7A7FD, 0x3D233B8, 0x535828, 0x2E7491, -0x176DB4B, 0x6E47B0, 0x2E6D14A, 0x18F5A6A, 0x55F91A, 0x31B6D27, 0xE08C9E, 0x2AFDF42, 0x11D7EB2, 0x2ACF1F, 0x1B58968, 0x1A6D1E, 0x1F77FD9, 0x3971291, 0x3BAAF4, 0x27AAAB6, 0x326AE62, 0x981A2A, 0x14063D4, 0x5466CB, 0x1BA3205, 0x2E89DDC, 0x3BE43E6, 0x337A86E, 0x31EA90, 0x557393, 0x105AD7, 0xD7D000, 0x1056944, 0x464CB0, -0x2FF2BBF, 0x21F481, 0x2736D02, 0x335FEAF, 0x2B9C8E, 0x2CAAB0D, 0x15B4E10, 0xA7B2B, 0x2DDD9DC, 0x46EA0, 0x2317310, 0x29EA227, 0x10CA113, 0x152FBDF, 0x5992A3, 0x10388BD, 0x36A53B5, 0x11543ED, 0x3DD9E0E, 0x52661, 0x19E28E0, 0x23B7DB8, 0x4E54C2, 0x3CB4BD9, 0x1D19C2, 0x335AB3D, 0xC85236, 0xF55D7, 0x52E0D6, 0x680C47, -0x380CCEC, 0x13CA0AE, 0x211CA37, 0x26F1DEB, 0x789E60, 0x19B45F2, 0x95DF56, 0x21B110D, 0x16E15DC, 0x1C548B, 0x543FDF, 0x3A9C9AD, 0xED27BA, 0x244B46D, 0x3C1562, 0x3E82082, 0x18880ED, 0x2900D63, 0x8F4F1E, 0x1414E5, 0x1D23A9, 0xD26547, 0xFB47CA, 0x178F3A0, 0x4DA426, 0xA9EF36, 0x7AEF2B, 0x180B798, 0x396D6FE, 0x4EBAC9, -0x9CB5A4, 0xB0773D, 0x340FABD, 0x3F09718, 0x3F54AC, 0x3D11CBD, 0x3BBFEAE, 0x1E8C202, 0x1F2CDA5, 0x67216B, 0x23FC9F1, 0x32F3E6C, 0x3AFBFF7, 0x37FDFE9, 0x2EEBEB, 0x111F85E, 0x15AFBE8, 0x102C711, 0x237D48A, 0x1B8FD9, 0x2519791, 0x23AEB58, 0x18ED6B2, 0x2583807, 0x6CF0EA, 0x6967D5, 0x5EF080, 0x30ADB46, 0x1972CF7, 0x323DA0, -0x1E24D9C, 0x21F41D0, 0x3CC2316, 0xAD4269, 0x2DB8F, 0x320527, 0xFE1649, 0x2D2B924, 0x3C075B9, 0x68C360, 0x3D5306A, 0x717A1D, 0x33A9235, 0x3F21717, 0x6F56CC, 0x3C4928F, 0x2594A0D, 0x39711B0, 0x155320F, 0xB3337, 0xC783EC, 0xC6F8B3, 0x64FE29, 0x252E823, 0x468296, 0xE2C6AC, 0x178D8D6, 0x11709F3, 0xCEC003, 0x1474B3, -0x3C96640, 0x1E49D5B, 0xC5E24D, 0x27FBE1A, 0x618FDA, 0x2FD9375, 0x3D6C496, 0x258B7F, 0x1F81103, 0x778DD9, 0x2219627, 0x3CC5221, 0x2811FBF, 0x3878A9F, 0x3417E1, 0x37B7BDC, 0x2566A23, 0x67221E, 0x3AE30F2, 0x3508C2, 0x11C430F, 0x3B37784, 0x2134827, 0x27EC1EA, 0x21BCB1, 0x32F5746, 0x7E942A, 0x18FAE0C, 0x2D3999, 0x401E68, -0x13693CC, 0x92EACC, 0x47A2CC, 0x4AE591, 0x20541C, 0x381C3CC, 0x25D6DBE, 0x17F7374, 0x3ECD138, 0x52905E, 0x18B5F9E, 0x1726D5B, 0x23EC79C, 0x1D74AE6, 0x3390BF, 0x15BF4E4, 0xE01F62, 0x2B517EF, 0x1690811, 0x281416, 0x26FE158, 0x33F424C, 0x22EC7F8, 0x13D325D, 0x3232FB, 0x194D25, 0x235B6B3, 0x5E8567, 0x3EC29D5, 0x6F7CAF, -0x23E7963, 0x2058DEC, 0xEF5BD9, 0x1F2E228, 0x691D7B, 0x2062914, 0x28C66B8, 0x285B10B, 
0x13E0A56, 0x6FB14, 0x3F62F8F, 0x18825DA, 0x16B480E, 0x1C7B1CF, 0x2A4259, 0x1D10B1C, 0x2AA79D0, 0x69B800, 0x22F4681, 0x230D7D, 0x28DFE8C, 0x2B38DD0, 0x366765A, 0xA5FD62, 0xFCAB5, 0x3AF7B8, 0x3A549A5, 0x1AE7CF0, 0x16DD753, 0x7D9091, -0x2259D6B, 0x1691855, 0x21B5745, 0x18DD333, 0x29BCC0, 0x287AEFD, 0x2C3B4EA, 0x1D58F2F, 0x2BC3953, 0x211A06, 0x23DE9BC, 0x325715C, 0xCA76C0, 0x3B49402, 0x6299B6, 0x14DFB18, 0x18F9E13, 0x1B307FD, 0x1366F07, 0x2CC93B, 0x5D13EA, 0x1F8B513, 0x11A0EBC, 0x23534F4, 0x3278E1, 0x2215F79, 0x38F7496, 0x16D349, 0x542CC1, 0x7EB2A7, -0x2A094CB, 0xF5F57D, 0x140505F, 0x35238CC, 0x2A3771, 0x96F009, 0x3CE7A77, 0x99208E, 0xDCE8D9, 0x12248, 0x1FD4D33, 0x23E4BF2, 0xA6CF75, 0x31B7F4F, 0x2339D8, 0x22673B4, 0x25965, 0x10548B0, 0xFAE676, 0x746FF4, 0x2422EFF, 0x3370152, 0x3F0C47E, 0x32099ED, 0x33D8F7, 0x21A42AE, 0x3802B24, 0x18E622F, 0xF4C7F3, 0x31E57F, -0x1C50869, 0x48C568, 0x3BAFBB9, 0x2C3E9FA, 0x4AC8CD, 0x973EDF, 0x75D298, 0x5D7054, 0xCD2CB, 0x723490, 0xADFA33, 0x1517DCC, 0x2C96F2E, 0x398F6D6, 0x224E44, 0x6FDEB9, 0x28F4017, 0x36B5FCB, 0x3995664, 0x2C93A4, 0x188D758, 0x1E835EB, 0x1CDD772, 0x859C6, 0x2E3310, 0x32C0254, 0x3BE26BA, 0x2FB47B2, 0x2DD1D62, 0x1F6DE5, -0x14FB321, 0x22411C4, 0x180D6AE, 0x269BB1B, 0x3D605E, 0x3A874D8, 0x24571C9, 0x112F18E, 0x2D7A742, 0x699088, 0x2056F10, 0xD11863, 0x20B3AF9, 0x37E0917, 0x1B9169, 0xF4C891, 0x230CF5C, 0x13C35EB, 0x3EC88AC, 0x1609DD, 0x1B366ED, 0x722174, 0x27B0813, 0x27672C6, 0x7BC3CF, 0x3C93968, 0x1D1E34B, 0x27F5D29, 0x15CE93A, 0x13CBB4, -0xD5986B, 0x1ED7319, 0x35DCDD3, 0x35F606A, 0x7ED3D1, 0x3973C9E, 0xD217C8, 0x11D7AC5, 0x174CCEE, 0x70567, 0x3186C1, 0x17484F1, 0x1F14ADE, 0x15ED2FF, 0x6A8BDF, 0x2717963, 0x3E22877, 0x3CE2A87, 0x82D466, 0x17F292, 0x395C610, 0xB5FEC3, 0xA7C7AF, 0x248EC51, 0x28D1D3, 0x32813E1, 0xF0F625, 0x32C18E7, 0x10318B9, 0x100B, -0x21F50DA, 0x39EA07E, 0x24484D, 0x3596276, 0x4FA391, 0x34B285, 0xD65BC2, 0x29CEBCC, 0x3582893, 0x4D4ACB, 0x1835A0D, 0x2A62E30, 0x1D0B97F, 0x238A640, 0x33ABCF, 0x1B3D082, 0x1CF465D, 0x148D60A, 0x290C96E, 0x60666A, 0x1284A39, 0x12B6DDA, 0x1B28AD5, 0x3444D82, 0x227A98, 0x26A3872, 0x387FF2B, 0x18C4A1, 0x12F59FE, 0x1E4EE4, -0x387D315, 0x131AED0, 0x7955A7, 0x302C6C6, 0x19428, 0xBBB055, 0x54F89C, 0xA615CC, 0x37C0371, 0x2B3CAB, 0x2924B57, 0x443009, 0x3217834, 0x217CB67, 0x2D30E9, 0x33389F5, 0x5A5E4C, 0x2417471, 0x1A86080, 0x53E3FD, 0x2E5864, 0x24F35E7, 0x2E82B13, 0xD79044, 0x58D929, 0x305B6D, 0x2269FB2, 0x337A869, 0x13BB8A3, 0x42A8FE, -0x1591901, 0x84BBC0, 0x31B974E, 0x1E80E5E, 0x327791, 0x1687544, 0x32F9B8F, 0x2FB67BB, 0x1C07426, 0xB8957, 0x503668, 0x2F23B9D, 0x3486CF, 0x16B6A7E, 0x48A992, 0x22D0F4E, 0x115D4EE, 0x3D66570, 0xE19888, 0x7D69CA, 0x117271F, 0x153388A, 0x11DEC70, 0x3844CED, 0x41BCE1, 0x1EC35E, 0x2B90B7E, 0x3CC33A3, 0x3D0B51, 0x7EAADA, -0x8A57CC, 0x4E3C41, 0x1A8F13B, 0x2F5FA45, 0x64F98A, 0x2A0C732, 0x657AC5, 0x34D27AF, 0x29E4758, 0x11BE81, 0x30F61B8, 0x237D1D0, 0x100497D, 0x2CE049C, 0x767C7, 0x3940AA6, 0x1244DBE, 0x2DBA3E9, 0x1033655, 0x3BDEE3, 0x11D2602, 0x3B13FE, 0x19DBB25, 0x15351FD, 0x4CDE24, 0xD978CB, 0x39D254C, 0x1835AF5, 0x846753, 0x5A8E2F, -0x20744DF, 0x32D5096, 0x1055DF1, 0x29EFCD7, 0x3D3B08, 0x1E4719C, 0xD7A0CB, 0xD42C63, 0x1F82455, 0x6EB8D9, 0x3D20DD9, 0x28FE378, 0x33DA2F6, 0x3F49D88, 0x13F23C, 0xFA41C, 0x1AE037F, 0x17D7B4A, 0x100EB3F, 0x58D876, 0x38E139B, 0x10881E2, 0x94D2AD, 0xAAEEBD, 0x73DBEE, 0x1ECA3C8, 0x9D6224, 0x93809A, 0x3C45E2C, 0x6EF9A9, -0xC637DA, 0x3A7B2E4, 0x1A7CFC7, 0x17F042C, 0x3A0434, 0x2F9FF1F, 0x13A72D8, 0x33D8C02, 0xEA92A8, 0x6C4F9C, 0x15AC1F0, 0x35A656E, 0x3A1B049, 
0x14655D8, 0x224384, 0xAC276, 0x19A8C5C, 0x1905A14, 0x48E8C9, 0x600FB7, 0xD35A24, 0x2469928, 0x31089D3, 0x1541907, 0x3B093B, 0x12E221F, 0x17796FF, 0x1B1E227, 0x119658F, 0x25F5E7, -0x3E7FB84, 0x8041FD, 0x1A8E3E2, 0xEF06E1, 0x6F06A2, 0x2EB0E48, 0x263467D, 0x1FDAB41, 0x1876B51, 0x5DC117, 0x2B52A0D, 0xF924A, 0x3528BA3, 0x31CF6A7, 0x127C69, 0x2C790BE, 0x16DC33D, 0x1A5CD3A, 0x3142067, 0xD72B0, 0x2C2CDF7, 0x3E4358B, 0x2DAF193, 0x33F13D1, 0x67F7D0, 0x2F380EA, 0x3020F54, 0x3F4D7AE, 0x36928A2, 0x7C0A1D, -0x8CBA7, 0x3483F98, 0x159546F, 0x2450457, 0x7A588C, 0xF45F78, 0x74F6BB, 0x30368FB, 0x302539E, 0x851DA, 0x232A892, 0x29DB8A, 0x1B2FCAE, 0x4C8B77, 0x104F86, 0x21F9006, 0x360791B, 0x912B79, 0x35E8926, 0x1E4D28, 0x34B89BF, 0x5D74E5, 0x15C2AF3, 0x3E69A71, 0x613D00, 0x265226F, 0x1A20FA3, 0x2F0523F, 0x35971B7, 0x72F7E, -0x338E243, 0x3990EE, 0x1B15669, 0x29F2462, 0x1A81C4, 0x24749AE, 0x1B459A7, 0x190C105, 0x29D063C, 0x137F2, 0x2699D16, 0x649E5B, 0x237CED3, 0x2D170E2, 0x3ED76D, 0x1A88243, 0x21B451D, 0x316E78E, 0x2B31256, 0x45985A, 0x28F1030, 0x1720823, 0x278247D, 0x1A2D343, 0x6DBE5F, 0xF7F26B, 0xF4060B, 0x2DBD08D, 0x3385CBE, 0x64C375, -0x20C2875, 0x3DB6CE8, 0x1250BA0, 0x294C90, 0x57E1D9, 0x1F8E63E, 0x150CFF7, 0x2D9B031, 0x1438DF6, 0x333447, 0x269B016, 0x3034A2B, 0x154762C, 0x3D8403, 0x435FE8, 0x29D3DF5, 0x12BAA19, 0x14A9587, 0xFE0993, 0x3B9691, 0x31314B0, 0x19F5AE, 0xCCF738, 0x9331BD, 0x48CCCF, 0xB777AF, 0x38AEF5A, 0x1D666F5, 0x1EEA58D, 0x34C2C3, -0xFDBE5B, 0x54D240, 0x1D43D73, 0x25EBF84, 0x4E4F9D, 0x11B73D7, 0x104850D, 0x25AE81B, 0x2B446, 0x1D48D1, 0xE2B151, 0x3B9D98A, 0x2FB82A4, 0x24061DF, 0x34902E, 0x3D45394, 0x2158683, 0x1AC0B5A, 0x3DB5735, 0x44317A, 0x1068BAD, 0x3091A7A, 0x1022354, 0x9D872B, 0x771FE, 0x12D0F2B, 0x1DBBA08, 0x1495FDA, 0x31B510D, 0x76CDEE, -0x3E825FD, 0x2615D6C, 0x3A1B55C, 0x17B575C, 0x298332, 0x3F466E7, 0x31313EC, 0xBB0563, 0x2904CCE, 0x731B0F, 0x3A05240, 0x1467295, 0x3351DEB, 0x26D713D, 0x7A7E90, 0x12AB28E, 0x1F054F7, 0x30B3EFB, 0x21977B8, 0x11CA1C, 0x19D7DE1, 0x32A0D23, 0xF9A013, 0x36BBBA3, 0x575E0B, 0x83AF7F, 0x3262884, 0x2EE3464, 0x3617084, 0x683DDC, -0xD02019, 0x7C2AD3, 0x24C8117, 0x71FD35, 0x22C7E0, 0x32E2ED3, 0x3B03BCF, 0x76C972, 0x3098469, 0x623F83, 0x1AA25A1, 0xFC5AFA, 0xE0B99B, 0x151A264, 0x2D3EBC, 0x2A4D5F2, 0x3EEA151, 0x6E35D5, 0x24674A6, 0x4716E6, 0x3F5D6C, 0xBCAF06, 0x14BE3AB, 0x34E4433, 0x5F6257, 0xF94F2B, 0x31BCA9E, 0x24F5341, 0x2E2941, 0x6EE839, -0x3CA8E7, 0x1902E6E, 0x3E3D9D8, 0x349119A, 0x33C5A, 0x30DFD59, 0x335A36C, 0x33F66F6, 0x703E43, 0x52AA6B, 0x3403646, 0x2F35325, 0x1FB0FE7, 0x32582C1, 0x11AB3F, 0x35896DA, 0x212DDD5, 0x2137245, 0x3AE4CB6, 0x427F8D, 0x5306F0, 0x28A32D4, 0x20E928, 0x1BFA18E, 0x4AE91, 0x1B9CD3F, 0x2AA6244, 0x1145AAB, 0x2EA6536, 0x59E588, -0x2EE9E9F, 0x3C6BEAF, 0x3F8E9B8, 0x3BCD1DB, 0x4FFC7, 0x137B146, 0x3D4F3A6, 0x448E9C, 0x2A00549, 0x73FE42, 0x3BBAAAD, 0x2F68F3C, 0x4A4224, 0x15B5662, 0x5FA850, 0x345726B, 0x3BAB520, 0x36D28E6, 0xDB505C, 0x92309, 0x1AD6BE7, 0x27AD634, 0x1EAFE67, 0x26D4C2C, 0x8BB75, 0x60E24B, 0x23AD49E, 0x3278968, 0xB6BE6B, 0x13704D, -0x3906F14, 0x1CEB1C6, 0x1EB5D92, 0x3EEF503, 0x57EE05, 0x29E61EB, 0x223866E, 0x1699B77, 0x3707176, 0x7967B6, 0x30127C0, 0x10FF08, 0x119C36E, 0x262ECB5, 0x2A7165, 0x14D1B07, 0x1ECA103, 0x2A3FC01, 0x1227DD5, 0x1D3BFA, 0x318D36, 0x1CEAFC9, 0x225D4AD, 0x3D4F849, 0x1915E6, 0x1F89084, 0x269F250, 0x22FCB21, 0x1EA3D1D, 0x228008, -0x3D1B4E7, 0xA00D17, 0x3D798EB, 0x3B95DCB, 0x55B8D4, 0x22CE2E1, 0x18E9E1F, 0x3885C9E, 0x2B6F6A4, 0x685741, 0x1A3B491, 0x20C2AC4, 0x2428FFB, 0x87E4D5, 0x7E8911, 0x2629078, 0x2BA89C6, 
0x2639C03, 0xCA4C28, 0x71C459, 0x3F81795, 0x3C64A9A, 0x2DF1E7D, 0x23C60EA, 0x704AEE, 0x27A63D7, 0x36D56A2, 0x33C06D, 0x21D8FCC, 0x52556D, -0x30C33A7, 0x2D1631B, 0x395AB76, 0x21F0D8A, 0x28666B, 0xC27B9B, 0x2B95D69, 0x2DFD365, 0x44158A, 0x36EF35, 0x127F9C7, 0x1577636, 0x2C9E899, 0x1F5A961, 0x526E78, 0x3F988C1, 0x382F13B, 0x946762, 0x2D2D6B9, 0x6C9523, 0xCFE95F, 0x2A425A2, 0x2DA3E90, 0x322CFFE, 0x658A7D, 0x258520F, 0x522E9E, 0x11CFBEE, 0x1F615CF, 0x681900, -0x1BE9C5, 0xED7B05, 0x3C2775D, 0x2B98D3C, 0x4BC236, 0x330E894, 0xBEA6E2, 0x2CBF119, 0xF87424, 0x4129D4, 0xBBEA45, 0x301A156, 0x2534FCA, 0x21EE045, 0x513E8D, 0x155ECD4, 0x1E85E68, 0x172F537, 0x795515, 0x6C9353, 0x9AAD91, 0x37E06E8, 0x2250727, 0xE76F25, 0x75271, 0xBA172A, 0xC82C76, 0x9B150, 0x17D8286, 0x2281E8, -0xEBA6AF, 0x12DF549, 0xCA8716, 0x58FFA9, 0x50D387, 0x2AEBE78, 0x3785F58, 0x10DE90, 0x2E8A301, 0x6AB369, 0x295D138, 0x1F8C56, 0x3EFD17D, 0xD4914F, 0x58B496, 0x101B92, 0x28136EF, 0x1B0EB87, 0x3EDD5FA, 0x40A8F0, 0x16E64D1, 0x22D229A, 0x17D7214, 0xE8D430, 0x4E004A, 0x6B563E, 0x9FA7CE, 0x23C5179, 0x351078F, 0x29DA9C, -0x3417213, 0x34B70D5, 0x2644883, 0xFFDE18, 0x2E9465, 0x453DF1, 0xDEBD52, 0x393B53A, 0x2CF0C02, 0x4475D, 0x15E7204, 0x17E9360, 0x3CCD2D6, 0x2DD30B0, 0x231A2D, 0x34D97CD, 0x1CD3033, 0x11221FD, 0x30417DB, 0x32D255, 0x1EB07B0, 0x13F6480, 0x2F60BB7, 0x7206B1, 0x12E33F, 0x2A97072, 0x2990E6F, 0xBC1FB9, 0x1F6E737, 0x52E14B, -0x2E374CB, 0x2B06A46, 0x224637, 0x30D6B72, 0x1C8622, 0x24B7D33, 0x21B1429, 0x1893EB7, 0x29C8CA9, 0x362823, 0x8D472A, 0x36BF29A, 0x37C4F22, 0x3808DF, 0x18598F, 0x3F4BCF1, 0x2125EFF, 0x1B7F97B, 0x2C9EA71, 0x7ABF4C, 0x33D95D3, 0x1F1138E, 0x3A43EA4, 0x38F8609, 0x58728F, 0x1F902C8, 0xDA069B, 0x1E797FD, 0x2F7104F, 0x6DB1DB, -0x2C3007F, 0x3BFFB46, 0x3CABBC4, 0x11D1221, 0x7F31A5, 0x3D2519A, 0x166D3E2, 0x1DBE65, 0x316C309, 0x18A78E, 0x15D2030, 0x40A16C, 0xFF4F6E, 0x3A8866A, 0x5C1323, 0x1116B47, 0x2DBC837, 0x733134, 0x36F2266, 0x5D0ABD, 0x302E151, 0x264E4E1, 0x2D5DA3C, 0x2D253DA, 0xAB6AE, 0x21815E6, 0xED2FB6, 0x2FDC8CF, 0x8C0230, 0x546CE3, -0x38DBE39, 0x3AD69A9, 0x3DA4A10, 0x2A2E115, 0x26D2E8, 0x117D806, 0x332F8C5, 0x162C026, 0x12E04BD, 0x2A3517, 0x17A374D, 0x1C12677, 0x5E557D, 0x146B34F, 0x3368F9, 0x101062C, 0x2C9457E, 0x377490, 0x39F0C57, 0x316109, 0x29535CF, 0x83AE95, 0x170932E, 0xE7599F, 0x496A8C, 0x4D5E0F, 0x2025B71, 0x3B0557, 0xB39CE4, 0x608A16, -0xB37CAB, 0x399614B, 0x9A96B2, 0xB9BB58, 0x75B09A, 0x2602455, 0x212CC20, 0xB947AC, 0x2D653A1, 0x7690CB, 0x11973FB, 0x176B654, 0x6D0FC8, 0x2C23240, 0x738A74, 0x1D5883E, 0xCFF24E, 0x9F4832, 0x22D76CF, 0x7FBFC0, 0x35F7749, 0x3924FE, 0x37C181A, 0x3DF9A69, 0x2C255E, 0x1CC656F, 0x3C0B982, 0x3CEC234, 0x2E587C, 0x5960CF, -0x3B1F17A, 0xA5008D, 0x6ACAC7, 0x228A751, 0x434E03, 0xD8B0C6, 0x2824380, 0x73ECA6, 0x900079, 0x1F1AAD, 0x33399FE, 0x12DB704, 0x6076D6, 0x107A775, 0x899BA, 0x1BBF5DF, 0x242CFC9, 0xE11CA5, 0x1E2C363, 0x572179, 0x26DE9AC, 0x132C4F6, 0x375DD6B, 0x33D593, 0x3C8852, 0x1A7757F, 0x3EF541D, 0x2161649, 0x2E43F9C, 0x3F2593, -0x1DCC9A8, 0x3B94FA4, 0xA92E1B, 0x373931D, 0x10069, 0xAA855C, 0xE5A25E, 0x3D73EF8, 0x2060ED4, 0x6CD884, 0xE403EB, 0x37D6584, 0xA7F0B3, 0x3511860, 0x713225, 0x19850E2, 0xC44B31, 0x330CCC2, 0x2D9C28D, 0x105796, 0x22FA4CF, 0x1FD3B1E, 0x3253A14, 0x1CB5D6, 0x32DA1F, 0x285F560, 0x1D157, 0x1D604E7, 0x1A9DC7F, 0x76A537, -0xC699F7, 0x2D3EAF7, 0x2D9647E, 0x18CD31B, 0x4E45DB, 0x372546F, 0x19BCAEA, 0xDEC360, 0x19564FB, 0x4F480, 0x3BB5601, 0x3009E4F, 0x25263FE, 0x189AD92, 0x122F74, 0x3B4DBD, 0x3D9BFE, 0x3F1C21D, 0x3B7991, 0x137061, 0x8172C9, 0x20890AE, 0x1ADE57B, 
0x2B7F719, 0x527DCB, 0x5C93F5, 0x326810E, 0xA6C5E9, 0x2794952, 0x64D1CF, -0x65C994, 0x1CCCDE, 0x1615BA0, 0x13942F3, 0x633EE1, 0x251095, 0x8A3B1, 0x308DF84, 0x204AA63, 0x49BB96, 0x2F96678, 0x15F4108, 0x20E182F, 0xEFFF9F, 0x6D7E4, 0x12AE863, 0x43329D, 0x3E38339, 0x1F19821, 0x4D46E, 0x27017C3, 0x24D7105, 0x9ACF14, 0x3D1340F, 0x3F22E2, 0x2CD6164, 0x258913C, 0x24DBA61, 0x339453E, 0x15A2B4, -0x2BBD0C4, 0x681312, 0x219D519, 0x183B322, 0xE7633, 0xA673C0, 0x160F061, 0x33E7FEF, 0x29F1D0, 0x75C2F3, 0x3C7C099, 0x1E95551, 0x3A83E94, 0x3913DB9, 0x245C7A, 0x2BEC2D4, 0x199BE79, 0x1AC867A, 0x8BE991, 0x5DE0B9, 0xB72E22, 0x2CF9297, 0x3A8EDD9, 0x3215EB6, 0x139C2, 0x23AC231, 0x3ADA3A1, 0x2DA0A7F, 0x3BE23B0, 0x501381, -0xF40B01, 0x231A91C, 0x3062B2B, 0x2F6E0D8, 0x51D65, 0x2D65714, 0x2410529, 0x1F0C4CE, 0x1497467, 0x1E510B, 0x1374CF6, 0x39C8FD7, 0x21F0569, 0x3F4E3F7, 0x4BFE02, 0x10F9AA2, 0x1167AF1, 0x1E32AE7, 0x2C5C37F, 0xF7E2C, 0x27407CB, 0x3688C9A, 0x18423C3, 0x2A17FF4, 0xCFC50, 0x285C3C8, 0x2CD3217, 0x333362A, 0x1912ECD, 0x22B4D9, -0x7C2D98, 0xC4ECF6, 0x133F57D, 0x72E924, 0x4F432C, 0xBEFC0C, 0xF447E9, 0x312E616, 0x48BF77, 0x1AB94E, 0x2FC85A8, 0x324D62B, 0x9B1FB7, 0x23E0765, 0x5B2006, 0x145C810, 0x3B5BFC8, 0x2D00CF8, 0x300864C, 0x5794AF, 0x36E35D5, 0x261EB66, 0x203E5C8, 0x2E0DBF6, 0x6BB1F4, 0x23EA1D7, 0x3C7FB52, 0x27BF794, 0xA37AC3, 0xCF6D1, -0x1878CF5, 0x30F8719, 0x200E54E, 0x1D8EE88, 0x2811, 0x1E77B71, 0x3645C14, 0x1E7F382, 0x2F404EA, 0x49E00C, 0xCF019C, 0x15DBA69, 0x333ACCF, 0xA1E5C0, 0x4B4A66, 0x1EDFC1F, 0x85A245, 0x1685F77, 0x3B2991E, 0x278EB5, 0x341CF06, 0x1722E73, 0x39DB8A9, 0x31E09B4, 0x6E58C9, 0x1F2FE75, 0x3846622, 0x18D4478, 0x6646F6, 0x73ECD2, -0x3BB9502, 0x1D47FA7, 0x2E9526E, 0x1C6C18C, 0x29825B, 0x38BB5C5, 0x1A3E5BB, 0x2004216, 0x2794F26, 0x2F2A89, 0x345CE8, 0xCA4BB5, 0x1DEC280, 0x1571F19, 0x727310, 0x36FE9A5, 0x2B2A712, 0x16713AA, 0x33B6A2F, 0x6228D3, 0x3C48EAF, 0x38B17BF, 0x1AC9773, 0x2783FAA, 0x17AB1, 0x2C8D766, 0x2784125, 0x30F960, 0x3E6CC0B, 0x121E89, -0x3E96480, 0x1F40282, 0x1BAB4E8, 0x5C2EA7, 0x9BD8D, 0x2CD4E52, 0x15BE51F, 0xF5BC67, 0x2EEF334, 0x2C9E40, 0x1FD812E, 0x16DDD9A, 0xD9E42A, 0x160EC20, 0x66ABA9, 0x16E77C1, 0x177E674, 0x2509EE5, 0x1717BFC, 0x4CC00C, 0x2B7C16B, 0x1357882, 0x3EFB8C8, 0x258613F, 0xAE5C, 0x2DCEF54, 0x17A40D1, 0x3B72B29, 0x28E770, 0x5D1BDA, -0x9B5EF2, 0x24B5CAA, 0x2C0E75F, 0x2327FE6, 0x259D99, 0x2C433C1, 0x33EDCA9, 0x36048A1, 0x752759, 0x23F5B7, 0x14F62EF, 0x363CC24, 0x2CA9478, 0x4D4863, 0x8FE61, 0x2488C4A, 0xB3A6F1, 0x3E2E4DA, 0x3F4D4A0, 0x15F1EA, 0x42EA0F, 0xAF8FAF, 0x2902C2D, 0x14E75BB, 0x2A5216, 0x3BDEEB2, 0x26414B9, 0x290CA1E, 0xC7B1E0, 0x3A8F26, -0x2BFBFE4, 0x1462A0B, 0x3DDCB71, 0x1C711BA, 0x247006, 0x2221F75, 0x14B5646, 0x3D926EF, 0x136DA99, 0x479461, 0xBAB607, 0x3D723B9, 0x35D5761, 0x23C8D86, 0x31D9DD, 0xDB3B29, 0x1564EFA, 0x22601A4, 0x197AC45, 0x7F06C3, 0x12EB65B, 0x1699B9, 0x2C189D3, 0x2D584FA, 0x5EDCFC, 0x14ADB75, 0x13F4A05, 0x1742EF3, 0x15D4EE7, 0x790F80, -0x1DFB797, 0x356B095, 0x8DB6EC, 0x36D766A, 0xCBE14, 0x2815528, 0x21B17BE, 0x1887C1C, 0xCEE967, 0x2C6361, 0x1F52297, 0x350B0B6, 0x2BD3C75, 0x30878B, 0x4BD354, 0xDE6903, 0x1F37DE4, 0x230A32E, 0x260DF4F, 0x1AAE3C, 0x1436F09, 0xA3927, 0x2D1CEED, 0x8D47BF, 0x779AE1, 0x19797A5, 0x51561F, 0x7B6E0, 0x133BA69, 0x25156E, -0x2C57119, 0x3F60BCA, 0x2ED2AC2, 0x1BC7460, 0x7F8C02, 0x3241611, 0x30363EC, 0x145AEAC, 0x1974B5D, 0x5968DB, 0x1121DBE, 0x9611AC, 0x39CF7D5, 0x35EE00C, 0x57949F, 0x1B66C01, 0x3FA6FB2, 0xEFA471, 0x15FC49A, 0x5C2700, 0x3F656E, 0x24918C2, 0x2CF9CE0, 0x737218, 0x16E824, 0x206B312, 0x2CAEF2A, 0x3416B7C, 
0x1AF6218, 0x3C2593, -0x4A3827, 0x232A59B, 0x263E19B, 0x2B7F487, 0x1AE43B, 0x1A6AC6A, 0x3400AE5, 0x29971DF, 0x1FDB663, 0x4708E2, 0x31AC7D, 0x358C8B7, 0x1550B5F, 0x36685B7, 0x53BAF4, 0x63960C, 0x2A8BA95, 0x254F025, 0x31E8F3E, 0x5B5B33, 0x2E96B90, 0xA1E19B, 0x1C4FDBA, 0x3CFCE9B, 0x4748C1, 0xC154B, 0xFB016A, 0x3B96233, 0x1756E67, 0x4A4774, -0x26ECDED, 0x156C18A, 0x3170449, 0x3AA4057, 0x7791FE, 0x12FC35A, 0x2FD80F4, 0x26736, 0x2F294E2, 0x2632AD, 0x1697C4F, 0x3931238, 0x232D956, 0x3FA1803, 0x4EE9AD, 0x1A56406, 0x1010A83, 0x3353A58, 0x3D2FD15, 0x34A3D7, 0x176FAB, 0x3F53F80, 0x1F4E8D4, 0x3A15650, 0x15321E, 0x1F340A4, 0x1C07BE0, 0xBA8670, 0x1862BB4, 0xC7D7C, -0x1BFD9D6, 0x20C4C76, 0x2C65732, 0xE78C6E, 0x34935A, 0x2E8B991, 0x33EEF2A, 0xAC6466, 0x138611E, 0x250DD5, 0x240DE4F, 0x391B147, 0xE04659, 0x53FB14, 0x618EA0, 0xB877C, 0x13620C2, 0x4F7FE6, 0x2ED9A23, 0x572CAB, 0x2834146, 0x205D24E, 0x120A2C, 0x121D8DE, 0x6CD73, 0x336A849, 0x382DC5C, 0x13C4E3D, 0x146476D, 0x36942F, -0x2BF4D88, 0x26B7D9A, 0x3A96A3F, 0x12E021F, 0x2A9A14, 0x38A5B4F, 0x27FBF2D, 0xBABFE4, 0x2185A4A, 0x1BE40A, 0x121F7AA, 0x24065D5, 0x351007A, 0x29A0DA4, 0x1FC66E, 0x242380E, 0x3C0212E, 0x7374DB, 0x256E463, 0x1F3744, 0x380A00A, 0x11A55A8, 0x2A07B83, 0x1D9BF36, 0x1346F4, 0x6312C1, 0x2D5A275, 0x1316B4D, 0xC9D370, 0x775E7F, -0x32341C0, 0x2620A3C, 0x6DF078, 0x28CECFA, 0x144390, 0x1E9C5E4, 0x3181338, 0x87370B, 0x3780088, 0x127652, 0x1F50D45, 0x12F19A7, 0x21F4287, 0x2145405, 0x236F45, 0x18214E2, 0x293FF4C, 0x17453BF, 0x1859416, 0x7CC92A, 0x10EF72A, 0x383A482, 0xC3B2FA, 0x356713F, 0x26676B, 0xD1D447, 0x300C25D, 0x6E7220, 0x21FA61A, 0x664558, -0x760DDC, 0x12E3936, 0x2B63416, 0x1BE103F, 0x5517A8, 0x371CADE, 0x10B1B24, 0x20B9D9B, 0x33BB305, 0x3A7F03, 0xAE366C, 0x21B593, 0x2533DD4, 0x30C9E10, 0x1B6290, 0x3A0E82B, 0x3BF34A9, 0x131E144, 0x972843, 0x166219, 0xE208F1, 0x3344C46, 0x54EA9D, 0x1FB72C7, 0x5A90F9, 0x749D99, 0x11CCC72, 0xB5580C, 0xD5A6A0, 0x6F061A, -0x22A15C5, 0x2AFD867, 0x5F40F6, 0x2604889, 0x29106C, 0x1C9B47F, 0x2C4261, 0x3410586, 0xDE7857, 0x4F379A, 0x28E1678, 0x1BD177D, 0x283C2DD, 0x19EB66C, 0x2C4751, 0x12A6A3A, 0x1EA8025, 0x136523B, 0x9A9031, 0x5532BC, 0x196CE22, 0x2A2F8D, 0x256FA5C, 0x1EA6BBE, 0x4FA312, 0x173E67F, 0x3A10EAA, 0x3854944, 0x3516BA1, 0x3C7727, -0x1117EA4, 0x28B399D, 0x3B9C48F, 0x12B4EE, 0x7BCA8E, 0x26F88E5, 0x10E791C, 0x2286D57, 0x29A80EC, 0x3337D3, 0x2005953, 0x185CA40, 0xA1B0B, 0x19CD39B, 0x514BD7, 0x2D7C6B4, 0x3E24D3B, 0x375EF97, 0x33B9FC6, 0xABE13, 0x20E54A7, 0x2041E84, 0x11DF6C8, 0x1B5E9B8, 0x634F96, 0x1B94097, 0x1314C42, 0x373A504, 0x319548A, 0x68D49F, -0x444666, 0xA57363, 0x222469E, 0x17F6666, 0x542C4C, 0x35DA76F, 0x3E25062, 0x2CB413F, 0x1E1AA1F, 0x7133FA, 0x16402B4, 0x2024991, 0xDCC2F1, 0x29E4C71, 0x52DDAD, 0xC71590, 0x3C3484C, 0x18266EA, 0x28BB08C, 0x14EC2, 0x3B34DD6, 0x16D8510, 0x3219AC0, 0x3EAC462, 0x157ACB, 0x2D01099, 0xBD2E12, 0xEB8E4E, 0x1368A68, 0xABF4A, -0x19EAB1F, 0x14AE16D, 0x29ED5F8, 0x311C9A1, 0x1BD259, 0x3790A15, 0x19764A1, 0xA00CE5, 0x10AF3D6, 0x17A484, 0x381363B, 0x1465488, 0x207A01E, 0x1E9C436, 0x2336D0, 0x714CB2, 0x2FCBEF7, 0x38A9CFE, 0x11D308E, 0x2F7A51, 0xE3716C, 0x2CA81F5, 0x180677D, 0x34B4282, 0x40E8D8, 0x1401AE4, 0xD8F39B, 0x32FD644, 0x3811C05, 0x53F9CA, -0x1DADC53, 0x25BB363, 0x169658D, 0x13CF5B, 0x150282, 0x439CE2, 0x282485, 0x3E566F4, 0x1C48DB0, 0x738C53, 0x28BF9FC, 0x21FB9E8, 0xFEE64F, 0x1126A04, 0x4F1899, 0x321E376, 0x2A57D2F, 0x16BCD0A, 0x90A0E1, 0x6170CC, 0x364FF75, 0x31249F5, 0x20129DF, 0x158FE8B, 0x227EA1, 0x22CE087, 0x34D9958, 0x285EADD, 0x3A81E94, 0x473D3B, -0x12B0EB2, 0x261BB83, 
0x18EDC0B, 0x31D0374, 0x78E584, 0x4FD508, 0x36FCC28, 0x2DBBD5A, 0x16AD66F, 0x1C6AED, 0x1DB5BA1, 0x17F32F, 0x231925D, 0x1ACFE25, 0x4A58FB, 0x18144FA, 0x3D8D0FF, 0x7E5DB2, 0x379B56C, 0x46A445, 0x3E9E1DA, 0x2819A13, 0x1C9FF67, 0x1648EC5, 0x57B251, 0x20A2A0C, 0xE65020, 0x1AB5504, 0x2C9BC13, 0x62F4B9, -0x1B033D, 0x3A98074, 0x3FFBE79, 0x5DBC43, 0x9BC6, 0x7ED39A, 0x2FFCBE4, 0x1400333, 0x27989B7, 0x253D0A, 0xBEF06A, 0x2EFB73F, 0x342D7A9, 0x1E96CE4, 0x2D1B6A, 0x124CC9A, 0x3ED18A8, 0x1CB0BAD, 0x2760B05, 0x2E8CDE, 0x2F1B2A1, 0x3E06F35, 0x12947C3, 0x23E901D, 0x4CB0B, 0xCBD304, 0x1B4F6CE, 0x3553FA3, 0xD10A20, 0x59FEF9, -0xB9342E, 0x20A9D07, 0x1982919, 0x2B05EC0, 0x7B9D63, 0x2293ECE, 0x2D3B096, 0x30A4530, 0x2B67E83, 0x611069, 0x256A79D, 0x18A966D, 0x15B07A2, 0x3D35B40, 0x6FE6F8, 0x25D4E50, 0x3F5D906, 0x111C2C2, 0x1C1632, 0x24B0C5, 0x2FE45AE, 0x220B924, 0x2761834, 0x6C4D0C, 0x68D0B0, 0x178B8C, 0x32AABE5, 0x3D010EA, 0x24C01B5, 0x123E3A, -0x37FF33C, 0xBF9A4D, 0x184BECF, 0x2AD1021, 0x4FC960, 0x2E45654, 0x31116C7, 0x1A932AD, 0x734742, 0x498968, 0x3E9E3B6, 0x25666B, 0x1C15795, 0x2FBBA18, 0x7F6FFB, 0x2391B25, 0xA167F5, 0x2D272ED, 0x8A26DA, 0x5E8BD5, 0x3CE7C79, 0x24A02B6, 0xD54C94, 0x3A197C3, 0x510999, 0x27D373B, 0x15CC533, 0xEF27F9, 0x292EAC2, 0x4B2C0E, -0x5A8DB, 0x33022D7, 0x36972D7, 0x3A88D41, 0x4609A0, 0x2C55B8, 0x12E8D46, 0x21E6E20, 0xFF7BD, 0x5E4D59, 0x26598B, 0xE10BCA, 0x2179FE6, 0x34EAD8A, 0x782A3F, 0x1E5A0F3, 0x6865E, 0x3BA2D2F, 0x2F758DF, 0xFB4C6, 0x20C350, 0x3D30648, 0x23C0FBF, 0x230F48, 0x148590, 0x1A2E55, 0x35EA227, 0x12CB65E, 0x3D20FE8, 0x1D78DA, -0x1BC09CA, 0x13446C0, 0x25155B5, 0x15CDCC1, 0x54FDE7, 0x5B63A7, 0x948C44, 0x1A99897, 0xE52B73, 0x712D1F, 0x1ABEFAB, 0x1001BBA, 0x1381B55, 0x1EEF531, 0x4DD8F, 0x1995549, 0x8ACE8C, 0x122F98D, 0x379AD5C, 0x637A53, 0x92DA63, 0x1F5A6D3, 0xDF6836, 0x2545094, 0x236F2A, 0x23D7B7C, 0x15426BD, 0x25F8B26, 0xDAF12D, 0x8522E, -0x2CE8301, 0x39C9404, 0xCE9ABA, 0x1C65EA6, 0x493B25, 0x30E5F0A, 0x2160E15, 0x2B6C331, 0x1EB1053, 0x65F547, 0x699753, 0xA8DA15, 0x3DB3D00, 0x2D365BB, 0x2BE693, 0x1BDC53D, 0x921046, 0x2E1E3B3, 0x3026A3E, 0x55691A, 0x14C047F, 0x278E514, 0x171F024, 0x3242DE0, 0x765674, 0xD6AB37, 0x1BD918E, 0x118F116, 0x2932D4B, 0x746ADB, -0x1DA6CDA, 0xA4D94F, 0xB6893E, 0x354BAFC, 0x5D004E, 0x2D96CC2, 0x1F10B42, 0xA7465C, 0x38472BF, 0x3350DB, 0x2FEF67, 0x233EA25, 0x1164C63, 0x3FF16B0, 0xFF2DF, 0x9AA471, 0x2D89799, 0x386D9E1, 0x340F4BA, 0x13A219, 0x2F9F7F9, 0xA167AE, 0x1ABCDB9, 0x2463DF5, 0x645C50, 0x1F7E5C6, 0x433FA6, 0x170D25C, 0x2D4FE40, 0x13D858, -0x2B1E7A6, 0x962C4E, 0x3FC4DDB, 0x127D384, 0x4849FF, 0x111CDA8, 0x21FEA15, 0x2F7A9EF, 0x134ED34, 0x48C50D, 0x7C9AC, 0x2108B20, 0x34916C9, 0x39968F4, 0x3FDD72, 0x1B4EC3B, 0x632C59, 0x3DAA56B, 0x3E78FF0, 0x6E2C6D, 0x1DC97A4, 0x2D6AA63, 0x3795F6D, 0x20BCFE, 0x423FD4, 0x24D0A0, 0x23D7B7D, 0x193642F, 0x9A584E, 0x1A091C, -0x3B2D58A, 0x70AEEC, 0x1099316, 0x27AC3EE, 0x2E8D33, 0x2FAB685, 0x3DF447D, 0x1A4B45E, 0x1A36CF, 0x7F222A, 0x2549A7C, 0x25BE5D0, 0x81C9AF, 0x3CC773B, 0x55370D, 0x31AF02E, 0x23A06C4, 0x2CEEDE9, 0x8A91D2, 0x58BD06, 0x2882E0D, 0x103E9F2, 0x24108AB, 0x255957C, 0x5B4DB1, 0x2DA77FD, 0x13AD1E6, 0xA7D475, 0xDF5BF2, 0x67A8A4, -0x30C9D58, 0x2095293, 0x618988, 0x20EFB43, 0x3232BA, 0x7B57D9, 0x2C37808, 0x2AA7587, 0x2FB2C25, 0x20DF6, 0x229A8AB, 0x390414A, 0x1A69EF9, 0x3B9C285, 0x4AE671, 0x23EE086, 0x33A5524, 0x2887167, 0x265B077, 0x6878C3, 0xAC097E, 0x31C4524, 0x1144B29, 0x27095DF, 0x1CF41A, 0x6D8D29, 0x1E5AE9, 0x3B65059, 0x32352BA, 0x1C2E6D, -0x9B4CB9, 0x2413E2B, 0x29E7BFB, 0xB1E191, 0x4EA174, 0x1BE57D, 0x8BD210, 
0x132F5A4, 0x371DF5B, 0xBE0AF, 0x2C738D3, 0x1D972E8, 0x89D5E8, 0x1D41668, 0x7DAD04, 0xDF7DF6, 0x22B8310, 0x16D5928, 0x25DC568, 0x51C65F, 0x6FF9D4, 0x57531E, 0x9EFA96, 0xFB006, 0x507FFE, 0x204E41F, 0x3D19A11, 0xEFC1C1, 0x3559F6C, 0x282FE9, -0x1FB8178, 0x39FE0E0, 0xD57EBE, 0x117082, 0x5BD4B6, 0x39ED71, 0x1A51D0D, 0x5ECF35, 0x28DCFAF, 0x7CDDD5, 0x28E3B5F, 0x37CCF31, 0xDF3A58, 0x13D7172, 0x40E671, 0x1489131, 0x20752FF, 0x28C1EA8, 0x2D2470, 0x6B3640, 0xB6303B, 0x1D73D65, 0x254AD44, 0x3771CB9, 0x5B630C, 0x394CE3, 0x161EB4D, 0x2275A0B, 0x18053E1, 0x3EA3BA, -0x120EEEE, 0x3AB0497, 0x33BBC3D, 0x210428A, 0x2EF356, 0x1D7EC00, 0x2E8FEB1, 0x223CEE6, 0xCCDEAA, 0x5FABCB, 0x17889A, 0x84B9EE, 0x11D16B1, 0x216A6F9, 0xB37D2, 0x362BF51, 0x32950F5, 0x1D0617, 0x329A52C, 0x896B4, 0x277A1FB, 0x3765C60, 0x865E3A, 0x1F751E1, 0x553DEE, 0x36CF430, 0x2E3DFA4, 0xB90904, 0x3C82C91, 0x5B6A78, -0x14B9BFA, 0x21DB0A4, 0x29F4A2B, 0x14BA765, 0x704DE9, 0x37E7654, 0x3A86D52, 0x3660B04, 0x2CE5273, 0x5D307B, 0x1049D17, 0x1308FAF, 0x1063CEE, 0xEEC40C, 0x7A8829, 0x286F656, 0x210CAE0, 0x16E500B, 0x21A8588, 0x260A9C, 0x29D8686, 0x39B998, 0x2000D14, 0x3FD5806, 0x296011, 0x19B2928, 0x3C3DDB3, 0x3DD4536, 0x308EA08, 0x267409, -0x17F8423, 0x10410DE, 0x2CA90F0, 0x981658, 0x3DA610, 0x2098EA, 0x277FBA8, 0x20C52E6, 0x3BBB34E, 0x427E7E, 0x190793D, 0x3E97A29, 0x3C9175E, 0x37DB7F7, 0x1F5841, 0x1F3C326, 0x786E2E, 0x2E911AA, 0x260FF3, 0x7BD5B, 0x1C48939, 0x25082FA, 0x23DD16, 0x13C2EE4, 0x794033, 0x113764F, 0xCC3FFC, 0x6C79BB, 0x2C33F89, 0x674FF1, -0x129CBAB, 0x20FE35C, 0x904E20, 0x38F6701, 0x7E6223, 0x1E883A3, 0x10469F5, 0x15909BE, 0x1939F09, 0x726426, 0x412322, 0x3DE0ED7, 0x280BBB1, 0x23F5CDC, 0x46716E, 0x3FDE271, 0x18FBABB, 0x3B9FA3, 0x4C7F0B, 0x6C256C, 0x32CE1D8, 0x166AFF2, 0x2708132, 0x15BA5AA, 0x53B965, 0x119DA19, 0x1F23494, 0x995FAA, 0x10E3A7E, 0x5019F, -0x24F1EE, 0x7556A5, 0x3F0905E, 0x139B0BC, 0x63E8E1, 0x3A10158, 0x2113A65, 0xCA5E9D, 0x347A528, 0x51904E, 0x36BC6CC, 0x118B537, 0x2691B09, 0x239885C, 0x2EE530, 0x147187A, 0x10E1211, 0x1DD43F8, 0x2365CE7, 0x62B92B, 0x39F083C, 0x152AD74, 0x8D43CA, 0x181228A, 0x25B333, 0x2296979, 0x2C3F59F, 0xBB77CA, 0x1E0E58B, 0x58BA2E, -0x38F3515, 0x23C686, 0x3BDD778, 0x27B9C54, 0x290C21, 0x1128F01, 0x11107DE, 0x35E3758, 0x101BD44, 0x442DB, 0x30A9D, 0x7794BB, 0x36B2E74, 0x159619D, 0x374697, 0x33ADDE0, 0x130A9C5, 0xD0CBD6, 0x1B48222, 0x2280B6, 0x2DB0FB2, 0x2D4F2D2, 0x5BEDD1, 0x10E5311, 0x397496, 0x7FDD97, 0x1AA579F, 0x2D9553B, 0x1BD4125, 0x6EACDC, -0x38A5BB3, 0x3413015, 0x9C8178, 0x17D4857, 0xD171A, 0x104962E, 0x3405C71, 0x95EFE0, 0x152C881, 0x4EECE, 0x1DB67AA, 0x3451400, 0x20AFAC4, 0x19A5885, 0x6577C4, 0x1686AD7, 0x2B98B66, 0xD89CDD, 0x12348C, 0x7A053A, 0x205A880, 0x26A839, 0x232F1FF, 0x9BCD54, 0x5D2604, 0x13AAB89, 0x2BDEC2D, 0x378CFBD, 0x2E7AC37, 0x5EEF31, -0x3276288, 0x21F95AD, 0xD06FB7, 0x2A1F58C, 0x4DCCCB, 0xA44B01, 0x39292F0, 0x2E33415, 0x7DC762, 0xF0A98, 0x18A1502, 0x2EDE1BE, 0x628E0E, 0xF39C37, 0xEA4AA, 0x3EBE037, 0x1890185, 0x2B5B8D3, 0x3874800, 0x2D20C0, 0x1A30282, 0x3E2A904, 0x2C4C336, 0x1F605E3, 0x1D87C6, 0x26B63B, 0x237FE2B, 0x9BCE46, 0x2F5FC47, 0x266086, -0xD829F8, 0x3FBB720, 0x335505C, 0x280A774, 0x146902, 0x68394, 0x36E4C9F, 0xC38413, 0xDE47D9, 0x55FA41, 0x21BF9D, 0xE57040, 0xE45E06, 0x1A33DE7, 0x18D662, 0x1CC21BF, 0x2B9A163, 0x18F79E7, 0x2FA5ED5, 0x3AD51D, 0x2EEF43C, 0x24B1D1E, 0x2BB7067, 0x1E085C2, 0x27EC9B, 0x223E935, 0x3F28E52, 0x1126AA, 0x36636D9, 0x18F7CB, -0x36EDBFA, 0x11B39AC, 0x2A23341, 0x3AADF33, 0x1DCFB4, 0x1B35467, 0x26387D1, 0x1412684, 0x68EDC7, 0x1B20D7, 0x2602E3E, 0x1D7F253, 
0x2C2B7A8, 0x291255D, 0x78C15F, 0xCAFE8A, 0xC9B407, 0x666B52, 0x1CC935C, 0x3F53F5, 0x1BCEBAA, 0x20DB8E, 0x884383, 0x1CF740B, 0x27A30C, 0x1F2306F, 0x3B5CF7B, 0xF745DF, 0x22DB3D5, 0x75EE4A, -0x11A8E3E, 0x3327161, 0x1F2D97E, 0x5A430F, 0x496B58, 0x1169E7D, 0x2E87F8B, 0x3BD3F7B, 0x120429D, 0x4B0618, 0x103C7BD, 0x1B590EE, 0x106D40E, 0xED45DA, 0x3C90F6, 0x11F1EC6, 0x11F86B1, 0xDCF92F, 0x1314F30, 0x70C245, 0x653B4E, 0x1D7491B, 0x30DFB5A, 0x29082A2, 0x7E5173, 0x1D0F4A3, 0x2D1311C, 0x1977CAF, 0x3A4AEF9, 0x69A3A4, -0x1FEB424, 0x24C60F3, 0x213026E, 0x2D5BE9F, 0x1E0489, 0x2FE9979, 0x2FBE99C, 0x1BF9669, 0x1A9FBD9, 0xF8AEA, 0x16B51FF, 0x2E20FAA, 0x867FF0, 0x1D8F0F2, 0x31A668, 0x301C9BE, 0x1E800A5, 0x3D63688, 0x335C321, 0x545644, 0x39410E0, 0x2DBEDF6, 0x33B9537, 0x3C408A6, 0x6CA227, 0x3F22468, 0x6D35C7, 0x29F9C7D, 0x34EC815, 0x522058, -0x251FB62, 0x2FF604E, 0x24A15F4, 0x28E9090, 0x105B94, 0x25857B, 0x3F6DA16, 0xBF96D, 0x2205514, 0x14D985, 0x7A9404, 0x2E0D453, 0x2F37B4D, 0x26215BE, 0x67AAF9, 0x1BC821C, 0x275FA57, 0x2BD3122, 0x5E8BDC, 0x7E617A, 0x222695A, 0x1336E9C, 0x3EAFE96, 0x3310281, 0x677619, 0xDF7538, 0xB0266A, 0x21AD7F8, 0x1D7B256, 0x2A2191, -0x11FF3DC, 0x2B051ED, 0x23EC755, 0x14BE1A0, 0x4A87F6, 0x350278D, 0x352A48F, 0x120A6D8, 0x2B26300, 0x4BB952, 0x31E482C, 0x315E9A8, 0x1653968, 0x5207F5, 0x855A1, 0x208D16D, 0x176DAB1, 0x39C63F0, 0x1D1B841, 0x33F9E5, 0x51F53A, 0x11638FB, 0xF811F3, 0x20DB3B3, 0x4AE3FC, 0x1875DDF, 0x2CB8B6D, 0x35E03C0, 0x498B2E, 0x42336A, -0x2DC3868, 0x14514F2, 0x22DEE36, 0x3F5BA02, 0x25081C, 0x72E53A, 0x2CC424A, 0x1C37D4C, 0x126846A, 0x16CA93, 0x11CB675, 0xEB9103, 0x16A4B1D, 0x3B2F32F, 0x41B2D6, 0x3D0CA1, 0x39AA113, 0x3023475, 0x37B6FC1, 0x2CD0E0, 0x3002A6E, 0x3511B76, 0x60985A, 0x1506699, 0x72A06E, 0x1A3276C, 0x1E4E1FA, 0xB109E7, 0x18EA502, 0x414A81, -0x391333E, 0x1ED4F05, 0x183825C, 0x242D0B3, 0x3EA571, 0x94F121, 0x2CD1AC, 0x2C9DBF2, 0x2EA66C1, 0x47570C, 0x6CB5F2, 0x340721D, 0x28EFE6B, 0x2313025, 0x3C0B0B, 0x2243975, 0x25A43, 0x3DA2B22, 0x3929590, 0x251737, 0x2B89DEA, 0x352448E, 0x5A63CD, 0x3478F31, 0x68748C, 0x2FFEA7B, 0x1D19A17, 0x3A29563, 0x721BAC, 0x4E4C5B, -0x2D32070, 0x2805D45, 0x13A3E1B, 0x28312EA, 0x5CDD35, 0x1E30288, 0x19A831F, 0xF1DBC, 0x15A83DC, 0x221071, 0x2C0F647, 0x1A4116A, 0xBC00FB, 0x25D8F42, 0x111DCB, 0x178CCED, 0x36CE7DB, 0x3892266, 0x21F1C0C, 0x2A325, 0x22A4B19, 0x325339A, 0x296F76F, 0x3C4BF3E, 0x474DB0, 0x377AC7, 0x1161302, 0x34C00C4, 0x3D0C503, 0x5F435B, -0xEEF7A3, 0x1070C05, 0x3F04B97, 0x8ED4DB, 0x54596C, 0x2A39450, 0x3D5AEDB, 0x2B10EAD, 0x1B86995, 0x32F24F, 0x1F54E3F, 0x8B9375, 0x607214, 0x1E14D98, 0xD6AD5, 0xBF9928, 0x18BFF74, 0xAA7F6F, 0x24C0057, 0x72569C, 0x31B9216, 0x24D5E64, 0x27EF42, 0x1015F9A, 0x49D6A, 0xBE0E05, 0xF88018, 0x29A4622, 0x3A5A77E, 0x20D91A, -0x56D47A, 0x46CD15, 0x2A0F026, 0x4F46F7, 0x601DD4, 0x3BC9BF8, 0x5EC9A, 0x2359E6B, 0x3CF5950, 0x63399F, 0x5A9764, 0x3F88941, 0x1BEBDBD, 0xCD1992, 0x10ACD9, 0x20535CE, 0xB56943, 0xAF1C65, 0x158E713, 0x49EFBD, 0x304DE65, 0x176F628, 0x1BE865A, 0x3B43A19, 0x8DDEB, 0x2715261, 0x20CDF57, 0x7E25DB, 0x1D525A0, 0x34CF4C, -0x244B56A, 0x31D05B, 0x3151D84, 0x34A0B62, 0x10FD30, 0x1C9568, 0x3F9B7CB, 0x22B136F, 0x22DB4C6, 0x66D8A3, 0x1984774, 0x2B4EA98, 0x3D9001F, 0x361ECC0, 0x412A9F, 0xF0EC9E, 0x2517B8, 0xF37272, 0x31FAA13, 0xC91B4, 0x15832F0, 0x18BC97F, 0x1842984, 0x1E31243, 0x6F4CD5, 0x1ED3342, 0x1F4930C, 0x1FD8ECC, 0x1A65145, 0x580AB9, -0x465B3E, 0x2A5FB66, 0xAE36D8, 0x3004242, 0x16995D, 0x15B774A, 0x9A9395, 0x3568506, 0x396A2A6, 0x82636, 0x23B413, 0x10D7312, 0x34E7A99, 0xF443BF, 0x41FC42, 0x2F9F9DD, 0x8DB739, 
0x2ADC114, 0x312EF82, 0x6C3995, 0x135753D, 0x8B25D6, 0x2A78F3F, 0x9FB7B0, 0x6B1B3F, 0xCC292E, 0x36AB0CB, 0x15AADBA, 0x1BE28DE, 0x385603, -0x2C01D73, 0x5532AB, 0x349B947, 0x1CF8FC3, 0xA22E5, 0x1C184A3, 0x32527CE, 0x10C8C50, 0x21A14D7, 0x2AADD0, 0x3992446, 0x316EF97, 0xE9022B, 0x37F28EB, 0x15D36A, 0x26308F9, 0x438DE, 0x307038, 0x3635C60, 0x161B06, 0x2F8D056, 0x3D38EAF, 0x16BACFB, 0x3FA71DA, 0x2A1765, 0x18EF392, 0x1751338, 0x1E646A1, 0x2700DA7, 0x540523, -0xF58E29, 0x2F68484, 0x30355FA, 0xE5D2CD, 0x40CBB0, 0x1322520, 0x3F7CA1D, 0x547A29, 0x3A17B6B, 0x3B32AC, 0x1679DF8, 0x324AD07, 0x2F0C0F0, 0x351E9F4, 0x7F07EC, 0x2CCC95A, 0x3197016, 0x3221B5F, 0x29C18DB, 0xE8B1D, 0xE8A846, 0x2F44C7D, 0x3391B2E, 0x12AE95F, 0x7DF51E, 0x2F689ED, 0x3CF503F, 0x1582AA2, 0x57F797, 0xEE1E1, -0x2240AE6, 0x96D0B, 0x2AF2F7D, 0x3BFAD49, 0x29FC1B, 0x205E565, 0x3FF2BFC, 0x2E1E7C5, 0x3621038, 0x4CF4D0, 0xAD0FF6, 0x34604, 0x15FFB8B, 0x1A1AEA4, 0x44D3AF, 0x2A8D08C, 0x23F47BB, 0x1C4F86A, 0x2B73DBB, 0x3EB300, 0x3588186, 0xF099A, 0x28818DB, 0x34C07A5, 0x289D0F, 0xCCC170, 0xEE8983, 0xC77BA8, 0x151B3AC, 0x26EE69, -0x2C4EA05, 0x2762FE4, 0x37D1110, 0xD99C64, 0x33AA0, 0x2664A0B, 0x2640B96, 0x3B714BD, 0x3B513F6, 0x42BD48, 0x3A9622D, 0x27864D5, 0x222F735, 0xA1576B, 0xD6EE9, 0x3643859, 0x37AECC8, 0x366EC24, 0x3878645, 0x4C60FE, 0x1FAA328, 0x2BB03A6, 0x1B0A3BE, 0x385C271, 0x56C2A, 0x2710909, 0x2278318, 0x2CE17FE, 0x34EB139, 0x7E3B5C, -0x7370A4, 0x1921AB, 0x1429E9D, 0x18BA67E, 0x4B1A8C, 0x3C3197F, 0x2943882, 0x300B11, 0xF0976B, 0x75EC51, 0x1466DCA, 0x3F4192C, 0x308CFB9, 0x33F3964, 0x290379, 0x1B99943, 0x3B8FEDF, 0x2A44CA3, 0x3A4C3EB, 0x2AF7A3, 0x105E35B, 0x2539B45, 0x1F0D2, 0x2146078, 0x7D5345, 0x31D4CFE, 0x215C020, 0x90E902, 0x41741B, 0x419F25, -0x3828172, 0x79E7D7, 0x27985F7, 0xA90CC9, 0x2921E, 0x4D4A6A, 0x20554FA, 0x173DA09, 0x1064E1, 0x220C82, 0x320C927, 0x2D6DE3B, 0x2DDBFD2, 0x3718472, 0x3C99A2, 0x1ED2828, 0x91FF66, 0x90CFB1, 0xE7DC92, 0x4B3A37, 0x12D1017, 0x17A8B5F, 0x987777, 0x2D6EA2B, 0x3AB07C, 0xA6B5C3, 0x1448E88, 0x161F82E, 0x262AA08, 0x449650, -0x1AC4C0C, 0x1231DFA, 0x157D209, 0x12AC850, 0x521E93, 0x1FFEDAB, 0x13D014B, 0x327FC8F, 0x30BBD1B, 0x1DA963, 0x12FA2E0, 0x1D8C788, 0x184E3BE, 0x2234690, 0xD188E, 0x1DE4BAE, 0x20FB4E1, 0x30D9B44, 0x37A0F4A, 0x4FFADF, 0x2938608, 0x2F66946, 0x30E8ACE, 0x324C0A, 0x40968C, 0x6F3A5D, 0x110134, 0x2D37857, 0x1E98689, 0x3E9F47, -0x3D18D12, 0x7BF1F2, 0xB041DA, 0x11EE7CB, 0x4FB87A, 0x35C8CAF, 0x1A917A2, 0x35CC755, 0x2DDC8EC, 0x7F6991, 0x132635F, 0x4285A5, 0x152F3FA, 0x735CAF, 0x15E61B, 0x667C21, 0x11770ED, 0x1BAFE6B, 0x352EFCE, 0x45CF3B, 0x29D63F9, 0xEC18DA, 0x1567734, 0x3126B12, 0x457551, 0x3CEC018, 0x398468F, 0x392D331, 0x1611D5A, 0x7D19E2, -0x59655, 0x1477C5D, 0x3337789, 0x225A9E4, 0x573CD, 0x1458870, 0xDC487F, 0x3C3CB3E, 0x2C686FA, 0x3CC032, 0x24D5A41, 0x7741B4, 0x1FFA257, 0x3B12A8A, 0x17382, 0x1A224ED, 0x2A14305, 0x6D46CD, 0x3B8B5D6, 0x6AF59B, 0x3F80EE9, 0x34F1009, 0xF25287, 0x2E79475, 0x6AA570, 0x20490D5, 0x3CC9F17, 0x2DFBF29, 0x3E4FD0F, 0xFB62, -0x313277, 0x1B980B7, 0x39E7B0, 0x239C8EC, 0x5D8DC9, 0x1A589A, 0x18604C1, 0x31EC5BB, 0x4FBBDF, 0x2A4C9F, 0x3BABA6F, 0x27B72D2, 0x4AD943, 0x12539DE, 0x27F4D, 0x16BBC8D, 0x2B8A8BF, 0x2FF087, 0x2E906BB, 0x230F37, 0x2AAC09C, 0x1DB90F6, 0x1BE3638, 0x2B9B85F, 0x28ABD7, 0x982A, 0x135433C, 0x2A75D35, 0x2849CEB, 0x1DD774, -0x14B0902, 0x19624C3, 0x23D7243, 0x8E8B69, 0xDF507, 0x9628C5, 0x301EE6B, 0x39CC22B, 0x35A2AA4, 0x134123, 0xD450E2, 0x13B8B3C, 0x1ED14E8, 0x36F52AA, 0x53A8C6, 0x1610565, 0x39D0711, 0x3E19D06, 0x3E5BCBD, 0x608DA7, 0x2A0243C, 0x1FF27F9, 
0x2B3D59B, 0x2D1BF47, 0xDA36B, 0x2914182, 0x477A0D, 0x31E409A, 0x3302E5B, 0x3BECC1, -0x171C17F, 0x2E2932B, 0x43C820, 0x340812A, 0x2A425D, 0x1523C28, 0x1FF6AB8, 0x2FB1F6F, 0x325879, 0x5FB74C, 0x37845A2, 0x1B83DCA, 0x27780C7, 0x5E51B3, 0x273DB1, 0x2A5934F, 0xB4D737, 0x1D5D7A2, 0x6C9099, 0x73AEEB, 0x2E4392D, 0x2986154, 0x1923938, 0x16FADB0, 0x605021, 0x1EFBF2F, 0x3DAADE0, 0x1354F32, 0x1711DB8, 0x2E4ECE, -0x3812356, 0x1296584, 0x33FDF2A, 0x3680624, 0x555185, 0x23BCE54, 0x3EFE561, 0x27A62FF, 0x270C607, 0x72644F, 0x1E3D624, 0x1AC7542, 0x17D098C, 0x1CC24E7, 0x5BDDD5, 0x150FBAD, 0x25D84F5, 0x3923DD1, 0x3E87EB2, 0x7671FA, 0xC5B802, 0x2ED8452, 0x3BD13D, 0x31CF337, 0x616BC5, 0x2FD8871, 0x1D6D311, 0x38700B1, 0x3AC002A, 0x498A1E, -0x3802521, 0x3C728B5, 0x3E5AA49, 0x1B33802, 0x690634, 0x3DD52B0, 0x30CF1C9, 0x1749F1B, 0x3CC48BF, 0x5D005F, 0x3C622E7, 0x62B575, 0x2E82513, 0x1B45791, 0x50F93D, 0x3FFF3B9, 0x3E8848F, 0x6E588D, 0x3F38F2B, 0x3848E6, 0x3EA4F5E, 0x3F0C68C, 0x144F6CE, 0x3713978, 0xCC5E7, 0x259A7E2, 0x2500B, 0x712EE2, 0x3F6E1BD, 0x257679, -0xB17E83, 0x1A254F6, 0x20704CF, 0x305B38, 0x710F97, 0x11E6AAD, 0x2E3A79, 0x24F6400, 0x2B3407C, 0x5AF48D, 0x3FCC916, 0x19E7574, 0x232209, 0x375F2FA, 0xA3538, 0xD990D2, 0x35EAE91, 0x1A62D6, 0x3CB52CB, 0x46C718, 0x378233C, 0xF5E668, 0x27A0995, 0x3D29A0B, 0x4F4E80, 0x3B77EEE, 0xBC12B2, 0x1B2D991, 0x1E6865, 0x317432, -0x1C11E3B, 0x33369A8, 0x3BE2ACC, 0x205F43C, 0x3FD895, 0x3F750EA, 0x36C5D9C, 0x9B6016, 0x668900, 0x635FC6, 0x30DA1E7, 0x11EACCD, 0x221B84, 0x2866AFD, 0x6C893A, 0x18508D0, 0x16B1C0F, 0x1CB55F3, 0xC9352B, 0x13533D, 0x2CE8512, 0x40DC37, 0xF4B846, 0x9BE69C, 0x2223F1, 0x3BF6A04, 0x3583C, 0x54318F, 0x22C83BE, 0x174BD7, -0x1698189, 0x1E6F1A8, 0x1302EB1, 0x1298805, 0x732BF4, 0xE4BC6, 0x148D0AF, 0x1E85983, 0x1030552, 0x53F66, 0x17C7CE8, 0x3BA2C0E, 0x227B65E, 0x3CE7CBE, 0x790451, 0x6CDB97, 0x3E832CA, 0x3BF236F, 0x3B13259, 0x46D07C, 0x868412, 0x126B38C, 0x34997C8, 0x274CE7B, 0x6DEE23, 0x273E57F, 0x1E1523C, 0x1EF08AB, 0x2F62DFD, 0x1C5BE, -0x2EAE93B, 0x397C10, 0x3DB6E44, 0x15FF815, 0x65583F, 0xA565C9, 0x1749B09, 0x302FE6D, 0x2829AB5, 0x6B3B87, 0x17B8E64, 0x1754CC4, 0x1F89D3F, 0x3436A4B, 0x4ADDB9, 0x20F9BFA, 0x3546643, 0x10A4F1B, 0x3B076F3, 0x30C624, 0x27023A0, 0x3CBC036, 0x34E4AFA, 0xCAD515, 0x3086E1, 0x13DD8C7, 0x36F52FF, 0x92E93B, 0x3B84CB2, 0x690976, -0x124D198, 0x30471E5, 0x73486F, 0x198B80B, 0xF6B95, 0x164F568, 0x22ECE15, 0x116B5B7, 0x3D575F3, 0x55C9B3, 0x2D1C07F, 0xC590D2, 0x12DEDF1, 0x34FF7CC, 0x93D67, 0x257B3D6, 0x34895FA, 0x439A1F, 0x2AF92D5, 0x4B5B18, 0x1D796B2, 0xA3D6D6, 0x1EA466C, 0x1B990BF, 0x7BAFFE, 0x1E68AE4, 0xDD4C99, 0x2D6FB9D, 0x35B7EC0, 0x40903B, -0x32D6BC8, 0x2563535, 0x1DD3357, 0x37A9196, 0x179330, 0x721AEF, 0x32A16F2, 0x32255A9, 0x2780F26, 0x209F0, 0x933495, 0x3CBA5CE, 0x10C5C0B, 0x135C579, 0x5E0DDE, 0x2A6B951, 0xF25AD9, 0x1358274, 0x222835B, 0x6AF961, 0xF2A709, 0xF1CA08, 0x19A0B2F, 0x350CC9F, 0x5E9B8F, 0x12177, 0x4FD4C9, 0x3ACEF0B, 0x3D3DD06, 0x7ABDEA, -0x11471F7, 0x18924D, 0x2D6C6F0, 0x2473FA7, 0x3204EB, 0x13AFA2, 0x2BC720F, 0x135BE09, 0x2235779, 0x6D70ED, 0x22FAA1F, 0x221CF46, 0x33DA207, 0x36E365B, 0x5C73BE, 0x3A1CE1D, 0x2ED01E8, 0x2A8541B, 0xBED53B, 0x7A40EC, 0x1D722E3, 0x15A72E7, 0x3B6AD6D, 0x1E01649, 0x10ACF6, 0xC0EA05, 0x1876063, 0x4E227C, 0x327D664, 0x57B175, -0x2B5A60D, 0x2D02F24, 0x389A4F7, 0x191ED1A, 0x51431F, 0x2A720E, 0x1375573, 0x1A1CCD8, 0x25C0202, 0x6B3605, 0x20E9A0F, 0xF97741, 0x37D478E, 0x6851B1, 0x630C0C, 0x28FEE2B, 0x96C37A, 0x206DC99, 0x1812C5A, 0x4728F0, 0x35B2318, 0x41418, 0x309BB46, 0xA06191, 0x2484F7, 0x23211DB, 
0x3B4A894, 0x3A5BBE2, 0x3FC8F74, 0x6425D4, -0x17AEF5E, 0x1A30240, 0x15EEF08, 0x386B71B, 0x2733D1, 0x17F87E9, 0x76D27C, 0x1727A63, 0x33B5535, 0x36D753, 0x4AF537, 0x345FEDB, 0x1B04451, 0x1847755, 0x1DCC4D, 0x13E549B, 0x2378323, 0x2C3D0BB, 0x49C564, 0x2FB2CA, 0x3C31CED, 0x1E453FE, 0xD10D87, 0x2865941, 0x190809, 0x30277E3, 0x37596A5, 0x1FA44B, 0x19E2D2C, 0x13195C, -0x384D41F, 0x2906953, 0x508E69, 0x1C15979, 0x61C7C8, 0x30E1C9B, 0xB4BF1B, 0x152AECA, 0x203895C, 0x7F0654, 0x28873B0, 0x2A7DF0E, 0x324AFAA, 0x236D628, 0x43FCDB, 0x29DA7A2, 0x29E409B, 0x39D9969, 0x16BCCA2, 0x4EAB13, 0x3F85F7C, 0x2AABFA1, 0x2F32B38, 0x3F869AD, 0x69EBA4, 0xD27CBC, 0x1FDB1AD, 0x39E7560, 0x2FA9DD3, 0x273072, -0xD51DB0, 0x5278F2, 0xBFA3C1, 0x350CF0A, 0x161F8C, 0xFE70DA, 0x2987C86, 0x36BB765, 0x1017C80, 0x442B5D, 0x564CC1, 0x1E409C7, 0x95696F, 0xCD4173, 0x3D5DBB, 0x2AF395C, 0x28E2D11, 0xCE6621, 0x2D16EDC, 0x2DA978, 0x1A49388, 0x2B29C47, 0x2DF8755, 0xF957A1, 0x46F2E3, 0xB7BE24, 0x2D15357, 0x2D12FC5, 0x35A363D, 0x67DF47, -0x3898AAD, 0x3889224, 0x113E7A1, 0x272270B, 0x40021, 0x3D460D9, 0x1A79358, 0xAD26C9, 0x177E2D8, 0x4DF64D, 0x1FF89ED, 0x3B41691, 0x34EF1FE, 0x1679A12, 0x290C4B, 0x96ADB6, 0x3EF0F7C, 0x1C9697F, 0x1F9D958, 0x4AC603, 0x15E51B1, 0x100A644, 0x21BE1BC, 0x28765AA, 0x7169E0, 0x3AC596C, 0x157E2DA, 0x217843F, 0x180C206, 0x1CC6A0, -0x3C925D0, 0x34B6DA6, 0x29318E1, 0x3BB5474, 0x6FFB86, 0x20AF1B5, 0x7AC90B, 0x2D7C3AD, 0x54BF35, 0x338198, 0x1F13943, 0xE0525B, 0xB02C1F, 0xBF8718, 0x5D924, 0x27EEC81, 0x5D4437, 0x12B8396, 0x16B1DAC, 0x24D8BA, 0x2548483, 0x3ACB6E7, 0x13F0280, 0x1ECEA2A, 0x6C5131, 0x1BD1235, 0x27E424E, 0x1AA6B2A, 0x21B0E7E, 0x2DA9DE, -0x3127B97, 0x1245C40, 0x21697F5, 0x16B4C9D, 0x7BE2BE, 0x31E9DA7, 0x275C0F4, 0xA4EC96, 0xEEEC7E, 0x500DF, 0x1795917, 0x171DDA1, 0x1345EA0, 0x2A64B42, 0x49575, 0x1AABE35, 0x1FE378B, 0x35CBD56, 0xD6A380, 0x383FAD, 0x3D43D56, 0x14FACAE, 0xC5AB93, 0xC88618, 0x52B395, 0x1BEC03E, 0x12F351B, 0x279CF9E, 0x2D29ECC, 0x4B0DB0, -0x1FF80C0, 0x17DADB8, 0x22158CC, 0x1192D54, 0x1BD2CE, 0x39C4236, 0x23AC96C, 0x29FCD00, 0x2ECBD0A, 0x3B4CE5, 0x1153D9D, 0x925A07, 0x5B9E1F, 0x2C52F13, 0x3E022C, 0xED5A53, 0x740872, 0x4958A1, 0x2C962FB, 0x560D3F, 0x5A6012, 0x16149C, 0x2FD3F44, 0x34AC6CF, 0x5C8BCC, 0x27CE0C3, 0x303D8EE, 0x1BBAD93, 0x32CFA4F, 0x337798, -0xC8E88, 0x3DA33E0, 0x3CCE7A9, 0x1A27CFA, 0x579AFE, 0x3BDB4A9, 0x1075B37, 0x274E7DD, 0x31386C, 0x380241, 0xF0DB02, 0x1075DC3, 0x1E23642, 0xCF1D2F, 0x2F7C81, 0x2D1952E, 0xFF60D7, 0x237BF3C, 0x2228F3B, 0x741B1D, 0x283C68F, 0x6B9911, 0x11E174E, 0x2783DEA, 0xC80DD, 0x3E47C24, 0x135D07C, 0x1252398, 0x1EDA8E7, 0x4B3EB9, -0x1A9A195, 0x26E5043, 0x3B4132E, 0x37013B0, 0x11D09F, 0xDDEA30, 0x3F5794F, 0xE05F92, 0x1562783, 0x296E09, 0x283B696, 0x8030E8, 0x332C4E3, 0x2E913B2, 0x7E33FB, 0xAD0033, 0x300E5E4, 0x84ED3, 0x982D7B, 0x5C8EBB, 0x39D2E9D, 0x1D156ED, 0x3155A66, 0x2753D6E, 0x12FBEC, 0x10D8D49, 0x17DAED3, 0x180B3AA, 0x3B5C197, 0xCA652, -0x11C6F83, 0xE1D4F5, 0x39F2B79, 0x306980B, 0x41644A, 0x3C38F7D, 0x8F5358, 0x1282842, 0x12E2E4C, 0x71057B, 0x25F47B5, 0x3E80545, 0x201D39, 0x28670CF, 0x7536C8, 0x26C2DAD, 0x4F2905, 0x511BE7, 0x30AD066, 0x456C98, 0xB44658, 0xFC9787, 0x1BA7479, 0x73F868, 0x1D002F, 0x1A27957, 0x3B5B878, 0x2BDB9F9, 0x380A109, 0x95DEC, -0x1001D6, 0x34D04BF, 0x1E99E57, 0x28366C9, 0x481C63, 0x1C02AA5, 0x1ADAE83, 0x683C75, 0x1F5E7B5, 0x24AF04, 0x3B68275, 0x2B0404C, 0x2012E37, 0x395C200, 0x418B45, 0x3900CE, 0x1E37BC3, 0x1740875, 0x248EE32, 0x7C5661, 0x1DA7366, 0x2AC29BF, 0x2B3C5F4, 0x209793, 0xAC610, 0x9E4D32, 0x14A390B, 0x1DE8308, 0x1E64CF7, 0x436E59, 
-0x3352222, 0x1BAF848, 0x3DC70CD, 0x2291BCC, 0x63D176, 0x38C9273, 0x31577FC, 0x11D96C, 0x369C501, 0x474438, 0x1CF6402, 0x12E26E0, 0x1050A18, 0x28EA99D, 0x6BF820, 0x3391F0E, 0x11C8105, 0xCBA3BD, 0x2166A38, 0x3F2B8F, 0x125DA29, 0x149586C, 0x97DED9, 0x31D0594, 0x7EB1A, 0x1B0A9FD, 0x1C3431F, 0xA6CC3F, 0x29224A5, 0x5AB896, -0x20018D, 0x18225D7, 0x456D4B, 0xE7C974, 0x6243B0, 0x24C7464, 0x1BA63F0, 0x1E42F76, 0x2748A4, 0x20035C, 0x3462524, 0x396D7BA, 0x657CC0, 0x15417E4, 0x24BCBA, 0xFA9BF6, 0x2636639, 0x24D443A, 0x3106209, 0x3B621E, 0x35C812C, 0x11CBF88, 0x2D23633, 0x1128CC5, 0x31A208, 0x3F5D0BD, 0x2036DF5, 0xF947B, 0x3D598DE, 0x22D482, -0x2142020, 0x89F714, 0x137E4DF, 0x1892FC, 0x25076D, 0x2657645, 0x29BB8C2, 0x2433CB4, 0x36ED6B6, 0xCE469, 0x25B9C5B, 0x18947D, 0x3538FDB, 0x1D2A3A3, 0x44F822, 0x1EC4B97, 0x2932073, 0x1D0D98F, 0x346AED8, 0xCCD61, 0xC5AB54, 0x30DC4D1, 0x3919B9D, 0x333E397, 0x35DCD9, 0x1BA5722, 0x3207CDA, 0x24DA67F, 0x286BD80, 0x121B5A, -0x3B1F1FA, 0x6C5AC3, 0x5BE0B, 0x35B4EC6, 0x4DC688, 0xE40104, 0x61F3C4, 0x2CD905C, 0x31D0F6A, 0x71AF39, 0x2ACF4B3, 0x7A5FE0, 0x3F5BE69, 0x16E490E, 0xC4630, 0x3DE616B, 0xEBC4DF, 0x27F6B06, 0xA87781, 0x4E26E7, 0x2221004, 0x399B406, 0x10B161F, 0x9033A9, 0x172FE9, 0x2EFFEFC, 0x1434DEC, 0x2A5D4FF, 0x2AC2E26, 0x6BE02, -0x17AF213, 0x2AAE59C, 0x3FFCDD4, 0x156D629, 0x323225, 0x89326D, 0xAAA596, 0x17A4781, 0x3D83591, 0x1BD608, 0x199E141, 0x2DAD13A, 0x28A82C7, 0x2A1E15A, 0x113A86, 0x3814855, 0x11A7F0C, 0x1586D95, 0xDBDAFF, 0x4A18DC, 0x3854FD3, 0x1AD82F6, 0x3A90070, 0x1A15943, 0x4DC356, 0x1F8DCC, 0x3DF3F39, 0x2AE824E, 0xF98CB7, 0x19049C, -0x3138A89, 0x2938A2D, 0x7E25C9, 0x2F07942, 0xF0B7D, 0x2A362D2, 0x1273719, 0x2C85EBF, 0x57995D, 0x19E4B8, 0x3B6063D, 0x18147B8, 0x28E4189, 0x10614A9, 0x9ECC7, 0x3986795, 0x240D37B, 0x35E8400, 0x3FCE6D2, 0x36B440, 0x28AF1CB, 0x991FB8, 0x33A89BC, 0x277454A, 0x62613C, 0x3AE5F2D, 0x62B977, 0x855C20, 0x3CC0027, 0x29CE5E, -0x21D2B1C, 0x14D562C, 0x9360B6, 0x1747DD, 0x45E2C5, 0x553D3, 0x10DCC9, 0x2310483, 0x26216A0, 0x52856, 0x3924181, 0x3500B8C, 0x1555A90, 0x3708F2E, 0x5E610E, 0x7D28A, 0x242B9F8, 0x2E37288, 0x2DAC7AE, 0x7E5132, 0x94CB1B, 0x94BADF, 0x36720D5, 0x3A87F6E, 0x308DDA, 0x1730534, 0x2B02D0C, 0x25B999F, 0x358EE4C, 0x77D54E, -0x2C6EC3F, 0x11EF2DD, 0x2D864D6, 0x8BB0B2, 0xE968B, 0xB08A35, 0xAD7B0C, 0x2B5C4B2, 0xED48C9, 0x3B31DF, 0x2C3105D, 0x13D8E36, 0x3ED3BE8, 0x3C4CFB2, 0x7DB085, 0x2F2CF2A, 0x2DA5165, 0x20AA7A8, 0x305745B, 0x67B2E6, 0x173011D, 0x22C82FC, 0x3864480, 0x2FEF99B, 0x25D5FB, 0x1743987, 0x3CFCF36, 0x1DF3F67, 0x3A2585F, 0x654250, -0x20A88C0, 0x69F82E, 0x350AF5A, 0x205E66A, 0x3616C7, 0x1256902, 0x2709E8B, 0x17DD266, 0x20384BF, 0x3A8EC3, 0x2418E30, 0xD87D12, 0x3645A25, 0x748C, 0x2942F3, 0x35A4133, 0x74EDD4, 0x3BDF60F, 0x284B971, 0x14DEAA, 0x10C8D6F, 0x27EFB38, 0x3D008, 0x4DD5F2, 0x4BF7C3, 0x370567D, 0x28C2FEE, 0x3FB365A, 0x2BD0268, 0x4FED47, -0x1D3DAF5, 0x15F761D, 0x9BB075, 0x8E038F, 0x36C49C, 0x29AE02F, 0x3D90F4C, 0x3CB0A21, 0x11D0CE2, 0x6CF6F7, 0x20F3012, 0x3DE04DA, 0x29D4B5D, 0x37CB803, 0x31FB2, 0x35E79CD, 0x1BF32F9, 0x3B7D4D8, 0x3820B40, 0x23F890, 0x10A8262, 0x1A87FED, 0x3302571, 0x2A9B3C, 0x199B5, 0x6090D5, 0x2F8D47E, 0x13646A1, 0x3F04CD2, 0x36095E, -0xE46920, 0x1D4AFA3, 0x2F65FFE, 0xD02D94, 0x65047A, 0x3156A6, 0x3B957F4, 0x9F6320, 0x1149E10, 0x5AF6AA, 0x1CF3850, 0x2BCA0EC, 0x2F96A6C, 0x23D841C, 0x4E3A98, 0x318F680, 0x3D9E49A, 0x1A4A750, 0xA49F48, 0x9FC3F, 0x2CE1169, 0x224F0BC, 0x3E75914, 0x19C486C, 0x4D15B3, 0x23774F2, 0x49565C, 0x54A6CB, 0x30A9C38, 0x3EE5B8, -0x184D2D7, 0x26CD463, 0x3A587DD, 0x1694FD5, 0x147D5A, 
0x1842A05, 0x342412B, 0x1845E1B, 0xA7269, 0x3A0F3B, 0x1410074, 0xF00C98, 0xC17715, 0x19CDC16, 0x4E203D, 0x1F28AF9, 0x33D72DE, 0x39F4EBE, 0x261C309, 0x574B88, 0x1F444A4, 0xC5EC01, 0xB1E326, 0x112CD3D, 0x7480DA, 0x6275B2, 0x3C87371, 0x24E07C5, 0x2E7B893, 0x210494, -0xAA4EAD, 0x3DB287D, 0x19403CB, 0xB32FFB, 0x6BF387, 0x3A566CA, 0x22A119C, 0x1A219E, 0x5A6410, 0x61A80E, 0x39B3824, 0x380A79F, 0x2BD3EA2, 0x21197C3, 0x576229, 0x29F9AF0, 0x38DB525, 0x799F60, 0x6014C7, 0xB826, 0xF3C61D, 0x83B257, 0x7D0171, 0x35B7A17, 0x47196C, 0x2A5CBA3, 0x11F3FD1, 0x226BB0D, 0x35FB7, 0x292714, -0x116B894, 0x2420D4D, 0x2C8E835, 0x2307BB7, 0x4BC57F, 0x1C0DB5F, 0x177BB79, 0x9FB2EC, 0x22B129A, 0x7B9FC4, 0xFC10E5, 0x339B7A2, 0x23D6F82, 0x29C380E, 0x6C5D84, 0x2A531F9, 0x847F13, 0x203E88A, 0x396462A, 0x7D5583, 0x2FB8F39, 0xB5C1D9, 0x318ABDF, 0x1F9CD2A, 0x592649, 0xCACB4, 0x261CAE0, 0x22D6D6A, 0x35487F4, 0x757C1C, -0x3EC4BC0, 0x142C304, 0x299A22D, 0x34D2857, 0x288A77, 0x2D3D4EB, 0x239E3F3, 0x216995C, 0x27056F, 0x45ECE1, 0x132128, 0x3BC98B4, 0x39D6878, 0x27F6624, 0x48110E, 0x2E7741E, 0x3150974, 0x1C65E3F, 0x39C7D66, 0x50CA6, 0x19FE375, 0x3E5D817, 0x10E4E02, 0x26FFB90, 0x2AF48B, 0x3229A54, 0x1287041, 0x1FEFD3, 0x3372F5, 0x43DC6F, -0x1CC562B, 0x13ADF59, 0x124815B, 0x374FA4, 0x369A7B, 0x1D32810, 0x21D847B, 0x1D7F2B0, 0x1377382, 0x116B23, 0x290D2B0, 0xF2BA3, 0x2038CDB, 0x561D59, 0x4017D5, 0x242B600, 0x1E4DCE4, 0x12A1081, 0x398CF35, 0x5086E8, 0x1CB8087, 0x36BB855, 0x27E9F5D, 0x2D83566, 0x773311, 0x2DF2068, 0x16A82B2, 0x288236E, 0xA06C64, 0x7126A4, -0x37ECEF8, 0x437D53, 0x3F6754A, 0x3EEDB8C, 0x3CD7D2, 0x2740C25, 0x7ADB0F, 0x32DEAC3, 0x12103FA, 0x517DB5, 0x3CE27A2, 0x32C9A6C, 0x19E7F17, 0x3347732, 0x4A8FE, 0x385A2DA, 0x21F07CB, 0x1B01FC8, 0x1096ED5, 0x280DA7, 0x16A5A53, 0x3DCBD49, 0x3465A1A, 0x27F1167, 0x71DA83, 0x2587F71, 0xFF9EE9, 0x3335C20, 0x7DDE93, 0x8A420, -0x3014D96, 0x27A95A0, 0x1AE26CB, 0x1DADAD4, 0x5DA170, 0x3E3997F, 0x2B05A30, 0xF7AB55, 0x8DCF5E, 0x41B9A3, 0x600A50, 0x1600CEE, 0x16CC96F, 0xFB2765, 0x6EBFBA, 0x3D66D0C, 0x2220DC1, 0x226F0FF, 0xD5114, 0x2F562B, 0x3D368C7, 0x23F0056, 0x227E238, 0xB389FD, 0x2B7D80, 0x24083A6, 0x3C0DA70, 0x1AAA301, 0x3DB6A35, 0x77E139, -0x3FCE43C, 0x215D1A5, 0x21B69F7, 0x2515950, 0x2726C, 0xE26585, 0x1801CCE, 0x2FF1AD, 0x17ACD70, 0x6134CC, 0xDCD1E1, 0x39CE727, 0x3A1377A, 0x10C8CF1, 0x4E965, 0x1ED2E10, 0xE49B73, 0x3B7397D, 0x1F8533A, 0x6BCDFF, 0x2489774, 0x1AB8ACC, 0x2F8D9C4, 0x26811CE, 0x4A97B, 0xD9784E, 0x142F644, 0xAD7B03, 0x34F8859, 0x448212, -0x1F5E995, 0x19380D4, 0x28D6F34, 0x92CEB7, 0x68AB4D, 0x2AF20, 0x15354D0, 0x2AEB868, 0x3F977A4, 0x613F7F, 0x8A370A, 0x16D3D18, 0x2688B38, 0x3B2F1BC, 0x220DCC, 0x2BD3680, 0x3B14E12, 0xFD8C31, 0x1068800, 0x25A828, 0x694236, 0x3908941, 0x1F5D19, 0x135505E, 0xBC112, 0xD99176, 0x21947B7, 0x1C76080, 0x88F50A, 0x41B81F, -0x2CC80C6, 0x372C998, 0x1F741A6, 0x2C3BC34, 0xB101F, 0x327119, 0xABB63E, 0x23406F0, 0x1DB3312, 0x5B4C51, 0x16EE8ED, 0x3BF4803, 0x7498FC, 0x1CA1FD1, 0x54812, 0x2DD1341, 0x3FB287E, 0x16CE1E, 0x627724, 0xE74BC, 0x346CB12, 0x11C0D4F, 0x90CE90, 0xD156F2, 0x69513D, 0x32497D1, 0xDA1BC7, 0x1086950, 0x2DDCCFC, 0x280A0B, -0xA91DEA, 0x17E664C, 0x387814E, 0x212C0F0, 0x7840AD, 0x3DBCEB1, 0xCB1953, 0xECF46E, 0x208AECB, 0x7E88D2, 0x3F8661D, 0x22A2BB5, 0x1804D7, 0x105BBD0, 0x70EB17, 0x474A10, 0x1BC5CCF, 0x4EF97B, 0x3F131C2, 0x3D0D27, 0x1EE0D71, 0x3D83DA8, 0x1302730, 0x22174F6, 0x7BF6E3, 0x2F1611, 0x22BCCDD, 0x22EDA1E, 0x3B04A60, 0x73B798, -0x642A70, 0x1A6EC25, 0x4A40F6, 0x379E27F, 0x142927, 0x298CBD7, 0x63807E, 0x14510DB, 0x1D9D2F8, 0x6AE4D3, 0x38B4B74, 
0x17A63C5, 0x2C6C717, 0xC66F14, 0x40534E, 0x4DB879, 0x47494B, 0x1AAEB72, 0x3E3CFB4, 0x1A7651, 0x150CC97, 0x290E4CB, 0x3599C9, 0xADC807, 0x630B23, 0x35B3839, 0x35DD5D7, 0x64A327, 0x39E10C6, 0x5F0E19, -0x30A3BB6, 0x2C02D94, 0x3857BFB, 0x18365FD, 0x19BA9D, 0x4BD764, 0x1DE779D, 0x11E1759, 0x18C5DDE, 0x5FACBE, 0x16A3B61, 0x3A94471, 0x31DC74, 0x3A1020F, 0x1D8909, 0x7AF492, 0x252D4, 0x2B37CD2, 0x301F24A, 0x2EF1B9, 0x494FC9, 0x3B67BE, 0x1FA1843, 0x39B7D6C, 0x3F9170, 0x3175D73, 0x236F20D, 0x10221FB, 0x2D60023, 0x65B961, -0x2864B6F, 0x2BEDA05, 0x31E97E1, 0xB14D21, 0x54C4B9, 0x161EF84, 0x773583, 0x1298C0A, 0x3A64B10, 0x4390F0, 0x3C27348, 0x138B22D, 0x308C1E5, 0x68BB0, 0x7A987E, 0x4A89FC, 0xBEF643, 0x209DEE4, 0x1FE35F1, 0x1ED8C7, 0x19A3346, 0x37B6B2A, 0x34ED569, 0x39827FB, 0xEB471, 0x39FD03C, 0x1548A8E, 0x2BFBC91, 0x2C919D1, 0x726453, -0x1C5FA4B, 0xF2B94F, 0x2EF64ED, 0x2CD2E7F, 0x1BF1E4, 0x1FAC3, 0x37E7C1, 0x2E2F085, 0xF2C908, 0xA58D3, 0x3ECFC29, 0x25FE580, 0x32E3D1, 0x36E8731, 0x45E46E, 0xD9BCAF, 0x324CC4, 0x33F96C, 0x1162F8B, 0x18DE3A, 0x12A49A, 0x19796F3, 0x3187C9E, 0x17C08E6, 0x71A534, 0x5710B3, 0xECBC07, 0x2C4A53B, 0x3D5F38B, 0x438350, -0x20391DB, 0x34DA29D, 0x1FE2DBB, 0x237FB0A, 0x403363, 0x33117FD, 0x2B5D7B5, 0x2E8C297, 0x3552C41, 0x269C08, 0x38B4218, 0xF93F48, 0xAAAA4E, 0x32C2482, 0x1F48A1, 0x1CF5FA7, 0x1454C45, 0x374A957, 0x3C21DBE, 0x59FEEF, 0x3368710, 0x36D2BDB, 0x34D4FDE, 0x39E7E99, 0x79BE1F, 0x707C39, 0x35AEED3, 0x3BB1689, 0x526415, 0x394A45, -0x1FC43B3, 0x1F53EC8, 0x23BF588, 0x3F12D70, 0x37628D, 0x334BD13, 0x2DDA512, 0xB36C66, 0x2A2916F, 0x6E97F0, 0x1EDBBDD, 0x1AC404E, 0x18333AC, 0x11AD9D1, 0x313F48, 0x3CD9037, 0x16365DD, 0x2537F87, 0x23C09A4, 0x2FDC9, 0x3F833A5, 0x27CE07F, 0x105C9E7, 0x1A3126C, 0x25AC5D, 0x2D8C9EE, 0x3D2281D, 0x3B31E9, 0x327FA55, 0x788C85, -0x3B3B778, 0x3A9476C, 0xDB2776, 0x10156D9, 0x7C4, 0x2FDDD25, 0x24B0AA0, 0x29763C3, 0xEFA3B, 0x650002, 0x37C50AB, 0x6599AA, 0x2D39EA1, 0x3A3E0, 0x528B27, 0x18B8282, 0x2BDE5C4, 0xD3AC4A, 0x354E0B8, 0x55E5A7, 0x9F1039, 0x202C8BA, 0x7A015A, 0x1652C49, 0x199F68, 0x1622435, 0x18C1D43, 0x35B28D5, 0x2331B7B, 0x2687F4, -0x1EAFAEE, 0x2C30096, 0x195AA16, 0x20D3A4D, 0x7AEB10, 0x2679D9F, 0x2F3CD33, 0x3C87E31, 0x20A7BB8, 0x4E2C19, 0xA35C9D, 0x21A72E3, 0x4EBA46, 0x4F0214, 0x3CD353, 0x298389C, 0x23B299B, 0x2740A08, 0x3C3DC98, 0x44C562, 0x47523A, 0x2E28A3D, 0x3576D3E, 0x166FF8D, 0x43A0E0, 0xF2E0AD, 0xB175B4, 0xC61031, 0x3C34585, 0x5F30AA, -0x182B0AA, 0x269E9B8, 0x296B6F0, 0x3486A08, 0x575DB3, 0x27925E6, 0x26ED7BB, 0x2FE3659, 0x1163730, 0x93F89, 0x151FC84, 0x12BDE14, 0x148070C, 0x3146779, 0x1230C0, 0x75A4F6, 0x1BE3E4C, 0x1C6D0E6, 0x898CE9, 0x5DE4A1, 0x1ECE1B6, 0x273E0FE, 0x1617DB9, 0x1284F7B, 0x1C3ACD, 0x359C447, 0x39A79A3, 0x3C164DF, 0x3DDC568, 0x482BA1, -0x1EA8FD, 0x3B4A9F2, 0x172FEFE, 0x3ADC4E9, 0x4E089E, 0x3DF322B, 0x31D2B3E, 0x2FBF78B, 0x33A3AE1, 0x4B4951, 0x2C6714D, 0x3EDAD1A, 0x14040EA, 0x34427DF, 0x72913E, 0x1E9E3A2, 0x22FDBF3, 0x2D62B49, 0x20FBD77, 0x3C08A2, 0x248B346, 0x27D64E, 0x9809AF, 0x1074035, 0x7ED524, 0x1DF44AC, 0x3A10FB9, 0x2151A78, 0x35082A1, 0x25DB12, -0x1D049E0, 0x1039FA2, 0x16CCEC8, 0x3A4A2FE, 0x5A34CB, 0x1DA882C, 0x1771549, 0x27DD87, 0xBF095A, 0x2AF444, 0x2EF5A5D, 0x3ED0A30, 0x3D9C089, 0x3382028, 0xB573A, 0xC240DA, 0x1CA0955, 0x2BC96F5, 0x299C1ED, 0x425CED, 0x1E824FB, 0x16A30F7, 0x1DA994B, 0x1A9116D, 0x264F6F, 0x5C1EB7, 0xA48647, 0x1533ADF, 0x20BE939, 0x5E302E, -0x237F35, 0x1C4BF11, 0x2E66F51, 0xB10B95, 0x2B0AF6, 0x1C71B75, 0x24B2D36, 0x295B103, 0x3FA115E, 0x4D7E08, 0x4534BF, 0x2E7B3AC, 0x21E5210, 0x71F623, 0x73329D, 0xBA79E9, 
0x290FF6D, 0x1E87667, 0x109DBB, 0x3435EC, 0x278B098, 0x2150286, 0x3B7838B, 0x1B99E4E, 0x4F6C26, 0x2754041, 0x3A8D45C, 0x3DAC447, 0x1D0B630, 0x109D7B, -0x2A4F6F7, 0x32AD174, 0x3D2AE3C, 0x2DCFCEE, 0x59040B, 0x245BAB5, 0x2CE7595, 0x2AA730, 0x34BE0CE, 0x5C61AE, 0x3CF83E1, 0x850F78, 0xF9A99, 0x2E226DC, 0x13455C, 0x310E2F7, 0x1208A2C, 0x333A546, 0x14BB83D, 0x5B8377, 0x1EA0DD9, 0xE9B039, 0x29A2392, 0x7F272, 0x5EBEBD, 0x3D94258, 0x30816E3, 0x2CD5A34, 0xA682D, 0x7D1A10, -0x1A46527, 0xF00890, 0x3C626C8, 0x175AF87, 0x4127C8, 0x27240B6, 0x21BFD72, 0x1CD926F, 0x79F765, 0x216739, 0x78CAEF, 0x9D41AB, 0x2A0F792, 0x66E20C, 0x1A2CF9, 0x5CF574, 0x1149990, 0x3AE1077, 0x1FD5C4, 0x38095A, 0xA3E7E7, 0x3AAE617, 0x1639E5E, 0x3BADCD3, 0x6A5DD9, 0x4A4822, 0x9FAF9, 0x1F0991, 0x3ED37A7, 0x311085, -0x66C3B5, 0xD87884, 0x288F33F, 0x34B7F21, 0x550091, 0x32BAC13, 0xD17154, 0x31B5376, 0x23E6424, 0xAA089, 0x1D3226E, 0x3CCD1B9, 0x2AFCEA2, 0x2ADE1DD, 0x208790, 0x33850C, 0x30AB98D, 0x2F52AC7, 0xEB6843, 0x3C5C37, 0x34CDE6A, 0x6D000F, 0x178196C, 0x311D702, 0x4546A9, 0x536294, 0x187F4FA, 0x10666C9, 0x18FF035, 0x43F36E, -0x2167F4, 0x1807631, 0x71B296, 0x30E1595, 0x241C1F, 0x1CE9617, 0x289F84, 0x24EEDB0, 0x206DF9F, 0x603811, 0x1AC4D52, 0x1DADD8D, 0x1A38C10, 0x3FEE37, 0x16601, 0xC43E0B, 0x23DA6FB, 0xB17523, 0x8055C2, 0x63303A, 0x3B22591, 0x2B6CB96, 0x131DE8B, 0x38862D8, 0x3A10A4, 0x3F1A7B3, 0x2AC06AA, 0x126F236, 0x69A290, 0x1CE8A5, -0x2A87928, 0x1DD78A8, 0x30159E, 0x3AD2DCE, 0x770B48, 0x36BF79B, 0xF0AF85, 0x6600B4, 0x3ED909, 0x195785, 0xB0AB2A, 0x157B87B, 0x34F0444, 0x2845BBB, 0x620CEA, 0x2183F6B, 0x23D8B2D, 0x15DB019, 0x23ABCB5, 0x3274F7, 0xF7ED5F, 0x2939183, 0x2F43D2B, 0x1EF02D9, 0x19CFC1, 0x1942722, 0x39BD16C, 0x3B58CBA, 0x11CE451, 0x5D93E4, -0x28FC597, 0x602E74, 0x1A46D07, 0x86CBA9, 0x35372B, 0x3083464, 0x19CD1DF, 0x26C2ED2, 0x3BB315F, 0x7A9EBE, 0x1A6B314, 0x36646A0, 0x1E58B51, 0x3D3B5B, 0x35E7D9, 0x110DD05, 0x878825, 0x138045F, 0x9C5723, 0x446FFD, 0x36D3E76, 0x2D71EA6, 0x12E6E69, 0x27EE5A4, 0x1379E7, 0x1508738, 0x72122F, 0x3FF6C16, 0x127633, 0x22264A, -0x37EF7B3, 0x869A3F, 0x3333323, 0x331616, 0x57B0E5, 0x1D764E7, 0x23196EA, 0xF51C0, 0x24FE4BC, 0x5534C7, 0x2FE6FE4, 0x33B65FA, 0x199C7A1, 0x1CE4809, 0x6B8933, 0x2DD3FF2, 0x317E3F2, 0x653618, 0x2CF9952, 0x2A8719, 0x200F0C, 0x2A7B171, 0x1A58346, 0x340343B, 0x7A36B8, 0x21ACCD, 0x2C8ADD4, 0x2465844, 0x167F17, 0x769737, -0x19F266, 0x2E9A6D4, 0x1AC1DB1, 0x8C176E, 0x177724, 0x24B6FF3, 0x3446B49, 0x3053491, 0x37F15E8, 0x136198, 0x1EB7176, 0x33191D0, 0x36504A6, 0x325FF25, 0x14E811, 0x1BE386D, 0x1199FDC, 0x2B5E6B6, 0x2B7C067, 0x3286FC, 0x18E10A0, 0x1647D12, 0x67B3F2, 0x30CB7DE, 0x674FA7, 0x100DCF2, 0x23B1FB8, 0x32A4BAE, 0x288298D, 0x3B2C0, -0x1E24474, 0x35DAD1C, 0x216B4C8, 0x29EE28C, 0x421FB6, 0x28F514D, 0xAF6CBF, 0x33F5C67, 0x14F66BF, 0x202AF6, 0x226953E, 0x17E03E5, 0x2FFC05E, 0x28F3350, 0x7B721F, 0xF70479, 0x2392070, 0x362B99D, 0x1D1B48C, 0x54C31, 0x30E8025, 0x3CB8810, 0x20C2FBE, 0x3C0B9DC, 0x60E1E3, 0x2B347E3, 0x1C061D3, 0x37A6161, 0x3A432ED, 0x363924, -0x23A1A6A, 0x3D7B861, 0x1FE2180, 0x11BFDE, 0x2F79C0, 0x35447F0, 0x1E619B8, 0x1636446, 0x375CF83, 0x1C64C6, 0x341469D, 0x5D5598, 0x1E871D8, 0x2596E2E, 0x5BA634, 0x36952A5, 0x21D13E5, 0xC8A8F4, 0x348F6E5, 0x744F28, 0x232DA34, 0x3A12C7C, 0x2E8CD15, 0x1EA8E2F, 0x556F3D, 0x66EF89, 0x24F15BA, 0x2B03146, 0x2683E07, 0x1564FB, -0x22B483F, 0x3BB55BE, 0x1957E97, 0x6BAF4, 0x6D3F7E, 0x23B3E45, 0x3C4A2B2, 0x13A9AE8, 0x16E8ABF, 0x3D41E8, 0x36D8328, 0x3921792, 0x98EE4F, 0x130FA60, 0x65C49B, 0xD6E91A, 0x2803815, 0x1006E96, 0x89CE33, 0x394A21, 0x3D90785, 
0x3328F7F, 0x965715, 0x3785624, 0x7BC3DC, 0x3A148AC, 0x3711E8C, 0x3D556DC, 0x35B6F86, 0x435DB9, -0x3D89C27, 0x3535B83, 0x553D74, 0x3DA94E0, 0x25E727, 0x2C5189A, 0x1049EE9, 0x139CBE5, 0xF0F986, 0x65C87D, 0x2809E2E, 0x29B448A, 0x2AFAC34, 0x21FC25F, 0x7DE6B7, 0x1758661, 0x22DB9C0, 0x2046B8F, 0x2FE5C10, 0x10705F, 0x1A2D44F, 0x58539B, 0x70F159, 0x1BB1BAC, 0x7C74F2, 0x249C60, 0x2630694, 0x23029AD, 0x113592C, 0x6E1BBD, -0x7790F9, 0x33B9D81, 0x3690937, 0x31C2FAB, 0x5B4CCB, 0x1AE0DAE, 0x39E6B9D, 0x3556332, 0x587424, 0x2E6394, 0x36849F0, 0x1E2FDA3, 0x272D4B3, 0x2833AEE, 0x6C419F, 0x273F2A4, 0x10C7878, 0x24FD8BB, 0x202EF80, 0x357CEC, 0x22F11A9, 0x1BA44E5, 0x371883A, 0xB8948, 0x7808DF, 0x3C49E13, 0x242D76A, 0x172AB66, 0x964DD6, 0x6CEF23, -0x1E551FC, 0x2B1D9ED, 0x3D36D18, 0x3776C56, 0x5A0BA1, 0x19E3E1F, 0x248F788, 0x1CFE6F7, 0x2236E6D, 0x3EC235, 0x83BBE0, 0x308F50B, 0x18CDA4F, 0x1DA19, 0x21581A, 0x110218E, 0x16DB95C, 0x77FA29, 0x2C7457D, 0x3E7BBA, 0x30D0635, 0x1B00F4F, 0x3B5226, 0x22A56F8, 0x4174F0, 0x38CBEB8, 0x299D2AE, 0x3390AA1, 0x2954EB8, 0x6BDF6B, -0x38545E2, 0x7E0918, 0x1E538A3, 0x283C3BB, 0x2A9E37, 0x3A69F65, 0x294F632, 0x2316148, 0x3A05B65, 0x64C587, 0x3E97E93, 0x280A291, 0x2E0E777, 0x11AFBEC, 0x13728E, 0x22FA665, 0xE2D121, 0x35F6131, 0xE2A1DD, 0xFCA8C, 0x1D067FA, 0x12F5603, 0x102ECC4, 0x190B88B, 0x40F2F7, 0xC2F728, 0x2E9B6E0, 0x37F9AB3, 0xB8B496, 0x5068AA, -0x22923FF, 0x2A10B02, 0x1965A8, 0x228401B, 0x67C39E, 0x1460A84, 0x32E7FD5, 0x25188F5, 0x829066, 0x2E735C, 0x3DC2DA4, 0x3B8FF2F, 0x12BD0C6, 0x182A1EE, 0x5BF6ED, 0xE1608F, 0x3384C3A, 0x28E85E4, 0x2CF9400, 0x932CE, 0x38CBFB8, 0x33E2814, 0x37BD793, 0x30062FA, 0x4E89E2, 0x35060DE, 0x29E590B, 0x16CAA, 0x29344FC, 0x542A38, -0x663E55, 0x3F66AB, 0xE4A1B, 0x7DECCF, 0x5158BF, 0x24119FE, 0x3A0BD96, 0x1FC4060, 0x1A6854A, 0x323470, 0x27480D5, 0x1BBDA44, 0x2FFE5C9, 0x84D37D, 0x409A90, 0x36C7013, 0x230E4BA, 0xA79DBE, 0x22C38ED, 0x73F2C4, 0x37E2DFF, 0x14182E4, 0x11D9DDF, 0x49E18, 0x1534F9, 0xFD49A7, 0x3F8A6B9, 0x1F29F47, 0x3F9D54E, 0x7A2C0B, -0x30FE7FB, 0x1346CA, 0x350E19E, 0x2F60606, 0x56381E, 0x1DF4AC3, 0x25C3092, 0x36855C8, 0x33A3B79, 0x8ACAE, 0xA0D20B, 0x3C69C6B, 0x2C3AC44, 0x2364B92, 0x623EDC, 0x5A0C95, 0x1A9F962, 0x116E549, 0xDCC56C, 0x20A9BA, 0x1F5B55D, 0x161CE02, 0x2FF5376, 0x10BAFBC, 0x23C44C, 0x3724573, 0x25B6487, 0x29EB56A, 0x16DCAE2, 0x321781, -0x350A489, 0x6D0BD4, 0x70B2CC, 0x1BA16E3, 0x31F0B3, 0x12CE34D, 0x2972313, 0x2EAC504, 0x9ACDE1, 0x1AF8EA, 0x3AFD62F, 0x31789B5, 0x3F9A69B, 0x341091E, 0x21E399, 0x8EFB27, 0x3599DAA, 0x2FD46E6, 0x2E7FA69, 0x476212, 0x4168DE, 0x3ED94A, 0x1408074, 0x2F9979A, 0x5F7570, 0x1D81C29, 0x1B0C9E7, 0x2D6D016, 0x2401F13, 0x656548, -0x35219C9, 0x2D6F4DE, 0x10F7BAF, 0x27CB88, 0x2517, 0x1C1198B, 0x98E6D4, 0x28110D2, 0x3CEF8F7, 0xF3C0D, 0x3374EEF, 0x984AF8, 0x20AA355, 0x10C63FE, 0x83474, 0x157A00C, 0x3C7078D, 0x2614CB9, 0x3D77A2D, 0x20C359, 0xD05268, 0x2520A8D, 0x35D7D31, 0x1C6D1AD, 0x421657, 0xD911A6, 0xDB08BA, 0x7E1CA3, 0x1C019E9, 0x4D072F, -0x1555E9E, 0xB1BC40, 0x1697902, 0x2BDF30B, 0x4C8C7E, 0x44817, 0x2042CB6, 0x9C1629, 0x3006C1B, 0x25110B, 0x39EAFF7, 0x2701AFC, 0x34E81BF, 0x547D4A, 0x6CC36F, 0xB62068, 0x1CE9A85, 0x382176B, 0x372276C, 0x47DCB0, 0x3670F41, 0x37FAB0B, 0x417FE9, 0x24EE5CF, 0x625B5C, 0x133C1A, 0x3245E4C, 0x3B2B5F8, 0xF8264A, 0x6BD35F, -0x1409E5E, 0x14E46A1, 0x276703B, 0x3F458D8, 0x7981D8, 0x2A23CB6, 0x172028C, 0x2DC3DB4, 0x1FBD22C, 0x67356A, 0x301ADAF, 0x248DBA7, 0x2BD6618, 0x153940C, 0x7A1E9, 0x69727E, 0x189C667, 0x1453D, 0x1058F3B, 0x25D67E, 0x3A80738, 0x1EEDA98, 0x3C7418E, 0x132D772, 0x3112BE, 
0x1643F04, 0x2B5B4E0, 0x334DAD9, 0x32FAAED, 0x116112, -0x166FF07, 0x8EAF8B, 0x36DE326, 0x20890, 0x4D7803, 0x497808, 0x3419B01, 0x27009BE, 0x2CE73A1, 0x40DB29, 0x5602A5, 0x397ACE0, 0x340D6E5, 0x3CAC6E6, 0x52F227, 0x3A18394, 0xB13065, 0x30BC51C, 0x108FB89, 0x6D8BCA, 0x3B07F72, 0x1833FA3, 0xC6D6E, 0xE5C3E5, 0x7DD66C, 0x20B17CC, 0x3A9ED66, 0x172966A, 0x2C00583, 0x75FCF8, -0x32402B5, 0x317A8E6, 0x2DD5BED, 0x1097FB, 0xDC360, 0x31B6631, 0x777CB2, 0x176AADC, 0xF8EABA, 0x205EE9, 0x3C98498, 0x32D38CC, 0x72C7A2, 0x1DDB55A, 0x7D12EB, 0x1599FE, 0xE6F06D, 0x14278E3, 0x294FEB2, 0x4600F0, 0x2F14304, 0x274E8DC, 0x3742104, 0x82C8B6, 0x7B54E0, 0x2A35739, 0x1E5889C, 0x30C4D56, 0x22C4457, 0x27A117, -0x1C35E1B, 0x2CE7586, 0x14866CF, 0x346A767, 0x5CB96F, 0x34FFFEA, 0x173BDFE, 0x21D1AF4, 0x2CAEA4C, 0x4A73D7, 0xBE86AC, 0x11AD2A0, 0x373444B, 0x15E7A3, 0x2769B5, 0x72F372, 0x344279D, 0x15B6AB5, 0x2872FA6, 0x2BCCFB, 0x332F6A3, 0x98454, 0x1C53C00, 0x1F68558, 0x7ACB28, 0xD90DD9, 0x1571823, 0x2E13215, 0x2CA1E36, 0x7731D1, -0x1A0649, 0x8AFD7B, 0x467321, 0x28603B2, 0x592B5F, 0x19CE66F, 0x2F86D6B, 0x19D876, 0x316ECD2, 0x484C1C, 0x330B06C, 0x1331096, 0x22CE08E, 0x17C3934, 0x276606, 0x187AEFF, 0x2095912, 0x16A6D90, 0x331FF11, 0x3A835F, 0x806D69, 0x20074B3, 0x3839F4D, 0x3B32957, 0x41D767, 0x1F1E74F, 0x3AA7F40, 0x11CBF2D, 0x25D1888, 0x74D01B, -0x29F80CC, 0x38A02B4, 0x3231E43, 0x29A40A6, 0x5CDF66, 0x862CD9, 0x195680F, 0xC79E8D, 0x2D6343A, 0x388E38, 0xFF83A2, 0x2AA9212, 0x3E745D9, 0x3B935B2, 0x14D6FB, 0x12109EA, 0x1B736A4, 0x3541042, 0x335D752, 0x1BB7B9, 0x39B8505, 0x301412C, 0x367A3A3, 0x220ACC4, 0x35A3C5, 0x1A6473A, 0x1E4F58D, 0x2034678, 0x1F88080, 0x66ABCA, -0x1300021, 0x3FC76B, 0x26844A9, 0xD95134, 0x18F290, 0x369095D, 0x1B4E6E3, 0xCC8203, 0x1591746, 0x36490F, 0x2D3562E, 0x135CDA2, 0x87E941, 0x2811561, 0x7F8108, 0x253E40B, 0x2C395B5, 0x37BB93D, 0x3AA5814, 0x3F413, 0x37267EF, 0x1C5EDDF, 0x3305984, 0x29297B9, 0x6C5D9D, 0x2060A49, 0xC6370D, 0x239E725, 0x3E39E68, 0x274397, -0x95BAB0, 0x1E596D0, 0x26E3BDA, 0x14171E1, 0x6292B2, 0x32D6733, 0x3EDA819, 0x2859451, 0x3D0090A, 0x37C560, 0x18F1423, 0x395BAE0, 0x2EE1151, 0x21BC308, 0x634519, 0x25C75E1, 0x17FE1CE, 0x396A927, 0x1CD28CA, 0x178CDC, 0x244AACC, 0x2B6EC90, 0x2DDCFF7, 0x2837F65, 0x76518A, 0x1071219, 0x3072320, 0x26DC161, 0x130428F, 0x584D4, -0x1E9ACAB, 0x1CA0A82, 0xEBD272, 0x270D145, 0x1298E4, 0x21C0E6D, 0xF41672, 0x2BC6032, 0x2E1F749, 0x6072C8, 0xD36F26, 0x328AACA, 0x2A2A36E, 0x2D6AB93, 0x2A977C, 0x39546A7, 0x1F50E82, 0x1D29F15, 0x3C3B598, 0x4D60A, 0x26E4402, 0x3042004, 0x117134B, 0x2688562, 0x7677EF, 0xF07A40, 0xDE5C9E, 0x20ACBD1, 0x3C2D5FC, 0x32C0DA, -0x366DC2E, 0xFF46E0, 0x30ECBC8, 0x21B1399, 0x6CD072, 0x339DC8A, 0x12168FC, 0x3E8BF35, 0x217C891, 0x6E7E92, 0x1943BAE, 0x674E82, 0x12A6A9D, 0x25FBB61, 0x43FA51, 0x3242FE, 0x4E6281, 0xBE9F91, 0x32C4C4B, 0x4A100D, 0x110A692, 0x3F61AFA, 0x3937BE2, 0xA4DA35, 0x614FD8, 0x12D578F, 0x2A62C6A, 0xA7EDB5, 0xF86C37, 0x46F1D2, -0x2813F36, 0x1319C97, 0xE918BF, 0x1E04A7, 0x68BC89, 0x28F5C7F, 0x1940EB8, 0xCE5FF5, 0x139085E, 0x2B6E0F, 0x65895B, 0x33651FB, 0x36C9A97, 0x198A03F, 0x7AA90B, 0x158A0A0, 0x2BCCB7C, 0x3713EBB, 0x2B00AE, 0x6A748D, 0x1E83397, 0x26D7586, 0x1B69DF7, 0x11FD921, 0x169349, 0x2B32881, 0x615486, 0x20C9E75, 0x9DA9D1, 0x20791E, -0x2C70BC9, 0x3100F88, 0x21D3D25, 0x146399B, 0xBF079, 0x13711DE, 0x35725C7, 0x232C45D, 0x1A8C2F8, 0x66BD2C, 0x29E53CA, 0x20A796C, 0x8A2607, 0x139C770, 0x30ED41, 0x20F1D39, 0xE1627A, 0x3C5C3FD, 0x487CDF, 0x5A881A, 0x987A4F, 0x24E50DA, 0x2E2D27B, 0x1784EBE, 0x321FE4, 0xF33EA0, 0x3ADD420, 0x2BBDC6F, 0x14BD17B, 
0x2166D, -0xCD2FC9, 0x9A86C3, 0x3D6D150, 0x24F56F0, 0x1141BE, 0x3C676C9, 0x37D81B, 0x38C4FD2, 0x1AC02B5, 0x4059E2, 0x2C6B505, 0x26D0273, 0x38E5070, 0x3A2B3D1, 0x68F020, 0x1DF0CFC, 0x35DF47D, 0x2FF1875, 0x1DCD458, 0x66EB37, 0x17A32AE, 0x120AAD0, 0x3935860, 0x11F6884, 0x21175F, 0x99ECDB, 0x3FC3368, 0x38E7A07, 0x5DEB8A, 0x26AE5F, -0x120EAF7, 0x1C3A94, 0x3F81A9A, 0x3AE9CE0, 0x2581FE, 0x1FA2A7A, 0x3C4DFC, 0x2C6249E, 0x1473F47, 0x7FE93C, 0x3777F7E, 0x136F729, 0x23FF2D7, 0x2E9D3A0, 0x562DA2, 0x1D77A2E, 0x2D3E182, 0xEC8543, 0x84BC8D, 0x3A0F65, 0x270EBC6, 0xB8FFA8, 0x1E11F84, 0xB1A8A2, 0x452432, 0x26A8576, 0x5621EA, 0x29A3808, 0x14F0FF8, 0x46F49D, -0x1B0D388, 0x24F3B7F, 0x2D2BBCC, 0x1DBBC2, 0x4DB312, 0x3654AD9, 0x33558DC, 0x115F1F2, 0x25C00D3, 0x4C6446, 0x1D7130C, 0xB36FF5, 0x2AE534D, 0x340F6C, 0x619895, 0x3AF1B1A, 0x252DCB, 0x8F7736, 0x1AA7C96, 0x1F6CA4, 0x259772A, 0x2C0F58, 0x23A8CBA, 0xF635AF, 0x24E5A2, 0x14F6D6F, 0x38D032, 0x466709, 0x1FB953A, 0x287BA2, -0x2D607F0, 0x832208, 0x3171873, 0x94277, 0x44FD58, 0x3F6BA4A, 0x3026182, 0xC0AF35, 0xC3BDDF, 0x6CF531, 0xB48CE4, 0x459E80, 0xB48AA8, 0x1DD6C36, 0x649F4C, 0x298D33D, 0x95A0FB, 0x21A659A, 0x1F441, 0x651479, 0x36E16DA, 0x121D047, 0xBF2155, 0x191D501, 0x411D03, 0x162458A, 0x7C4D10, 0x1368C23, 0x18CFDF7, 0x4F36B7, -0x27A89A4, 0x3770293, 0x1542A98, 0x369BAB3, 0x55D8A5, 0x1ED45, 0x3ED22C4, 0x2B95C3, 0x332BE9C, 0x5C7785, 0x16AFB79, 0xDA7F48, 0x669A64, 0x3C43A50, 0x1F405E, 0x327B6F, 0x3D20C6F, 0x82D755, 0x1F5C7C6, 0x2BC1B6, 0x17777D0, 0x2C573FB, 0x13268EA, 0x3560366, 0x517370, 0x1C9DB39, 0x6DD5C0, 0x3621081, 0x32D4BDE, 0x234D84, -0x1DAE49C, 0x313EF77, 0x1E1970, 0x1091D79, 0x3BA8D8, 0x35E7008, 0x83DAB5, 0x3345B07, 0x314612F, 0x275CD5, 0xD3796A, 0x260CE22, 0x2F35EB, 0x37F7847, 0x1B3A42, 0x12B5F4D, 0x1523F82, 0x3CC9946, 0x2D43C98, 0x119917, 0x20A3F82, 0x37A5568, 0x3FF4622, 0x28E9831, 0x6A552E, 0x150372A, 0x8C04E0, 0x36A7C79, 0x2546379, 0x18083B, -0x13B5455, 0x2DD375F, 0x33455F, 0x269C3FE, 0x523EEA, 0x2356271, 0x129CCD7, 0x31D2599, 0x3D810C3, 0x3BB011, 0x1D6EA7C, 0x10D32E8, 0x17801EC, 0x2583FAD, 0x69B632, 0x1EBF07, 0x14105D5, 0x383046C, 0x3EFE748, 0x1470B, 0x9BC5CC, 0x147D301, 0x28EDE95, 0xE3CD7B, 0x1C1246, 0x1E83A38, 0x28EA450, 0x127B09C, 0xF9D044, 0x44DAAF, -0x2FCD10, 0x11C5EDB, 0x114B0E5, 0x1AC9B56, 0x518AB4, 0x341E833, 0x2B1B208, 0x672528, 0x267507D, 0x2247FA, 0xF327A2, 0xC31970, 0x21B4ABE, 0x4ADDF, 0x3AC74E, 0xE86B3, 0x3BF5A53, 0xE4E35D, 0x42FF98, 0x7C382E, 0x2B8461C, 0x38E0A65, 0x1E1AF37, 0x714D8C, 0x4D4748, 0xF7B7B1, 0x3C59E21, 0x643AC8, 0x1AAC693, 0x5AE1BB, -0x29DF80, 0x2C09642, 0x3FA163E, 0x218A9E4, 0x623126, 0xE09807, 0x3890BC7, 0x3D2B6E1, 0xC2ABB3, 0x7BF961, 0xBFF70A, 0x17A7A84, 0x126EEDC, 0x8CEE53, 0x66B548, 0x14D30AB, 0x1C3B916, 0x353E70C, 0x1B22044, 0x79B000, 0x1710C1F, 0x9E4B24, 0x3539435, 0x14606A7, 0xC7BF1, 0x1E3779C, 0x257B9DA, 0x2DCDFC9, 0x1B2E773, 0x44871C, -0x9A046B, 0x202EFCB, 0xCD90D1, 0x2631174, 0x5445C5, 0x75FB94, 0x2CC8E1A, 0xF35DEF, 0x14D5598, 0x5B0D23, 0x33A7973, 0x37AA096, 0x2147BE1, 0x31386EF, 0x10658E, 0x2D77504, 0x3D7A1FE, 0x169448A, 0x32529F7, 0x55F5D3, 0xAD6BA2, 0xF9C18B, 0x2A31A9A, 0x38FE783, 0x36C0A7, 0x10AEBC, 0x359485C, 0x3598C4B, 0x3F22E7E, 0x1D031D, -0x113889E, 0x870410, 0x2477E36, 0x3B879A9, 0x774B77, 0x3785FD7, 0x316E281, 0xADA124, 0x3C60623, 0x5A6C0D, 0x272D7BE, 0x3735517, 0x2966F4A, 0x3FED9AE, 0x38100F, 0x3E4A4D0, 0x32F8D7, 0x175210, 0xB7C14B, 0x4489BE, 0x326899F, 0x288900E, 0x3928A03, 0x283FB04, 0x5AE4A0, 0x2795EAA, 0x3EFEE00, 0x253289D, 0x2713B3C, 0x34917E, -0x20CBBB3, 0x24D9D31, 0x71D64B, 0x1418CD3, 0x25C098, 
0x1EE34E1, 0x8FD9BC, 0x1E48A7, 0x37D23C5, 0x3A960A, 0x1E440BB, 0x3CE19A5, 0x15E6659, 0x2FDBA02, 0x577A0F, 0xB25496, 0x1066C3D, 0x65B8EF, 0x28C2E79, 0x44176, 0x2013E6F, 0x21C177C, 0x1CE77A9, 0x120EAA5, 0x77D0B2, 0x38EFFD7, 0x3A45EE5, 0x2C7F309, 0x16FFA50, 0x8F1E5, -0xD613C8, 0x1058A6E, 0x149FFC2, 0xD735A2, 0x140F2E, 0x2E4F9FA, 0x267DA8F, 0x33B5388, 0x3374C03, 0x7ABC8E, 0x1D23EAF, 0x2B48D41, 0x2C7CD3D, 0x24DDAC, 0x75E73F, 0x20EC5C4, 0x12998ED, 0x2EC9564, 0x22BF87A, 0x511ADE, 0x1ABCA89, 0x17F938, 0x1F6CBB0, 0x1CEC328, 0x2838DE, 0xD28C8, 0x299962E, 0x1EC800, 0x1872459, 0x48AABA, -0x304D4E2, 0x1D672AD, 0x3ECA222, 0x3033A16, 0x106DD3, 0x1F69D03, 0x38738D0, 0x2E10A1C, 0x43A393, 0x1651B2, 0x133E136, 0xA6979C, 0x27B9473, 0x11F724D, 0x58C02F, 0x7B02AF, 0x32E95A5, 0x1B4809D, 0x45E049, 0x435C25, 0xCD9315, 0x25E68FC, 0x3DDBD56, 0x29908AB, 0x2F02B0, 0x331F32D, 0x83D401, 0xB5D239, 0x8E08CB, 0xAB833, -0xBB60DB, 0xF46FEC, 0x2EC2A7B, 0x280F1BA, 0x2342C2, 0xD5282E, 0x39B9685, 0x224BAC5, 0x77710A, 0x5B9A42, 0x143E056, 0x1419051, 0x14E6018, 0x1F02125, 0x6D7C37, 0x3DB1CED, 0x2FDC7B4, 0x9794F8, 0x3056AC4, 0x5150DB, 0x33A7FBF, 0x14286CD, 0x70500B, 0x30F4C91, 0x140BE5, 0x25B7911, 0x16FFA5B, 0x3F3E500, 0x156EC4C, 0x77CEA5, -0x3A973C6, 0x786A75, 0x28D32AB, 0x260C525, 0x3897AC, 0x323C9C3, 0x3DD1D92, 0x17CD9E0, 0x2EB9066, 0x2E5ECB, 0xF12433, 0xBA9592, 0x269C43E, 0x392023, 0x3A515A, 0x36BB343, 0xDB00F0, 0xE4317D, 0x14D207, 0x44CEBD, 0x21B0D76, 0x2195B0, 0x1BE6890, 0x3F2E402, 0x2F8513, 0x8A0E83, 0x8FE408, 0x2A732E2, 0x353BF4D, 0x3828C2, -0x1A8527D, 0x1F593E2, 0x151DBF1, 0xAE5958, 0x59EBB4, 0x7DC5EF, 0x2F1C917, 0x15EC7D7, 0xA832DA, 0x191B68, 0x19F9ADD, 0x45CBEB, 0x3DCF893, 0x2EEACBA, 0x239B6C, 0x3002DD1, 0xCBE729, 0x18B5769, 0x123E163, 0xC1405, 0x3741EA5, 0x2B7F773, 0x25676C7, 0x2E78B86, 0x3B39C4, 0x7436FE, 0xD6897E, 0xAE0C51, 0x3B2DF74, 0x690D8F, -0xDA10C2, 0xA9863B, 0x3832D78, 0x2CC4D33, 0x4F2A84, 0x3C955A4, 0x207DC6E, 0x239835A, 0x3B592B9, 0x457F88, 0x1479985, 0x3ADC70C, 0x2B32C27, 0x215A6A, 0x4AE918, 0x275EB11, 0xBA5E16, 0x2D36A5F, 0x266E0DF, 0x619CB1, 0x1484E40, 0x39644AE, 0xAF00E7, 0x3A1FF6F, 0x3B5831, 0x12B039D, 0x9DE70B, 0x61A49A, 0x381FD24, 0x3D4B81, -0x222B55C, 0x66C3E8, 0x14E2AA1, 0x46D0BF, 0x265C1B, 0x1DD7637, 0x34A3946, 0x24B26B4, 0x2403796, 0x3D2DA7, 0x1F21D63, 0x1961AD, 0x2E1899B, 0x3679A87, 0x4CE62B, 0x32F8C92, 0x79D4E4, 0x3834B67, 0x874261, 0x390B78, 0x3F1FC2F, 0x3DF1CF0, 0x3AF91AD, 0x2A14529, 0x78C636, 0x11E7533, 0xECD7F8, 0x3FD7AEE, 0x249E3DB, 0x7FBD19, -0x23D77DE, 0x2AEFD0D, 0x3F6A41A, 0x3C62E5A, 0x1B27FD, 0x39A981C, 0x192877F, 0x3F9ADA2, 0x214C302, 0x36EFC0, 0x1830854, 0x2218A97, 0x26CB5BD, 0x17995B, 0x23D7C9, 0x364FDCF, 0xCC9316, 0x265F452, 0xDD88E1, 0x36627F, 0x6D5EA4, 0x3D4E497, 0x3022564, 0x21A8644, 0x17C7CC, 0x3B009B, 0x36D4A95, 0x194BF90, 0x3247E2E, 0x15192D, -0x39EC6E2, 0x3E8F069, 0x2ECC80B, 0x3AA65DC, 0x48FCA8, 0xF1824, 0x1A8EE63, 0x1E22FEE, 0x15806E0, 0x46A8C7, 0x895FCC, 0xF100AA, 0x1CE22CB, 0x27010C1, 0x1D1DFF, 0x5971D7, 0x3BF9E3A, 0xB2C1A, 0x20EAD6A, 0x79C6A0, 0x1BF2518, 0x31EF297, 0xA02379, 0x22626CF, 0x241935, 0x25F470E, 0x10B273E, 0x459C9C, 0x33A46C5, 0x4481C2, -0x226DECA, 0x137A87A, 0x21016B0, 0x3BB8342, 0x26EE3C, 0xFA4F26, 0x331AF36, 0xFBDE36, 0x25C65D9, 0x4D1470, 0x31FEF52, 0x1CA9C1B, 0x2EC1E05, 0x1A688FC, 0xF75FB, 0x64CF3B, 0x392C12A, 0x77332A, 0x2A46E51, 0xB6373, 0x3D088AF, 0x3CAF082, 0x17471A8, 0x16B3900, 0x586B0D, 0x3C51433, 0x1AC2528, 0x1C7A0E, 0x8F09A3, 0x1752A1, -0x1770EA1, 0x30AA4ED, 0x2B43643, 0x346DC7B, 0x536CB9, 0xCC6B3F, 0x2C14974, 0x2FC36BF, 0x3BB0EB7, 0x1F4DCF, 
0xBF57F0, 0x5A774, 0x144928A, 0x2A8828D, 0x1336C9, 0x3AD7A1, 0x3681A1C, 0x203DBBC, 0x20A0F07, 0x5E3347, 0x18FDAF2, 0x25DCCF4, 0x3867F19, 0x1E85EE, 0x789AF5, 0x1695613, 0x1C30535, 0x256F799, 0x10BA114, 0x794523, -0x226A018, 0xBE4D9E, 0x1FBE6C1, 0x723E73, 0x11BEDA, 0xB30929, 0x39B77C9, 0x755720, 0x78F951, 0x7706E9, 0x26C9059, 0xE074E, 0x424446, 0x1B0D9E3, 0x7E0191, 0x1D2BB07, 0x10D5698, 0x29E2602, 0x75E721, 0x68BAE0, 0x5DF659, 0x32F3FCA, 0x933F21, 0x1ABAD5F, 0x2F7EE, 0x3266467, 0x3C1E305, 0x2103CE0, 0x36D45C2, 0x39B4F, -0x1A4AE1B, 0x383FB18, 0x695D5D, 0x54A9DE, 0x33D37A, 0x210DED9, 0x139039A, 0x3213EA6, 0x10DCEF6, 0x1F1D39, 0x359FD14, 0x1662DBB, 0x2283F63, 0x370EB94, 0x57922A, 0x376F4A1, 0x2A43862, 0x1DAEE39, 0x3AF662A, 0x27F3DB, 0x3804B3, 0x1E77670, 0x2D73181, 0x30644B8, 0x511D72, 0x32F3225, 0x7DB492, 0x197F88E, 0x267370A, 0x560099, -0x3D82FE4, 0x37E5DED, 0x2226DA6, 0x2EE98F6, 0x76F746, 0x3F13BD7, 0x3EB3F0E, 0xE180B5, 0x13D9BC, 0x4A31EB, 0x3CFE28F, 0x39CF579, 0x20B18AC, 0x6F0C30, 0x19AA73, 0x3400317, 0x25E7F9C, 0x3C9FA91, 0x1C68271, 0x6795CE, 0x33700BA, 0x1554064, 0x114D93D, 0x3C22C7F, 0x3850EA, 0x1EDCA71, 0x316AF22, 0x25EF450, 0x2122F7E, 0x1BE5DB, -0x3CF082B, 0x19F4FD3, 0x3C45776, 0x1B12092, 0x673B6E, 0x3E0ED6D, 0xB04A97, 0x3ED16F2, 0x1C8AD77, 0x6EE6, 0x28D0C62, 0x284F051, 0x26E4B47, 0x1E68DE3, 0x405648, 0x1D3AB58, 0x1AD54EA, 0x3CFD0BC, 0x1E79137, 0x21761C, 0x31050FB, 0xDBB4FA, 0x1D366F, 0x4C4B74, 0x2E67DF, 0xDBF2AD, 0x113129A, 0x39C487, 0xADB5B8, 0x784496, -0x378B65F, 0x2D59D74, 0x1DF0E07, 0x2A8F4B0, 0x336262, 0x38C2E2B, 0x29599F5, 0x1E2A320, 0x365B769, 0x4F668D, 0x7B8470, 0x1559E57, 0x2BB1E21, 0x16CBBE0, 0x306190, 0x32206B6, 0x14C8451, 0x3A39AEE, 0x1FFB835, 0x1F8737, 0x3B1A0C7, 0x16314A8, 0x3FFDDAC, 0x3813C35, 0x6E3C4C, 0xA3990F, 0x3FB1175, 0x22C2FDF, 0x3DE78AB, 0x4B5340, -0x3D1F332, 0x1F22DEC, 0x2E1A053, 0x112B8D7, 0x552927, 0x1057F1E, 0xDB439B, 0x33D7423, 0x2799172, 0x5AC40E, 0x5E46C7, 0x3DEDFE1, 0x3150848, 0x19CDBBF, 0x74BDA8, 0x3AF4F35, 0x2B3D88A, 0x27BB891, 0x273CE59, 0x14BCEF, 0x331B9B3, 0x4D5365, 0x259F9AA, 0x2864A57, 0x27E855, 0x662ED0, 0xA6A384, 0x37A71A8, 0x3E7B110, 0x3BBC43, -0x1DE5341, 0x2A2C72C, 0x15498BF, 0x201A34D, 0x343277, 0x1F25A48, 0x201F690, 0x2552E3D, 0x738BBF, 0x1BB6EE, 0x3D3AFFC, 0x26F7B42, 0x17A808D, 0x14A66A4, 0x290F1C, 0x22F8939, 0x37F5E55, 0x211FDA8, 0x39A1A18, 0x1BF7AA, 0x860765, 0x1B6BE6F, 0xA0B2AB, 0xBC3AD6, 0x7BEF6E, 0x439B94, 0x1BEAADF, 0xBDA874, 0x1DD42EE, 0x17EA8, -0x39C9A7C, 0x3FAC888, 0x3A13F8D, 0x34A21EC, 0x35CEC0, 0x1F1606, 0x2A52B18, 0x176768A, 0xFE2E98, 0x747055, 0x1EAC410, 0x253E472, 0x3732378, 0x3AE0CDF, 0x55B22A, 0x1A2FE94, 0x23643CA, 0x23A53F, 0x22BB1A4, 0xAEC06, 0x16182EE, 0x1858AB, 0x1475405, 0x3C6B079, 0x6A327F, 0x12DF3C8, 0x35C2552, 0x3521968, 0x18657D9, 0x3F93F4, -0x170D546, 0xEF28A2, 0x13C2498, 0xC4A5AD, 0x2716B9, 0x2B6F8B3, 0x2C04C99, 0x369FF42, 0x212DA8, 0x76F29B, 0x2216249, 0x21D2488, 0x2CFD8E2, 0x34E9B06, 0x4F2FA1, 0x2B8F954, 0x199A5FA, 0xCE50EE, 0xA3FB03, 0x37C33E, 0x2FD1DD1, 0x10678AE, 0xDAA7D0, 0x10A66E5, 0x1F04D, 0x1E5C7E6, 0x1AAEF0, 0x32A95EC, 0x18183B5, 0x3A24C6, -0x3960F26, 0x1D93857, 0x1C3A0DB, 0x357567E, 0x1D5973, 0x6D9149, 0x3098237, 0x3518F3D, 0x1186DCB, 0x1D80E0, 0x1C8B09C, 0x1373352, 0xD5E226, 0x28918CD, 0x1F03E7, 0x31FC2AD, 0x38E21C6, 0xBD32D6, 0x2B93CA1, 0x418588, 0x3E1AFA2, 0x2C1C6EB, 0x18DC3EF, 0x3202F7, 0x799BA, 0x1CC4A76, 0x9CC88B, 0xB23A6B, 0x22B229, 0x13859F, -0x23C5ACB, 0x12B3099, 0x2DF6019, 0x16F42EF, 0x459FA5, 0x6F8CC1, 0x155543C, 0x2A751B0, 0x3519667, 0x9E5FA, 0x32958A3, 0x245BBDD, 0x24316B3, 0x306024E, 
0x4AAAA5, 0x210F594, 0x140F8D9, 0x23B98E1, 0x1412289, 0x620EF5, 0xB8ECBC, 0x2258F23, 0x15A5A2, 0x2C6DD03, 0x6AEE46, 0x19CFEA9, 0xE581BD, 0x28B167E, 0x38DDDF2, 0x13A579, -0xBB8DE3, 0x364BD87, 0xEAB45A, 0x690918, 0x53068A, 0x2379578, 0x18D51B7, 0x8609B1, 0x29F3B35, 0x7BF38, 0x1282DE1, 0x31DC0C7, 0xFFCF84, 0x18E67DE, 0x402AED, 0x15F6D45, 0xF7480A, 0x2ECDFB8, 0x15CD034, 0x3702E2, 0x39B97C8, 0x2CB62E5, 0xC51B8D, 0x34A942C, 0x617526, 0x13017DB, 0x3D89E0C, 0x1D92D86, 0x37B157E, 0x2F35EE, -0x117973B, 0x2DA524D, 0x309AEEC, 0x1D382EB, 0x7A111A, 0x39F63D, 0x31B6A3B, 0x3746B82, 0x18C4A0C, 0x421707, 0x1DA88D9, 0x1B447CF, 0x28875D3, 0x2CD83CA, 0x7BAEBC, 0x13D6CBB, 0x26D8B42, 0x28FD982, 0xBDBB7C, 0x10F17A, 0x3F40626, 0x3BE90D4, 0xE34FE3, 0x197046, 0x731CA3, 0x3960895, 0x159E209, 0x355C618, 0x293D318, 0x7F906, -0x2657142, 0x3673443, 0x21CE361, 0x114B761, 0x2B5F5D, 0x104B69A, 0x77C17, 0xDBA3E, 0x2F133F3, 0x533723, 0x1BBCCB5, 0xE12BE8, 0xDFD820, 0x1056576, 0x4E67E9, 0x1E13646, 0x369420E, 0x29680F8, 0x15CE965, 0x6887A0, 0x2CE7343, 0x37477D6, 0x16C8E93, 0x3CA3B3E, 0xD4076, 0x8074C1, 0xBE1526, 0xB60BA, 0x529069, 0x5EB2A3, -0x276B27E, 0x3DB49E9, 0x114E49F, 0x102B354, 0x15F7CA, 0xD7DA95, 0x7E69D4, 0x2B5CC17, 0x2247DE7, 0x3BEDBE, 0x3F74FF, 0x10EF3AE, 0x1A4C5B6, 0x3CEA8CC, 0x88B1A, 0x260C4A9, 0x2531F82, 0x3151DE2, 0x3F20482, 0xA0770, 0x7C1937, 0x2DD5E83, 0x1C88F09, 0x300FBF6, 0x34B797, 0x28CC49, 0x38FB6CB, 0x122C051, 0x3B16B14, 0x66DB34, -0x3DC8CBF, 0x37834F4, 0x18CF95F, 0x3204860, 0x797897, 0x310F89D, 0x11B465D, 0x9C51FD, 0x1600FE0, 0x533A50, 0x9FD211, 0x3C70243, 0x1D82B60, 0x1E6BD72, 0x4A7C34, 0x8CF5A5, 0x30FFE93, 0x21F4BF, 0x2984C1A, 0x6949F4, 0x27ABCDC, 0x1325271, 0x23C4D81, 0xCC59A1, 0x419A5E, 0x10895E0, 0x191BDAF, 0x145A9DE, 0x1126951, 0x497CC1, -0x3E126F2, 0x2CC491F, 0x347769E, 0x8CE59F, 0x323C83, 0x178127D, 0x36D34F, 0x380C52E, 0x2239EE4, 0x42A0E1, 0x2E34E7E, 0x2C04711, 0x34AA3A6, 0x22AA682, 0x79F416, 0x308FA4D, 0x1C28975, 0xDC8942, 0x632F27, 0x2BB286, 0x606819, 0x391AEC1, 0x2982741, 0x14BECB3, 0x2790C, 0x2D0AF10, 0x2E4B431, 0x14126DB, 0x2B8187A, 0x32AA96, -0xD98EAA, 0x19C0324, 0x3136137, 0x1943029, 0x4D1DFE, 0x9AFF20, 0x1FE3BBE, 0x2E37B39, 0x128106E, 0x4836AC, 0xA02B8, 0x3041DAA, 0x3A37F37, 0x28919EC, 0xD063F, 0x16B7E7B, 0x3C985ED, 0x1A5D498, 0x2536E16, 0x65EF11, 0x28FFCB6, 0x3897574, 0x83D1F, 0x3D25167, 0x228EE6, 0x9B15B, 0x3A0ACEC, 0xBFC6B7, 0x17BC7F3, 0x713B18, -0x360C298, 0x11A3C7F, 0x1614552, 0x1740946, 0x2B7BA6, 0x30816C2, 0x1AB643F, 0x2E58A8, 0x24A12F4, 0x7BF924, 0x34C2D10, 0x2315B91, 0x540300, 0x2DDFE01, 0x171473, 0x3DEA766, 0x2DE741E, 0x151F15F, 0x31EE6C2, 0x66AC67, 0x2D2F652, 0x32856ED, 0x217B34B, 0x349A1F5, 0x13C63D, 0x315C19C, 0x16B88DD, 0x3EF2C51, 0x19D97EF, 0xE543C, -0x156FB5E, 0x3203EBC, 0x22B8668, 0xECF665, 0x1E2E9E, 0x394E113, 0x2EA8990, 0x19CF89E, 0x873C59, 0x322ADD, 0x3619F8E, 0x1B899CC, 0x7F0F9E, 0x22DE485, 0x723BFC, 0x17E092F, 0x3BCA0DF, 0x101179A, 0x164A42D, 0x1AA61C, 0x2499CD3, 0x16B95DA, 0x20DA995, 0x1A96873, 0x2C3D6E, 0x1A8676B, 0x133E866, 0x285FB86, 0x7C4200, 0x4961A2, -0x39D997B, 0x38612B7, 0x2725828, 0x2A5F9F3, 0xC84BD, 0x14CFCC5, 0x1D19DC2, 0x105BE69, 0x1DCE518, 0x4EC8CD, 0x15FB5C6, 0x1A06F32, 0x3269A48, 0x3E3DE97, 0x6ADE87, 0x139C03D, 0x1D8A3F7, 0x18429B9, 0x387C67C, 0x3BDE0E, 0x17C0CA1, 0x2323EC4, 0x29BF4EF, 0x3E341F7, 0x769BF8, 0x38A57E5, 0x17D0D6D, 0xF9CC8F, 0x2A186EF, 0x79987A, -0xB02022, 0x3155C81, 0x29D17F6, 0x644471, 0x119BD8, 0x33EB70B, 0x187BC02, 0x3B39F0C, 0x1446FED, 0x4317F0, 0x4D608E, 0xAE513A, 0x29CB36A, 0x18B68CD, 0x1C1A38, 0x5F1775, 0x2FD1C42, 0x19AD37D, 
0x7841D0, 0x383575, 0x307A3FE, 0x2C3212E, 0x13A004A, 0x3EF1F7C, 0x63758B, 0x154F23E, 0x3F4832D, 0x15B715F, 0x110AC54, 0x1FF11C, -0x3937186, 0xDDFC6F, 0x38F8171, 0x20AE0D4, 0x615EFE, 0x352A761, 0x39F3EB8, 0x1502321, 0x9F5C90, 0x7AF024, 0x2CC559F, 0x11B91FC, 0x1D75865, 0x2805D96, 0x65A1D8, 0xBFDAC9, 0x362A96E, 0x623C95, 0x32DA264, 0x1E887, 0x26BCC3D, 0x23BA311, 0x108FF1F, 0x13CAE8E, 0x40CE5E, 0xC16B35, 0x2068D20, 0x8CDD2B, 0xFAAFDC, 0x51625D, -0x105739, 0x1DC2884, 0x18A044D, 0x12808A5, 0x7F1DE7, 0x1E8F732, 0x13246F4, 0x2460FBE, 0x32C4D61, 0x204FBA, 0x22E3CF8, 0x1E425F5, 0x311497D, 0x16464D2, 0x541AD5, 0x141909, 0x2D1E467, 0x1ECFFDF, 0x394913F, 0x354926, 0x2E994B0, 0x6C2838, 0x3D1629, 0x1A74E9B, 0x2B9A9A, 0x2371E7, 0x26F950C, 0x1CD818, 0x177C6A2, 0x3645C6, -0x1E3877B, 0x1183D11, 0x126EDF0, 0x3714B43, 0x7EA384, 0x146B6B0, 0x397DDA3, 0x14E60C2, 0x36A9F17, 0x1F6E62, 0x32343EE, 0x9ACCE, 0x372CF8B, 0x321D9F4, 0x2B7183, 0x2731517, 0x1746DAD, 0xD60BD4, 0xA1F11C, 0x4DDB3D, 0x263ECE2, 0x76E900, 0x45F103, 0x2835964, 0x4E737F, 0x109B455, 0x326341F, 0x37F58CB, 0xA28AF2, 0x34A351 }; - - -#endif \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_32bit/README.md b/ffi-deps/FourQlib/FourQ_32bit/README.md deleted file mode 100644 index 1198060..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/README.md +++ /dev/null @@ -1,74 +0,0 @@ -# FourQlib v3.0 (C Edition): portable 32-bit implementation - -## Contents - -The `FourQ_32bit` folder contains: - -* [`FourQ_32bit/Visual Studio/`](Visual%20Studio/): folder with Visual Studio 2015 solution and project files for compilation and testing in Windows. -* [`FourQ_32bit/makefile`](makefile): Makefile for compilation using GNU GCC or clang compilers on Linux. -* Main .c and .h files: library and header files. Public API for ECC scalar multiplication, key exchange and signatures is in [`FourQ_32bit/FourQ_api.h`](FourQ_api.h). -* [`FourQ_32bit/generic/`](generic/): folder with library files for 32-bit implementation. -* [`FourQ_32bit/tests/`](tests/): test files. -* [`FourQ_32bit/README.md`](README.md): this readme file. - -## Supported platforms - -This implementation is supported on 32-bit platforms such as x86 and ARM-based processors running Windows or Linux. We have tested the library with Microsoft Visual Studio 2015, GNU GCC v4.9 and clang v3.8. - -See instructions below to choose an implementation option and compile on one of the supported platforms. - -## Complementary crypto functions - -Random values are generated with `/dev/urandom` in the case of Linux, and with the function `BCryptGenRandom()` in the case of Windows. - -The library includes an implementation of SHA-512 which is used by default by SchnorrQ signatures. - -Users can experiment with different options by replacing functions in the `random` and `sha512` folders and -applying the corresponding changes to the settings in [`FourQ.h`](FourQ.h). - -## Instructions for Windows - -### Building the library with Visual Studio - -Open the solution file ([`FourQ.sln`](Visual%20Studio/FourQ/FourQ.sln)) in Visual Studio 2015, select the "Generic" configurations from the Solution Configurations menu (Win32 should appear as Solution Platform). - -By default, `USE_ENDO=true` is defined. To modify this configuration, go to the property window of the FourQ project, go to `Configuration Properties > C/C++ > Preprocessor`. Make any suitable changes, e.g., `USE_ENDO=true` or `false`. Repeat these steps for the `fp_tests`, `ecc_tests` and `crypto_tests` projects. 
- -Finally, select "Build Solution" from the "Build" menu. - -### Running the tests - -After building the solution, run `fp_tests.exe`, `ecc_tests.exe` and `crypto_tests.exe`. - -### Using the library - -After building the solution, add the `FourQ.lib` file to the set of References for a project, and add [`FourQ.h`](FourQ.h) and [`FourQ_api.h`](FourQ_api.h) to the list of header files of a project. - -## Instructions for Linux - -### Building the library and executing the tests with GNU GCC or clang - -To compile on Linux using the GNU GCC compiler or the clang compiler, execute the following command from the command prompt: - -```sh -$ make ARCH=[x86/ARM] CC=[gcc/clang] USE_ENDO=[TRUE/FALSE] EXTENDED_SET=[TRUE/FALSE] CACHE_MEM=[TRUE/FALSE] -``` - -After compilation, run `fp_tests`, `ecc_tests` or `crypto_tests`. - -By default GNU GCC is used, as well as endomorphisms and extended settings. Similarly, `CACHE_MEM=TRUE` is set by default indicating that the targeted platform contains a cache memory. - -For example, to compile using clang with the efficient endomorphisms on an x86 machine, execute: - -```sh -$ make ARCH=x86 CC=clang -``` - -As another example, to compile using GNU GCC with the efficient endomorphisms on an ARM machine, execute: - -```sh -$ make ARCH=ARM -``` - -By default `EXTENDED_SET` is enabled, which sets the following compilation flags: `-fwrapv -fomit-frame-pointer -march=native`. To disable this, use `EXTENDED_SET=FALSE`. -Users are encouraged to experiment with the different flag options. diff --git a/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/FourQ/FourQ.sln b/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/FourQ/FourQ.sln deleted file mode 100644 index 2751090..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/FourQ/FourQ.sln +++ /dev/null @@ -1,40 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 14 -VisualStudioVersion = 14.0.25420.1 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "FourQ", "FourQ.vcxproj", "{719F1A49-62B2-41E2-B500-40FAD83AB12A}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "fp_tests", "..\fp_tests\fp_tests.vcxproj", "{D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "crypto_tests", "..\crypto_tests\crypto_tests.vcxproj", "{47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ecc_tests", "..\ecc_tests\ecc_tests.vcxproj", "{A6DB2ADB-C570-47D5-BAAA-06904D60C091}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Win32 = Debug|Win32 - Generic|Win32 = Generic|Win32 - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {719F1A49-62B2-41E2-B500-40FAD83AB12A}.Debug|Win32.ActiveCfg = Debug|Win32 - {719F1A49-62B2-41E2-B500-40FAD83AB12A}.Debug|Win32.Build.0 = Debug|Win32 - {719F1A49-62B2-41E2-B500-40FAD83AB12A}.Generic|Win32.ActiveCfg = Generic|Win32 - {719F1A49-62B2-41E2-B500-40FAD83AB12A}.Generic|Win32.Build.0 = Generic|Win32 - {D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC}.Debug|Win32.ActiveCfg = Debug|Win32 - {D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC}.Debug|Win32.Build.0 = Debug|Win32 - {D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC}.Generic|Win32.ActiveCfg = Generic|Win32 - {D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC}.Generic|Win32.Build.0 = Generic|Win32 - {47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04}.Debug|Win32.ActiveCfg = Debug|Win32 - {47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04}.Debug|Win32.Build.0 = 
Debug|Win32 - {47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04}.Generic|Win32.ActiveCfg = Generic|Win32 - {47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04}.Generic|Win32.Build.0 = Generic|Win32 - {A6DB2ADB-C570-47D5-BAAA-06904D60C091}.Debug|Win32.ActiveCfg = Debug|Win32 - {A6DB2ADB-C570-47D5-BAAA-06904D60C091}.Debug|Win32.Build.0 = Debug|Win32 - {A6DB2ADB-C570-47D5-BAAA-06904D60C091}.Generic|Win32.ActiveCfg = Generic|Win32 - {A6DB2ADB-C570-47D5-BAAA-06904D60C091}.Generic|Win32.Build.0 = Generic|Win32 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection -EndGlobal diff --git a/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/FourQ/FourQ.vcxproj b/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/FourQ/FourQ.vcxproj deleted file mode 100644 index cc9ef49..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/FourQ/FourQ.vcxproj +++ /dev/null @@ -1,240 +0,0 @@ - - - - - Debug - Win32 - - - Debug - x64 - - - Generic - Win32 - - - Generic - x64 - - - Release - Win32 - - - Release - x64 - - - - {719F1A49-62B2-41E2-B500-40FAD83AB12A} - Win32Proj - Core - FourQ - - - - StaticLibrary - true - Unicode - v140 - - - StaticLibrary - false - true - Unicode - v140 - - - StaticLibrary - false - true - Unicode - v140 - - - StaticLibrary - v140 - - - StaticLibrary - v140 - - - StaticLibrary - v140 - - - - - - - - - - - - - - - - - - - - - Level3 - Disabled - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - Default - ProgramDatabase - - - Windows - true - - - bcrypt.lib - true - MachineX86 - - - - - Level3 - - - MaxSpeed - true - true - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - - - Windows - true - true - true - - - bcrypt.lib - - - - - Level3 - - - MaxSpeed - true - true - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - - - Windows - true - true - true - - - bcrypt.lib - - - - - - - MaxSpeed - AdvancedVectorExtensions - AVX - None - None - None - Neither - Default - false - false - NoListing - Level3 - true - true - __WINDOWS__; _AMD64_; _AVX_; USE_ENDO=true; - true - - - - true - bcrypt.lib - - - - - - - MaxSpeed - AdvancedVectorExtensions - AVX - None - None - None - Neither - Default - false - false - NoListing - Level3 - true - true - __WINDOWS__; _AMD64_; _GENERIC_; USE_ENDO=true; - true - - - - - true - bcrypt.lib - - - - - - - AdvancedVectorExtensions - Disabled - Level3 - true - false - __WINDOWS__; _AMD64_; _AVX_; USE_ENDO=true; - true - Default - MultiThreadedDebugDLL - - - - true - bcrypt.lib - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/FourQ/FourQ.vcxproj.filters b/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/FourQ/FourQ.vcxproj.filters deleted file mode 100644 index b4f9f51..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/FourQ/FourQ.vcxproj.filters +++ /dev/null @@ -1,66 +0,0 @@ - - - - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx - - - {93995380-89BD-4b04-88EB-625FBE52EBFB} - h;hpp;hxx;hm;inl;inc;xsd - - - {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} - rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms - - - {6739ac49-cc8b-46e9-8303-bb86f346d251} - - - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files\generic - - - Header Files - - - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Header Files - - - Source Files - - - Source Files - - - Source Files - - - \ No newline at end of file diff --git 
a/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/crypto_tests/crypto_tests.vcxproj b/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/crypto_tests/crypto_tests.vcxproj deleted file mode 100644 index 829a8a7..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/crypto_tests/crypto_tests.vcxproj +++ /dev/null @@ -1,243 +0,0 @@ - - - - - Debug - Win32 - - - Debug - x64 - - - Generic - Win32 - - - Generic - x64 - - - Release - Win32 - - - Release - x64 - - - - {47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04} - Win32Proj - MyLibrary - crypto_tests - - - - Application - true - Unicode - v140 - - - Application - false - true - Unicode - v140 - - - Application - false - true - Unicode - v140 - - - Application - v140 - - - Application - v140 - - - Application - v140 - - - - - - - - - - - - - - - - - - - - - false - - - false - - - - - - Level3 - Disabled - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - ProgramDatabase - Default - - - Console - true - - - - - - - Level3 - - - MaxSpeed - true - true - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - - - Console - true - true - true - - - - - true - - - MaxSpeed - AdvancedVectorExtensions - AVX - None - None - None - Neither - Default - false - false - Level3 - - __WINDOWS__; _AMD64_; USE_ENDO=true; - - - UseLinkTimeCodeGeneration - true - - - - - - - - - - - true - - - MaxSpeed - AdvancedVectorExtensions - AVX - None - None - None - Neither - Default - false - false - Level3 - - - __WINDOWS__; _AMD64_; _GENERIC_; USE_ENDO=true; - - - UseLinkTimeCodeGeneration - true - - - - - - - - - - - true - - - - - MSVCRTD - - - true - AdvancedVectorExtensions - Disabled - __WINDOWS__; _AMD64_; USE_ENDO=true; - Level3 - - - - - {719f1a49-62b2-41e2-b500-40fad83ab12a} - false - true - false - true - false - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/crypto_tests/crypto_tests.vcxproj.filters b/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/crypto_tests/crypto_tests.vcxproj.filters deleted file mode 100644 index 6305347..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/crypto_tests/crypto_tests.vcxproj.filters +++ /dev/null @@ -1,33 +0,0 @@ - - - - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx - - - {93995380-89BD-4b04-88EB-625FBE52EBFB} - h;hpp;hxx;hm;inl;inc;xsd - - - {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} - rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms - - - - - Header Files - - - Header Files - - - - - Source Files - - - Source Files - - - \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/ecc_tests/ecc_tests.vcxproj b/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/ecc_tests/ecc_tests.vcxproj deleted file mode 100644 index b93c18f..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/ecc_tests/ecc_tests.vcxproj +++ /dev/null @@ -1,243 +0,0 @@ - - - - - Debug - Win32 - - - Debug - x64 - - - Generic - Win32 - - - Generic - x64 - - - Release - Win32 - - - Release - x64 - - - - {A6DB2ADB-C570-47D5-BAAA-06904D60C091} - Win32Proj - MyLibrary - ecc_tests - - - - Application - true - Unicode - v140 - - - Application - false - true - Unicode - v140 - - - Application - false - true - Unicode - v140 - - - Application - v140 - - - Application - v140 - - - Application - v140 - - - - - - - - - - - - - - - - - - - - - false - - - false - - - - - - Level3 - Disabled - 
__WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - ProgramDatabase - Default - - - Console - true - - - - - - - Level3 - - - MaxSpeed - true - true - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - - - Console - true - true - true - - - - - true - - - MaxSpeed - AdvancedVectorExtensions - AVX - None - None - None - Neither - Default - false - false - Level3 - - __WINDOWS__; _AMD64_; USE_ENDO=true; - - - UseLinkTimeCodeGeneration - true - - - - - - - - - - - true - - - MaxSpeed - AdvancedVectorExtensions - AVX - None - None - None - Neither - Default - false - false - Level3 - - - __WINDOWS__; _AMD64_; _GENERIC_; USE_ENDO=true; - - - UseLinkTimeCodeGeneration - true - - - - - - - - - - - true - - - - - MSVCRTD - - - true - AdvancedVectorExtensions - Disabled - __WINDOWS__; _AMD64_; USE_ENDO=true; - Level3 - - - - - {719f1a49-62b2-41e2-b500-40fad83ab12a} - false - true - false - true - false - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/ecc_tests/ecc_tests.vcxproj.filters b/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/ecc_tests/ecc_tests.vcxproj.filters deleted file mode 100644 index 7358a58..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/ecc_tests/ecc_tests.vcxproj.filters +++ /dev/null @@ -1,33 +0,0 @@ - - - - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx - - - {93995380-89BD-4b04-88EB-625FBE52EBFB} - h;hpp;hxx;hm;inl;inc;xsd - - - {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} - rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms - - - - - Header Files - - - Header Files - - - - - Source Files - - - Source Files - - - \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/fp_tests/fp_tests.vcxproj b/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/fp_tests/fp_tests.vcxproj deleted file mode 100644 index 1238639..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/fp_tests/fp_tests.vcxproj +++ /dev/null @@ -1,226 +0,0 @@ - - - - - Debug - Win32 - - - Debug - x64 - - - Generic - Win32 - - - Generic - x64 - - - Release - Win32 - - - Release - x64 - - - - {D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC} - Win32Proj - MyLibrary - fp_tests - 8.1 - - - - Application - true - Unicode - v140 - - - Application - false - true - Unicode - v140 - - - Application - false - true - Unicode - v140 - - - Application - v140 - - - Application - v140 - - - Application - v140 - - - - - - - - - - - - - - - - - - - - - false - - - false - - - - - - Level3 - Disabled - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - Default - MultiThreadedDebugDLL - ProgramDatabase - - - Console - true - - - - - - - Level3 - - - MaxSpeed - true - true - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - - - Console - true - true - true - - - - - true - - - MaxSpeed - AdvancedVectorExtensions - AVX - None - None - None - Neither - Default - false - false - __WINDOWS__; _AMD64_; USE_ENDO=true; - Level3 - - - - UseLinkTimeCodeGeneration - - - - - true - - - MaxSpeed - AdvancedVectorExtensions - AVX - None - None - None - Neither - Default - false - false - __WINDOWS__; _AMD64_; _GENERIC_; USE_ENDO=true; - Level3 - - - - - UseLinkTimeCodeGeneration - - - - - true - - - - - MSVCRTD - - - true - AdvancedVectorExtensions - 
Disabled - __WINDOWS__; _AMD64_; USE_ENDO=true; - Level3 - - - - - - - - - {719f1a49-62b2-41e2-b500-40fad83ab12a} - - - - - - - - - - - \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/fp_tests/fp_tests.vcxproj.filters b/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/fp_tests/fp_tests.vcxproj.filters deleted file mode 100644 index ec0d183..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/Visual Studio/fp_tests/fp_tests.vcxproj.filters +++ /dev/null @@ -1,33 +0,0 @@ - - - - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx - - - {93995380-89BD-4b04-88EB-625FBE52EBFB} - h;hpp;hxx;hm;inl;inc;xsd - - - {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} - rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms - - - - - Header Files - - - Header Files - - - - - Source Files - - - Source Files - - - \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_32bit/crypto_util.c b/ffi-deps/FourQlib/FourQ_32bit/crypto_util.c deleted file mode 100644 index 0a1c518..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/crypto_util.c +++ /dev/null @@ -1,174 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: crypto utility functions -************************************************************************************/ - -#include "FourQ_internal.h" -#include "FourQ_params.h" -#include - -static digit_t mask4000 = (digit_t)1 << (sizeof(digit_t)*8 - 2); -static digit_t mask7fff = (digit_t)(-1) >> 1; - - -bool is_zero_ct(digit_t* a, unsigned int nwords) -{ // Check if multiprecision element is zero - digit_t x; - unsigned int i; - - x = a[0]; - for (i = 1; i < nwords; i++) { - x |= a[i]; - } - - return (bool)(1 ^ ((x | (0-x)) >> (RADIX-1))); -} - - -void encode(point_t P, unsigned char* Pencoded) -{ // Encode point P - // SECURITY NOTE: this function does not run in constant time. - digit_t temp1 = (P->x[1][NWORDS_FIELD-1] & mask4000) << 1; - digit_t temp2 = (P->x[0][NWORDS_FIELD-1] & mask4000) << 1; - - memmove(Pencoded, P->y, 32); - if (is_zero_ct((digit_t*)P->x, NWORDS_FIELD) == true) { - ((digit_t*)Pencoded)[2*NWORDS_FIELD-1] |= temp1; - } else { - ((digit_t*)Pencoded)[2*NWORDS_FIELD-1] |= temp2; - } -} - - -ECCRYPTO_STATUS decode(const unsigned char* Pencoded, point_t P) -{ // Decode point P - // SECURITY NOTE: this function does not run in constant time. 
- velm_t r, t, t0, t1, t2, t3, t4; - v2elm_t u, v, one = {0}; - digit_t sign_dec; - vpoint_extproj_t R; - vpoint_t VP; - unsigned int i, sign; - - one[0] = 1; - memmove((unsigned char*)P->y, Pencoded, 32); // Decoding y-coordinate and sign - sign = (unsigned int)(Pencoded[31] >> 7); - P->y[1][NWORDS_FIELD-1] &= mask7fff; - from_std_to_ext(P->y, VP->y); - - v2sqr1271(VP->y, u); - v2mul1271(u, (digit_t*)&PARAMETER_d, v); - v2sub1271(u, one, u); - v2add1271(v, one, v); - - vsqr1271(&v[0], t0); // t0 = v0^2 - vsqr1271(&v[VWORDS_FIELD], t1); // t1 = v1^2 - vadd1271(t0, t1, t0); // t0 = t0+t1 - vmul1271(&u[0], &v[0], t1); // t1 = u0*v0 - vmul1271(&u[VWORDS_FIELD], &v[VWORDS_FIELD], t2); // t2 = u1*v1 - vadd1271(t1, t2, t1); // t1 = t1+t2 - vmul1271(&u[VWORDS_FIELD], &v[0], t2); // t2 = u1*v0 - vmul1271(&u[0], &v[VWORDS_FIELD], t3); // t3 = u0*v1 - vsub1271(t2, t3, t2); // t2 = t2-t3 - vsqr1271(t1, t3); // t3 = t1^2 - vsqr1271(t2, t4); // t4 = t2^2 - vadd1271(t3, t4, t3); // t3 = t3+t4 - for (i = 0; i < 125; i++) { // t3 = t3^(2^125) - vsqr1271(t3, t3); - } - - vadd1271(t1, t3, t); // t = t1+t3 - vmod1271(t, t); - if (is_zero_ct(t, VWORDS_FIELD) == true) { - vsub1271(t1, t3, t); // t = t1-t3 - } - vadd1271(t, t, t); // t = 2*t - vsqr1271(t0, t3); // t3 = t0^2 - vmul1271(t0, t3, t3); // t3 = t3*t0 - vmul1271(t, t3, t3); // t3 = t3*t - vexp1251(t3, r); // r = t3^(2^125-1) - vmul1271(t0, r, t3); // t3 = t0*r - vmul1271(t, t3, &VP->x[0]); // x0 = t*t3 - vsqr1271(&VP->x[0], t1); - vmul1271(t0, t1, t1); // t1 = t0*x0^2 - vdiv1271(&VP->x[0]); // x0 = x0/2 - vmul1271(t2, t3, &VP->x[VWORDS_FIELD]); // x1 = t3*t2 - - vsub1271(t, t1, t); - vmod1271(t, t); - if (is_zero_ct(t, VWORDS_FIELD) == false) { // If t != t1 then swap x0 and x1 - memmove((unsigned char*)t0, (unsigned char*)&VP->x[0], 20); - memmove((unsigned char*)&VP->x[0], (unsigned char*)&VP->x[VWORDS_FIELD], 20); - memmove((unsigned char*)&VP->x[VWORDS_FIELD], (unsigned char*)t0, 20); - } - - v2mod1271(VP->x, VP->x); - if (is_zero_ct(VP->x, VWORDS_FIELD) == true) { - sign_dec = VP->x[2*VWORDS_FIELD-1] >> 22; - } else { - sign_dec = VP->x[VWORDS_FIELD-1] >> 22; - } - - if (sign != (unsigned int)sign_dec) { // If sign of x-coordinate decoded != input sign bit, then negate x-coordinate - v2neg1271(VP->x); - } - - v2mod1271(VP->x, R->x); - v2mod1271(VP->y, R->y); - if (ecc_point_validate(R) == false) { - vneg1271(&R->x[VWORDS_FIELD]); - if (ecc_point_validate(R) == false) { // Final point validation - return ECCRYPTO_ERROR; - } - } - - v2mod1271(R->x, R->x); - from_ext_to_std(R->x, P->x); - from_ext_to_std(R->y, P->y); - - return ECCRYPTO_SUCCESS; -} - - -void to_Montgomery(const digit_t* ma, digit_t* c) -{ // Converting to Montgomery representation - - Montgomery_multiply_mod_order(ma, (digit_t*)&Montgomery_Rprime, c); -} - - -void from_Montgomery(const digit_t* a, digit_t* mc) -{ // Converting from Montgomery to standard representation - digit_t one[NWORDS_ORDER] = {0}; - one[0] = 1; - - Montgomery_multiply_mod_order(a, one, mc); -} - - -const char* FourQ_get_error_message(ECCRYPTO_STATUS Status) -{ // Output error/success message for a given ECCRYPTO_STATUS - struct error_mapping { - unsigned int index; - char* string; - } mapping[ECCRYPTO_STATUS_TYPE_SIZE] = { - {ECCRYPTO_ERROR, ECCRYPTO_MSG_ERROR}, - {ECCRYPTO_SUCCESS, ECCRYPTO_MSG_SUCCESS}, - {ECCRYPTO_ERROR_DURING_TEST, ECCRYPTO_MSG_ERROR_DURING_TEST}, - {ECCRYPTO_ERROR_UNKNOWN, ECCRYPTO_MSG_ERROR_UNKNOWN}, - {ECCRYPTO_ERROR_NOT_IMPLEMENTED, ECCRYPTO_MSG_ERROR_NOT_IMPLEMENTED}, - 
{ECCRYPTO_ERROR_NO_MEMORY, ECCRYPTO_MSG_ERROR_NO_MEMORY}, - {ECCRYPTO_ERROR_INVALID_PARAMETER, ECCRYPTO_MSG_ERROR_INVALID_PARAMETER}, - {ECCRYPTO_ERROR_SHARED_KEY, ECCRYPTO_MSG_ERROR_SHARED_KEY}, - {ECCRYPTO_ERROR_SIGNATURE_VERIFICATION, ECCRYPTO_MSG_ERROR_SIGNATURE_VERIFICATION}, - }; - - if (Status >= ECCRYPTO_STATUS_TYPE_SIZE || mapping[Status].string == NULL) { - return "Unrecognized ECCRYPTO_STATUS"; - } else { - return mapping[Status].string; - } -}; \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_32bit/eccp2.c b/ffi-deps/FourQlib/FourQ_32bit/eccp2.c deleted file mode 100644 index 9832f80..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/eccp2.c +++ /dev/null @@ -1,1146 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: ECC operations over GF(p^2) exploiting endomorphisms -* -* This code is based on the paper "FourQ: four-dimensional decompositions on a -* Q-curve over the Mersenne prime" by Craig Costello and Patrick Longa, in Advances -* in Cryptology - ASIACRYPT, 2015. -* Preprint available at http://eprint.iacr.org/2015/565. -************************************************************************************/ - -#include "FourQ_internal.h" -#include "FourQ_params.h" -#include "FourQ_tables.h" -#include "generic/fp.h" - - -/***********************************************/ -/************* GF(p^2) FUNCTIONS ***************/ - -void fp2copy1271(f2elm_t a, f2elm_t c) -{ // Copy of a GF(p^2) element, c = a - fpcopy1271(a[0], c[0]); - fpcopy1271(a[1], c[1]); -} - - -void v2copy1271(v2elm_t a, v2elm_t c) -{ // Copy vectorized GF(p^2) element, c <- a - - c[0] = a[0]; c[1] = a[1]; c[2] = a[2]; c[3] = a[3]; c[4] = a[4]; - c[5] = a[5]; c[6] = a[6]; c[7] = a[7]; c[8] = a[8]; c[9] = a[9]; -} - - -void v2zero1271(v2elm_t a) -{ // Zeroing vectorized GF(p^2) element, a = 0 - - a[0] = 0; a[1] = 0; a[2] = 0; a[3] = 0; a[4] = 0; - a[5] = 0; a[6] = 0; a[7] = 0; a[8] = 0; a[9] = 0; -} - - -__inline void v2add1271(v2elm_t a, v2elm_t b, v2elm_t c) -{ // Vectorized GF(p^2) addition, c = a+b in GF((2^127-1)^2) - vadd1271(&a[0], &b[0], &c[0]); - vadd1271(&a[VWORDS_FIELD], &b[VWORDS_FIELD], &c[VWORDS_FIELD]); -} - - -__inline void v2sub1271(v2elm_t a, v2elm_t b, v2elm_t c) -{ // Vectorized GF(p^2) subtraction, c = a-b in GF((2^127-1)^2) - vsub1271(&a[0], &b[0], &c[0]); - vsub1271(&a[VWORDS_FIELD], &b[VWORDS_FIELD], &c[VWORDS_FIELD]); -} - - -void v2dblsub1271(v2elm_t a, v2elm_t b, v2elm_t c) -{ // Vectorized GF(p^2) addition followed by subtraction, c = 2a-b in GF((2^127-1)^2) - - c[0] = (a[0] << 1) - b[0]; - c[1] = (a[1] << 1) - b[1]; - c[2] = (a[2] << 1) - b[2]; - c[3] = (a[3] << 1) - b[3]; - c[4] = (a[4] << 1) - b[4]; - c[5] = (a[5] << 1) - b[5]; - c[6] = (a[6] << 1) - b[6]; - c[7] = (a[7] << 1) - b[7]; - c[8] = (a[8] << 1) - b[8]; - c[9] = (a[9] << 1) - b[9]; -} - - -void v2neg1271(v2elm_t a) -{ // Vectorized GF(p^2) negation - // Representation: 23/26/26/26/26/23/26/26/26/26-bit - - a[0] = mask_26 - a[0]; - a[1] = mask_26 - a[1]; - a[2] = mask_26 - a[2]; - a[3] = mask_26 - a[3]; - a[4] = mask_23 - a[4]; - a[5] = mask_26 - a[5]; - a[6] = mask_26 - a[6]; - a[7] = mask_26 - a[7]; - a[8] = mask_26 - a[8]; - a[9] = mask_23 - a[9]; -} - - -void v2div1271(uint32_t* a) -{ // GF(p^2) division by two, c = a/2 mod p - vdiv1271(&a[0]); - vdiv1271(&a[VWORDS_FIELD]); -} - - -void 
v2mod1271_incomplete(uint32_t* a, uint32_t* c) -{ // Reduction of GF(p^2) element - vmod1271_incomplete(&a[0], &c[0]); - vmod1271_incomplete(&a[VWORDS_FIELD], &c[VWORDS_FIELD]); -} - - -void v2mod1271(uint32_t* a, uint32_t* c) -{ // Reduction of GF(p^2) element - vmod1271(&a[0], &c[0]); - vmod1271(&a[VWORDS_FIELD], &c[VWORDS_FIELD]); -} - - -void v2mul1271(v2elm_t a, v2elm_t b, v2elm_t c) -{ // GF(p^2) multiplication, c = a*b in GF((2^127-1)^2) - velm_t t1, t2, t3, t4; - - vmul1271(&a[0], &b[0], t1); // t1 = a0*b0 - vmul1271(&a[VWORDS_FIELD], &b[VWORDS_FIELD], t2); // t2 = a1*b1 - vadd1271(&a[0], &a[VWORDS_FIELD], t3); // t3 = a0+a1 - vadd1271(&b[0], &b[VWORDS_FIELD], t4); // t4 = b0+b1 - vsub1271(t1, t2, &c[0]); // c[0] = a0*b0 - a1*b1 - vmul1271(t3, t4, t3); // t3 = (a0+a1)*(b0+b1) - vsub1271(t3, t1, t3); // t3 = (a0+a1)*(b0+b1) - a0*b0 - vsub1271(t3, t2, &c[VWORDS_FIELD]); // c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 -} - - -void v2sqr1271(v2elm_t a, v2elm_t c) -{ // GF(p^2) squaring, c = a^2 in GF((2^127-1)^2) - velm_t t1, t2, t3; - - vadd1271(&a[0], &a[VWORDS_FIELD], t1); // t1 = a0+a1 - vsub1271(&a[0], &a[VWORDS_FIELD], t2); // t2 = a0-a1 - vmul1271(&a[0], &a[VWORDS_FIELD], t3); // t3 = a0*a1 - vmul1271(t1, t2, &c[0]); // c0 = (a0+a1)(a0-a1) - vadd1271(t3, t3, &c[VWORDS_FIELD]); // c1 = 2a0*a1 -} - - -void v2inv1271(v2elm_t a) -{ // Vectorized GF(p^2) inversion, a = (a0-i*a1)/(a0^2+a1^2) - velm_t t0, t1; - - vsqr1271(&a[0], t0); // t0 = a0^2 - vsqr1271(&a[VWORDS_FIELD], t1); // t1 = a1^2 - vadd1271(t0, t1, t0); // t0 = a0^2+a1^2 - vinv1271(t0); // t0 = (a0^2+a1^2)^-1 - vneg1271(&a[VWORDS_FIELD]); // a = a0-i*a1 - vmul1271(&a[0], t0, &a[0]); - vmul1271(&a[VWORDS_FIELD], t0, &a[VWORDS_FIELD]); // a = (a0-i*a1)*(a0^2+a1^2)^-1 -} - - -__inline void clear_words(void* mem, unsigned int nwords) -{ // Clear integer-size digits from memory. "nwords" indicates the number of integer digits to be zeroed. - // This function uses the volatile type qualifier to inform the compiler not to optimize out the memory clearing. - // It has been tested with MSVS 2013 and GNU GCC 4.6.3, 4.7.3, 4.8.2 and 4.8.4. Users are responsible for verifying correctness with different compilers. 
- // See "Compliant Solution (C99)" at https://www.securecoding.cert.org/confluence/display/c/MSC06-C.+Beware+of+compiler+optimizations - unsigned int i; - volatile unsigned int *v = mem; - - for (i = 0; i < nwords; i++) - v[i] = 0; -} - - -#if (USE_ENDO == true) - -// Fixed GF(p^2) constants for the endomorphisms -static v2elm_t ctau1 = {0x3CE74C3, 0x3355F3A, 0x120C74D, 0xB0EBEB, 0x1964DE, 0x12, 0x0, 0xC000, 0x0, 0x0}; -static v2elm_t ctaudual1 = {0x2CDF034, 0x2A9B677, 0x6529EC, 0x3AC8C16, 0x4AA740, 0x11, 0x0, 0x3FF4000, 0x3FFFFFF, 0x7FFFFF}; -static v2elm_t cphi0 = {0x3FFFFF7, 0x3FFFFFF, 0x5FFF, 0x0, 0x0, 0x366F81A, 0x154DB3B, 0x3294F6, 0x1D6460B, 0x2553A0}; -static v2elm_t cphi1 = {0x7, 0x0, 0x5000, 0x0, 0x0, 0x28296F9, 0x3643A78, 0x22CF334, 0x2831431, 0x62C8CA}; -static v2elm_t cphi2 = {0x15, 0x0, 0xF000, 0x0, 0x0, 0x31DF391, 0x32DC553, 0x1C982C2, 0xADB26D, 0x78DF26}; -static v2elm_t cphi3 = {0x3, 0x0, 0x2000, 0x0, 0x0, 0x3962EA4, 0x10115E9, 0x342A924, 0x12475D8, 0x5084C6}; -static v2elm_t cphi4 = {0x3, 0x0, 0x3000, 0x0, 0x0, 0x2EC6855, 0x263248E, 0x2EA4A10, 0x15E9E58, 0x124404}; -static v2elm_t cphi5 = {0xF, 0x0, 0xA000, 0x0, 0x0, 0x1052DF3, 0x2C874F1, 0x59E669, 0x1062863, 0x459195}; -static v2elm_t cphi6 = {0x18, 0x0, 0x12000, 0x0, 0x0, 0x20A5BE7, 0x190E9E2, 0xB3CCD3, 0x20C50C6, 0xB232A}; -static v2elm_t cphi7 = {0x23, 0x0, 0x18000, 0x0, 0x0, 0x348781A, 0x60C0D7, 0x2A1A66C, 0x72678B, 0x3963BC}; -static v2elm_t cphi8 = {0xF0, 0x0, 0xAA000, 0x0, 0x0, 0x35D0EF0, 0x94560A, 0xBE544E, 0x2180C5B, 0x1F529F}; -static v2elm_t cphi9 = {0xBEF, 0x0, 0x870000, 0x0, 0x0, 0x36E2505, 0x34F9225, 0x375B014, 0x273F800, 0xFD52E}; -static v2elm_t cpsi1 = {0x3E346EF, 0x1FD1D9, 0xA02EDF, 0x26A0F55, 0x2AF99E, 0x13A, 0x0, 0xDE000, 0x0, 0x0}; -static v2elm_t cpsi2 = {0x143, 0x0, 0xE4000, 0x0, 0x0, 0x203F372, 0x37ADDC3, 0x1F034C7, 0x1EE66A0, 0x21B8D0}; -static v2elm_t cpsi3 = {0x9, 0x0, 0x6000, 0x0, 0x0, 0x1E73A61, 0x39AAF9D, 0x29063A6, 0x5875F5, 0x4CB26F}; -static v2elm_t cpsi4 = {0x3FFFFF6, 0x3FFFFFF, 0x3FF9FFF, 0x3FFFFFF, 0x7FFFFF, 0x218C59E, 0x655062, 0x16F9C59, 0x3A78A0A, 0x334D90}; - -// Fixed integer constants for the decomposition -// Close "offset" vector -static uint64_t c1 = {0x72482C5251A4559C}; -static uint64_t c2 = {0x59F95B0ADD276F6C}; -static uint64_t c3 = {0x7DD2D17C4625FA78}; -static uint64_t c4 = {0x6BC57DEF56CE8877}; -// Optimal basis vectors -static uint64_t b11 = {0x0906FF27E0A0A196}; -static uint64_t b12 = {0x1363E862C22A2DA0}; -static uint64_t b13 = {0x07426031ECC8030F}; -static uint64_t b14 = {0x084F739986B9E651}; -static uint64_t b21 = {0x1D495BEA84FCC2D4}; -static uint64_t b24 = {0x25DBC5BC8DD167D0}; -static uint64_t b31 = {0x17ABAD1D231F0302}; -static uint64_t b32 = {0x02C4211AE388DA51}; -static uint64_t b33 = {0x2E4D21C98927C49F}; -static uint64_t b34 = {0x0A9E6F44C02ECD97}; -static uint64_t b41 = {0x136E340A9108C83F}; -static uint64_t b42 = {0x3122DF2DC3E0FF32}; -static uint64_t b43 = {0x068A49F02AA8A9B5}; -static uint64_t b44 = {0x18D5087896DE0AEA}; -// Precomputed integers for fast-Babai rounding -static uint64_t ell1[4] = {0x259686E09D1A7D4F, 0xF75682ACE6A6BD66, 0xFC5BB5C5EA2BE5DF, 0x07}; -static uint64_t ell2[4] = {0xD1BA1D84DD627AFB, 0x2BD235580F468D8D, 0x8FD4B04CAA6C0F8A, 0x03}; -static uint64_t ell3[4] = {0x9B291A33678C203C, 0xC42BD6C965DCA902, 0xD038BF8D0BFFBAF6, 0x00}; -static uint64_t ell4[4] = {0x12E5666B77E7FDC0, 0x81CBDC3714983D82, 0x1B073877A22D8410, 0x03}; - - -/***********************************************/ -/********** CURVE/SCALAR FUNCTIONS ***********/ - 
-static __inline void ecc_tau(vpoint_extproj_t P) -{ // Apply tau mapping to a point, P = tau(P) - // Input: P = (X1:Y1:Z1) on E in twisted Edwards coordinates - // Output: P = (Xfinal:Yfinal:Zfinal) on Ehat in twisted Edwards coordinates - v2elm_t t0, t1; - - v2sqr1271(P->x, t0); // t0 = X1^2 - v2sqr1271(P->y, t1); // t1 = Y1^2 - v2mul1271(P->x, P->y, P->x); // X = X1*Y1 - v2sqr1271(P->z, P->y); // Y = Z1^2 - v2add1271(t1, t0, P->z); // Z = X1^2+Y1^2 - v2sub1271(t1, t0, t0); // t0 = Y1^2-X1^2 - v2mul1271(P->x, t0, P->x); // X = X1*Y1*(Y1^2-X1^2) - v2dblsub1271(P->y, t0, P->y); // Y = 2*Z1^2-(Y1^2-X1^2) - v2mul1271(P->x, ctau1, P->x); // Xfinal = X*ctau1 - v2mul1271(P->y, P->z, P->y); // Yfinal = Y*Z - v2mul1271(P->z, t0, P->z); // Zfinal = t0*Z -} - - -static __inline void ecc_tau_dual(vpoint_extproj_t P) -{ // Apply tau_dual mapping to a point, P = tau_dual(P) - // Input: P = (X1:Y1:Z1) on Ehat in twisted Edwards coordinates - // Output: P = (Xfinal,Yfinal,Zfinal,Tafinal,Tbfinal) on E, where Tfinal = Tafinal*Tbfinal, - // corresponding to (Xfinal:Yfinal:Zfinal:Tfinal) in extended twisted Edwards coordinates - v2elm_t t0, t1, t2; - - v2sqr1271(P->x, t0); // t0 = X1^2 - v2sqr1271(P->z, t2); // t2 = Z1^2 - v2sqr1271(P->y, t1); // t1 = Y1^2 - v2sub1271(t1, t0, P->ta); // Tafinal = Y1^2-X1^2 - v2add1271(t1, t0, t0); // t0 = X1^2+Y1^2 - v2mul1271(P->x, P->y, P->x); // X = X1*Y1 - v2dblsub1271(t2, P->ta, P->z); // Z = 2*Z1^2-(Y1^2-X1^2) - v2mul1271(P->x, ctaudual1, P->tb); // Tbfinal = ctaudual1*X1*X1 - v2mul1271(P->z, P->ta, P->y); // Yfinal = Z*Tafinal - v2mul1271(P->tb, t0, P->x); // Xfinal = Tbfinal*t0 - v2mul1271(P->z, t0, P->z); // Zfinal = Z*t0 -} - - -static __inline void ecc_delphidel(vpoint_extproj_t P) -{ // Apply delta_phi_delta mapping to a point, P = delta(phi_W(delta_inv(P))), - // where phi_W is the endomorphism on the Weierstrass form. 
- // Input: P = (X1:Y1:Z1) on Ehat in twisted Edwards coordinates - // Output: P = (Xfinal:Yfinal:Zfinal) on Ehat in twisted Edwards coordinates - v2elm_t t0, t1, t2, t3, t4, t5, t6; - - v2sqr1271(P->z, t4); // t4 = Z1^2 - v2mul1271(P->y, P->z, t3); // t3 = Y1*Z1 - v2mul1271(t4, cphi4, t0); // t0 = cphi4*t4 - v2sqr1271(P->y, t2); // t2 = Y1^2 - v2add1271(t0, t2, t0); // t0 = t0+t2 - v2mul1271(t3, cphi3, t1); // t1 = cphi3*t3 - v2sub1271(t0, t1, t5); // t5 = t0-t1 - v2add1271(t0, t1, t0); // t0 = t0+t1 - v2mul1271(t0, P->z, t0); // t0 = t0*Z1 - v2mul1271(t3, cphi1, t1); // t1 = cphi1*t3 - v2mul1271(t0, t5, t0); // t0 = t0*t5 - v2mul1271(t4, cphi2, t5); // t5 = cphi2*t4 - v2add1271(t2, t5, t5); // t5 = t2+t5 - v2sub1271(t1, t5, t6); // t6 = t1-t5 - v2add1271(t1, t5, t1); // t1 = t1+t5 - v2mul1271(t6, t1, t6); // t6 = t1*t6 - v2mul1271(t6, cphi0, t6); // t6 = cphi0*t6 - v2mul1271(P->x, t6, P->x); // X = X1*t6 - v2sqr1271(t2, t6); // t6 = t2^2 - v2sqr1271(t3, t2); // t2 = t3^2 - v2sqr1271(t4, t3); // t3 = t4^2 - v2mul1271(t2, cphi8, t1); // t1 = cphi8*t2 - v2mul1271(t3, cphi9, t5); // t5 = cphi9*t3 - v2add1271(t1, t6, t1); // t1 = t1+t6 - v2mul1271(t2, cphi6, t2); // t2 = cphi6*t2 - v2mul1271(t3, cphi7, t3); // t3 = cphi7*t3 - v2add1271(t1, t5, t1); // t1 = t1+t5 - v2add1271(t2, t3, t2); // t2 = t2+t3 - v2mul1271(t1, P->y, t1); // t1 = Y1*t1 - v2add1271(t6, t2, P->y); // Y = t6+t2 - v2mul1271(P->x, t1, P->x); // X = X*t1 - v2mul1271(P->y, cphi5, P->y); // Y = cphi5*Y - vneg1271(&P->x[VWORDS_FIELD]); // Xfinal = X^p - v2mul1271(P->y, P->z, P->y); // Y = Y*Z1 - v2mul1271(t0, t1, P->z); // Z = t0*t1 - v2mul1271(P->y, t0, P->y); // Y = Y*t0 - vneg1271(&P->z[VWORDS_FIELD]); // Zfinal = Z^p - vneg1271(&P->y[VWORDS_FIELD]); // Yfinal = Y^p -} - - -static __inline void ecc_delpsidel(vpoint_extproj_t P) -{ // Apply delta_psi_delta mapping to a point, P = delta(psi_W(delta_inv(P))), - // where psi_W is the endomorphism on the Weierstrass form. 
- // Input: P = (X1:Y1:Z1) on Ehat in twisted Edwards coordinates - // Output: P = (Xfinal:Yfinal:Zfinal) on Ehat in twisted Edwards coordinates - v2elm_t t0, t1, t2; - - vneg1271(&P->x[VWORDS_FIELD]); // X = X1^p - vneg1271(&P->z[VWORDS_FIELD]); // Z = Z1^p - vneg1271(&P->y[VWORDS_FIELD]); // Y = Y1^p - v2sqr1271(P->z, t2); // t2 = Z1^p^2 - v2sqr1271(P->x, t0); // t0 = X1^p^2 - v2mul1271(P->x, t2, P->x); // X = X1^p*Z1^p^2 - v2mul1271(t2, cpsi2, P->z); // Z = cpsi2*Z1^p^2 - v2mul1271(t2, cpsi3, t1); // t1 = cpsi3*Z1^p^2 - v2mul1271(t2, cpsi4, t2); // t2 = cpsi4*Z1^p^2 - v2add1271(t0, P->z, P->z); // Z = X1^p^2 + cpsi2*Z1^p^2 - v2add1271(t0, t2, t2); // t2 = X1^p^2 + cpsi4*Z1^p^2 - v2add1271(t0, t1, t1); // t1 = X1^p^2 + cpsi3*Z1^p^2 - v2neg1271(t2); // t2 = -(X1^p^2 + cpsi4*Z1^p^2) - v2mul1271(P->z, P->y, P->z); // Z = Y1^p*(X1^p^2 + cpsi2*Z1^p^2) - v2mul1271(P->x, t2, P->x); // X = -X1^p*Z1^p^2*(X1^p^2 + cpsi4*Z1^p^2) - v2mul1271(t1, P->z, P->y); // Yfinal = t1*Z - v2mul1271(P->x, cpsi1, P->x); // Xfinal = cpsi1*X - v2mul1271(P->z, t2, P->z); // Zfinal = Z*t2 -} - - -void ecc_psi(vpoint_extproj_t P) -{ // Apply psi mapping to a point, P = psi(P) - // Input: P = (X1:Y1:Z1) on E in twisted Edwards coordinates - // Output: P = (Xfinal,Yfinal,Zfinal,Tafinal,Tbfinal) on E, where Tfinal = Tafinal*Tbfinal, - // corresponding to (Xfinal:Yfinal:Zfinal:Tfinal) in extended twisted Edwards coordinates - - ecc_tau(P); - ecc_delpsidel(P); - ecc_tau_dual(P); -} - - -void ecc_phi(vpoint_extproj_t P) -{ // Apply phi mapping to a point, P = phi(P) - // Input: P = (X1:Y1:Z1) on E in twisted Edwards coordinates - // Output: P = (Xfinal,Yfinal,Zfinal,Tafinal,Tbfinal) on E, where Tfinal = Tafinal*Tbfinal, - // corresponding to (Xfinal:Yfinal:Zfinal:Tfinal) in extended twisted Edwards coordinates - - ecc_tau(P); - ecc_delphidel(P); - ecc_tau_dual(P); -} - - -void ecc_precomp(vpoint_extproj_t P, vpoint_extproj_precomp_t *T) -{ // Generation of the precomputation table used by the variable-base scalar multiplication ecc_mul(). 
- // Input: P = (X1,Y1,Z1,Ta,Tb), where T1 = Ta*Tb, corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - // Output: table T containing 8 points: P, P+phi(P), P+psi(P), P+phi(P)+psi(P), P+psi(phi(P)), P+phi(P)+psi(phi(P)), P+psi(P)+psi(phi(P)), P+phi(P)+psi(P)+psi(phi(P)) - // Precomputed points use the representation (X+Y,Y-X,2Z,2dT) corresponding to (X:Y:Z:T) in extended twisted Edwards coordinates - vpoint_extproj_precomp_t Q, R, S; - vpoint_extproj_t PP; - - // Generating Q = phi(P) = (XQ+YQ,YQ-XQ,ZQ,TQ) - ecccopy(P, PP); - ecc_phi(PP); - R1_to_R3(PP, Q); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,Z,T) - - // Generating S = psi(Q) = (XS+YS,YS-XS,ZS,TS) - ecc_psi(PP); - R1_to_R3(PP, S); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,Z,T) - - // Generating T[0] = P = (XP+YP,YP-XP,2ZP,2dTP) - R1_to_R2(P, T[0]); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT) - - // Generating R = psi(P) = (XR+YR,YR-XR,ZR,TR) - ecc_psi(P); - R1_to_R3(P, R); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,Z,T) - - eccadd_core(T[0], Q, PP); // T[1] = P+Q using the representations (X,Y,Z,Ta,Tb) <- (X+Y,Y-X,2Z,2dT) + (X+Y,Y-X,Z,T) - R1_to_R2(PP, T[1]); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT) - eccadd_core(T[0], R, PP); // T[2] = P+R - R1_to_R2(PP, T[2]); - eccadd_core(T[1], R, PP); // T[3] = P+Q+R - R1_to_R2(PP, T[3]); - eccadd_core(T[0], S, PP); // T[4] = P+S - R1_to_R2(PP, T[4]); - eccadd_core(T[1], S, PP); // T[5] = P+Q+S - R1_to_R2(PP, T[5]); - eccadd_core(T[2], S, PP); // T[6] = P+R+S - R1_to_R2(PP, T[6]); - eccadd_core(T[3], S, PP); // T[7] = P+Q+R+S - R1_to_R2(PP, T[7]); -} - - -static __inline void mul_truncate(uint64_t* s, uint64_t* C, uint64_t* out) -{ // 256-bit multiplication with truncation for the scalar decomposition - // Outputs 64-bit value "out" = (uint64_t)((s * C) >> 256). - uint128_t tt1, tt2; - unsigned int carry1, carry2; - uint64_t temp; - - MUL128(s[0], C[0], tt2); - tt2[0] = tt2[1]; - tt2[1] = 0; - MUL128(s[1], C[0], tt1); - ADD128(tt1, tt2, tt1); - MUL128(s[0], C[1], tt2); - ADC128(tt1, tt2, carry1, tt1); - tt1[0] = tt1[1]; - tt1[1] = (uint64_t)(carry1); - MUL128(s[2], C[0], tt2); - ADD128(tt1, tt2, tt1); - MUL128(s[0], C[2], tt2); - ADC128(tt1, tt2, carry1, tt1); - MUL128(s[1], C[1], tt2); - ADC128(tt1, tt2, carry2, tt1); - tt1[0] = tt1[1]; - tt1[1] = (uint64_t)carry1 + (uint64_t)carry2; - MUL128(s[0], C[3], tt2); - ADD128(tt1, tt2, tt1); - MUL128(s[3], C[0], tt2); - ADC128(tt1, tt2, carry1, tt1); - MUL128(s[1], C[2], tt2); - ADC128(tt1, tt2, carry2, tt1); - temp = (uint64_t)carry1 + (uint64_t)carry2; - MUL128(s[2], C[1], tt2); - ADC128(tt1, tt2, carry2, tt1); - tt1[0] = tt1[1]; - tt1[1] = temp + (uint64_t)carry2; - MUL128(s[1], C[3], tt2); - ADD128(tt1, tt2, tt1); - MUL128(s[3], C[1], tt2); - ADD128(tt1, tt2, tt1); - MUL128(s[2], C[2], tt2); - ADD128(tt1, tt2, tt1); - *out = tt1[0]; -} - - -void decompose(uint64_t* k, uint64_t* scalars) -{ // Scalar decomposition for the variable-base scalar multiplication - // Input: scalar in the range [0, 2^256-1]. - // Output: 4 64-bit sub-scalars. 
- uint64_t a1, a2, a3, a4, temp, mask; - -#if (TARGET == TARGET_x86) && (COMPILER == COMPILER_VC) - uint128_t t1, t2, t3, t4; - - mul_truncate(k, ell1, &a1); - mul_truncate(k, ell2, &a2); - mul_truncate(k, ell3, &a3); - mul_truncate(k, ell4, &a4); - - MUL128(a1, b11, t1); MUL128(a2, b21, t2); MUL128(a3, b31, t3); MUL128(a4, b41, t4); - temp = k[0] - t1[0] - t2[0] - t3[0] - t4[0] + c1; - mask = ~(0 - (temp & 1)); // If temp is even then mask = 0xFF...FF, else mask = 0 - - scalars[0] = temp + (mask & b41); - MUL128(a1, b12, t1); MUL128(a3, b32, t2); MUL128(a4, b42, t3); - scalars[1] = t1[0] + (uint64_t)a2 - t2[0] - t3[0] + c2 + (mask & b42); - MUL128(a3, b33, t1); MUL128(a1, b13, t2); MUL128(a4, b43, t3); - scalars[2] = t1[0] - t2[0] - (uint64_t)a2 + t3[0] + c3 - (mask & b43); - MUL128(a1, b14, t1); MUL128(a2, b24, t2); MUL128(a3, b34, t3); MUL128(a4, b44, t4); - scalars[3] = t1[0] - t2[0] - t3[0] + t4[0] + c4 - (mask & b44); -#else - mul_truncate(k, ell1, &a1); - mul_truncate(k, ell2, &a2); - mul_truncate(k, ell3, &a3); - mul_truncate(k, ell4, &a4); - - temp = k[0] - (uint64_t)a1*b11 - (uint64_t)a2*b21 - (uint64_t)a3*b31 - (uint64_t)a4*b41 + c1; - mask = ~(0 - (temp & 1)); // If temp is even then mask = 0xFF...FF, else mask = 0 - - scalars[0] = temp + (mask & b41); - scalars[1] = (uint64_t)a1*b12 + (uint64_t)a2 - (uint64_t)a3*b32 - (uint64_t)a4*b42 + c2 + (mask & b42); - scalars[2] = (uint64_t)a3*b33 - (uint64_t)a1*b13 - (uint64_t)a2 + (uint64_t)a4*b43 + c3 - (mask & b43); - scalars[3] = (uint64_t)a1*b14 - (uint64_t)a2*b24 - (uint64_t)a3*b34 + (uint64_t)a4*b44 + c4 - (mask & b44); -#endif -} - - -void recode(uint64_t* scalars, unsigned int* digits, unsigned int* sign_masks) -{ // Recoding sub-scalars for use in the variable-base scalar multiplication. See Algorithm 1 in "Efficient and Secure Methods for GLV-Based Scalar - // Multiplication and their Implementation on GLV-GLS Curves (Extended Version)", A. Faz-Hernandez, P. Longa, and A.H. Sanchez, in Journal - // of Cryptographic Engineering, Vol. 5(1), 2015. - // Input: 4 64-bit sub-scalars passed through "scalars", which are obtained after calling decompose(). - // Outputs: "digits" array with 65 nonzero entries. Each entry is in the range [0, 7], corresponding to one entry in the precomputed table. - // "sign_masks" array with 65 entries storing the signs for their corresponding digits in "digits". - // Notation: if the corresponding digit > 0 then sign_mask = 0xFF...FF, else if digit < 0 then sign_mask = 0. - unsigned int i, bit, bit0, carry; - sign_masks[64] = (unsigned int)-1; - - for (i = 0; i < 64; i++) - { - scalars[0] >>= 1; - bit0 = (unsigned int)scalars[0] & 1; - sign_masks[i] = 0 - bit0; - - bit = (unsigned int)scalars[1] & 1; - carry = (bit0 | bit) ^ bit0; - scalars[1] = (scalars[1] >> 1) + (uint64_t)carry; - digits[i] = bit; - - bit = (unsigned int)scalars[2] & 1; - carry = (bit0 | bit) ^ bit0; - scalars[2] = (scalars[2] >> 1) + (uint64_t)carry; - digits[i] += (bit << 1); - - bit = (unsigned int)scalars[3] & 1; - carry = (bit0 | bit) ^ bit0; - scalars[3] = (scalars[3] >> 1) + (uint64_t)carry; - digits[i] += (bit << 2); - } - digits[64] = (unsigned int)(scalars[1] + (scalars[2] << 1) + (scalars[3] << 2)); -} - - -void cofactor_clearing(vpoint_extproj_t P) -{ // Co-factor clearing - // Input: P = (X1,Y1,Z1,Ta,Tb), where T1 = Ta*Tb, corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates. 
- // Output: P = 392*P = (Xfinal,Yfinal,Zfinal,Tafinal,Tbfinal), where Tfinal = Tafinal*Tbfinal, - // corresponding to (Xfinal:Yfinal:Zfinal:Tfinal) in extended twisted Edwards coordinates - vpoint_extproj_precomp_t Q; - - R1_to_R2(P, Q); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT) - eccdouble(P); // P = 2*P using representations (X,Y,Z,Ta,Tb) <- 2*(X,Y,Z) - eccadd(Q, P); // P = P+Q using representations (X,Y,Z,Ta,Tb) <- (X,Y,Z,Ta,Tb) + (X+Y,Y-X,2Z,2dT) - eccdouble(P); - eccdouble(P); - eccdouble(P); - eccdouble(P); - eccadd(Q, P); - eccdouble(P); - eccdouble(P); - eccdouble(P); -} - - -bool ecc_mul(point_t P, digit_t* k, point_t Q, bool clear_cofactor) -{ // Variable-base scalar multiplication Q = k*P using a 4-dimensional decomposition - // Inputs: scalar "k" in [0, 2^256-1], - // point P = (x,y) in affine coordinates, - // clear_cofactor = 1 (TRUE) or 0 (FALSE) whether cofactor clearing is required or not, respectively. - // Output: Q = k*P in affine coordinates (x,y). - // This function performs point validation and (if selected) cofactor clearing. - vpoint_t A; - vpoint_extproj_t R; - vpoint_extproj_precomp_t S, Table[8]; - uint64_t scalars[NWORDS64_ORDER]; - unsigned int digits[65], sign_masks[65]; - int i; - - point_setup(P, R); // Convert to vectorized representation (X,Y,1,Ta,Tb) - - if (ecc_point_validate(R) == false) { // Check if point lies on the curve - return false; - } - - decompose((uint64_t*)k, scalars); // Scalar decomposition - if (clear_cofactor == true) { - cofactor_clearing(R); - } - recode(scalars, digits, sign_masks); // Scalar recoding - ecc_precomp(R, Table); // Precomputation - table_lookup_1x8(Table, S, digits[64], sign_masks[64]); // Extract initial point in (X+Y,Y-X,2Z,2dT) representation - R2_to_R4(S, R); // Conversion to representation (2X,2Y,2Z) - - for (i = 63; i >= 0; i--) - { - table_lookup_1x8(Table, S, digits[i], sign_masks[i]); // Extract point S in (X+Y,Y-X,2Z,2dT) representation - eccdouble(R); // P = 2*P using representations (X,Y,Z,Ta,Tb) <- 2*(X,Y,Z) - eccadd(S, R); // P = P+S using representations (X,Y,Z,Ta,Tb) <- (X,Y,Z,Ta,Tb) + (X+Y,Y-X,2Z,2dT) - } - eccnorm(R, A); // Conversion to affine coordinates (x,y) and modular correction. 
- from_ext_to_std(A->x, Q->x); - from_ext_to_std(A->y, Q->y); - - return true; -} - -#endif - - -void eccset(point_t P) -{ // Set generator - // Output: P = (x,y) - - fp2copy1271((felm_t*)&GENERATOR_x, P->x); // X1 - fp2copy1271((felm_t*)&GENERATOR_y, P->y); // Y1 -} - - -__inline void eccnorm(vpoint_extproj_t P, vpoint_t Q) -{ // Normalize a projective point (X1:Y1:Z1), including full reduction - // Input: P = (X1:Y1:Z1) in twisted Edwards coordinates - // Output: Q = (X1/Z1,Y1/Z1), corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - - v2inv1271(P->z); // Z1 = Z1^-1 - v2mul1271(P->x, P->z, Q->x); // X1 = X1/Z1 - v2mul1271(P->y, P->z, Q->y); // Y1 = Y1/Z1 - v2mod1271(Q->x, Q->x); - v2mod1271(Q->y, Q->y); -} - - -void R1_to_R2(vpoint_extproj_t P, vpoint_extproj_precomp_t Q) -{ // Conversion from representation (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT), where T = Ta*Tb - // Input: P = (X1,Y1,Z1,Ta,Tb), where T1 = Ta*Tb, corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - // Output: Q = (X1+Y1,Y1-X1,2Z1,2dT1) corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - - v2add1271(P->ta, P->ta, Q->t2); // T = 2*Ta - v2add1271(P->x, P->y, Q->xy); // QX = X+Y - v2sub1271(P->y, P->x, Q->yx); // QY = Y-X - v2mul1271(Q->t2, P->tb, Q->t2); // T = 2*T - v2add1271(P->z, P->z, Q->z2); // QZ = 2*Z - v2mul1271(Q->t2, (digit_t*)&PARAMETER_d, Q->t2); // QT = 2d*T -} - - -void R1_to_R3(vpoint_extproj_t P, vpoint_extproj_precomp_t Q) -{ // Conversion from representation (X,Y,Z,Ta,Tb) to (X+Y,Y-X,Z,T), where T = Ta*Tb - // Input: P = (X1,Y1,Z1,Ta,Tb), where T1 = Ta*Tb, corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - // Output: Q = (X1+Y1,Y1-X1,Z1,T1) corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - - v2add1271(P->x, P->y, Q->xy); // XQ = (X1+Y1) - v2sub1271(P->y, P->x, Q->yx); // YQ = (Y1-X1) - v2mul1271(P->ta, P->tb, Q->t2); // TQ = T1 - v2copy1271(P->z, Q->z2); // ZQ = Z1 -} - - -void R2_to_R4(vpoint_extproj_precomp_t P, vpoint_extproj_t Q) -{ // Conversion from representation (X+Y,Y-X,2Z,2dT) to (2X,2Y,2Z,2dT) - // Input: P = (X1+Y1,Y1-X1,2Z1,2dT1) corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - // Output: Q = (2X1,2Y1,2Z1) corresponding to (X1:Y1:Z1) in twisted Edwards coordinates - - v2sub1271(P->xy, P->yx, Q->x); // XQ = 2*X1 - v2add1271(P->xy, P->yx, Q->y); // YQ = 2*Y1 - v2copy1271(P->z2, Q->z); // ZQ = 2*Z1 - v2mod1271_incomplete(Q->x, Q->x); - v2mod1271_incomplete(Q->y, Q->y); -} - - -void eccdouble(vpoint_extproj_t P) -{ // Point doubling 2P - // Input: P = (X1:Y1:Z1) in twisted Edwards coordinates - // Output: 2P = (Xfinal,Yfinal,Zfinal,Tafinal,Tbfinal), where Tfinal = Tafinal*Tbfinal, - // corresponding to (Xfinal:Yfinal:Zfinal:Tfinal) in extended twisted Edwards coordinates - v2elm_t t1, t2; - - v2sqr1271(P->x, t1); // t1 = X1^2 - v2sqr1271(P->y, t2); // t2 = Y1^2 - v2add1271(P->x, P->y, P->x); // X = X1+Y1 - v2add1271(t1, t2, P->tb); // Tbfinal = X1^2+Y1^2 - v2sub1271(t2, t1, t1); // t1 = Y1^2-X1^2 - v2sqr1271(P->z, t2); // t2 = Z1^2 - v2sqr1271(P->x, P->ta); // Ta = (X1+Y1)^2 - v2dblsub1271(t2, t1, t2); // t2 = 2Z1^2-(Y1^2-X1^2) - v2sub1271(P->ta, P->tb, P->ta); // Tafinal = 2X1*Y1 = (X1+Y1)^2-(X1^2+Y1^2) - v2mul1271(t1, P->tb, P->y); // Yfinal = (X1^2+Y1^2)(Y1^2-X1^2) - v2mul1271(t2, P->ta, P->x); // Xfinal = 2X1*Y1*[2Z1^2-(Y1^2-X1^2)] - v2mul1271(t1, t2, P->z); // Zfinal = (Y1^2-X1^2)[2Z1^2-(Y1^2-X1^2)] -} - - -__inline void eccadd_core(vpoint_extproj_precomp_t P, 
vpoint_extproj_precomp_t Q, vpoint_extproj_t R) -{ // Basic point addition R = P+Q or R = P+P - // Inputs: P = (X1+Y1,Y1-X1,2Z1,2dT1) corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - // Q = (X2+Y2,Y2-X2,Z2,T2) corresponding to (X2:Y2:Z2:T2) in extended twisted Edwards coordinates - // Output: R = (Xfinal,Yfinal,Zfinal,Tafinal,Tbfinal), where Tfinal = Tafinal*Tbfinal, - // corresponding to (Xfinal:Yfinal:Zfinal:Tfinal) in extended twisted Edwards coordinates - v2elm_t t1, t2; - - v2mul1271(P->t2, Q->t2, R->z); // Z = 2dT1*T2 - v2mul1271(P->z2, Q->z2, t1); // t1 = 2Z1*Z2 - v2mul1271(P->xy, Q->xy, R->x); // X = (X1+Y1)(X2+Y2) - v2mul1271(P->yx, Q->yx, R->y); // Y = (Y1-X1)(Y2-X2) - v2sub1271(t1, R->z, t2); // t2 = theta - v2add1271(t1, R->z, t1); // t1 = alpha - v2sub1271(R->x, R->y, R->tb); // Tbfinal = beta - v2add1271(R->x, R->y, R->ta); // Tafinal = omega - v2mul1271(R->tb, t2, R->x); // Xfinal = beta*theta - v2mul1271(t1, t2, R->z); // Zfinal = theta*alpha - v2mul1271(R->ta, t1, R->y); // Yfinal = alpha*omega -} - - -void eccadd(vpoint_extproj_precomp_t Q, vpoint_extproj_t P) -{ // Complete point addition P = P+Q or P = P+P - // Inputs: P = (X1,Y1,Z1,Ta,Tb), where T1 = Ta*Tb, corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - // Q = (X2+Y2,Y2-X2,2Z2,2dT2) corresponding to (X2:Y2:Z2:T2) in extended twisted Edwards coordinates - // Output: P = (Xfinal,Yfinal,Zfinal,Tafinal,Tbfinal), where Tfinal = Tafinal*Tbfinal, - // corresponding to (Xfinal:Yfinal:Zfinal:Tfinal) in extended twisted Edwards coordinates - vpoint_precomp_t R; - v2elm_t t1; - - v2add1271(P->y, P->x, R->xy); // XR = (X1+Y1) - v2sub1271(P->y, P->x, R->yx); // YR = (Y1-X1) - v2mul1271(P->ta, P->tb, R->t2); // TR = T1 - v2mul1271(Q->z2, P->z, t1); // t1 = 2Z1*Z2 - v2mul1271(Q->t2, R->t2, P->z); // Z = 2dT1*T2 - v2mul1271(Q->xy, R->xy, P->x); // X = (X1+Y1)(X2+Y2) - v2mul1271(Q->yx, R->yx, P->y); // Y = (Y1-X1)(Y2-X2) - v2sub1271(t1, P->z, R->t2); // TR = theta - v2add1271(t1, P->z, t1); // t1 = alpha - v2sub1271(P->x, P->y, P->tb); // Tbfinal = beta - v2add1271(P->x, P->y, P->ta); // Tafinal = omega - v2mul1271(P->tb, R->t2, P->x); // Xfinal = beta*theta - v2mul1271(t1, R->t2, P->z); // Zfinal = theta*alpha - v2mul1271(P->ta, t1, P->y); // Yfinal = alpha*omega -} - - -void point_setup(point_t P, vpoint_extproj_t Q) -{ // Point conversion to vectorized representation (X,Y,Z,Ta,Tb) - // Input: P = (x,y) in affine coordinates - // Output: P = (X,Y,1,Ta,Tb), where Ta=X, Tb=Y and T=Ta*Tb, corresponding to (X:Y:Z:T) in extended twisted Edwards coordinates - - from_std_to_ext(P->x, Q->x); - from_std_to_ext(P->y, Q->y); - v2copy1271(Q->x, Q->ta); // Ta = X1 - v2copy1271(Q->y, Q->tb); // Tb = Y1 - v2zero1271(Q->z); Q->z[0]=1; // Z1 = 1 -} - - -bool ecc_point_validate(vpoint_extproj_t P) -{ // Point validation: check if point lies on the curve - // Input: P = (x,y) in affine coordinates, where x, y in [0, 2^127-1]. - // Output: TRUE (1) if point lies on the curve E: -x^2+y^2-1-dx^2*y^2 = 0, FALSE (0) otherwise. - // SECURITY NOTE: this function does not run in constant time (input point P is assumed to be public). 
- v2elm_t t1, t2, t3; - unsigned int i; - - v2sqr1271(P->y, t1); - v2sqr1271(P->x, t2); - v2sub1271(t1, t2, t3); // -x^2 + y^2 - v2mul1271(t1, t2, t1); // x^2*y^2 - v2mul1271((digit_t*)&PARAMETER_d, t1, t2); // dx^2*y^2 - v2zero1271(t1); t1[0] = 1; // t1 = 1 - v2add1271(t2, t1, t2); // 1 + dx^2*y^2 - v2sub1271(t3, t2, t1); // -x^2 + y^2 - 1 - dx^2*y^2 - v2mod1271(t1, t1); - - for (i = 0; i < 2*VWORDS_FIELD-1; i++) { - if (t1[i] != 0) return false; - } - return true; -} - - -static __inline void R5_to_R1(vpoint_precomp_t P, vpoint_extproj_t Q) -{ // Conversion from representation (x+y,y-x,2dt) to (X,Y,Z,Ta,Tb) - // Input: P = (x1+y1,y1-x1,2dt1) corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates, where Z1=1 - // Output: Q = (x1,y1,z1,x1,y1), where z1=1, corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - - v2sub1271(P->xy, P->yx, Q->x); // 2*x1 - v2add1271(P->xy, P->yx, Q->y); // 2*y1 - v2zero1271(Q->z); Q->z[0]=1; // ZQ = 1 - v2div1271(Q->x); // XQ = x1 - v2div1271(Q->y); // YQ = y1 - v2copy1271(Q->x, Q->ta); // TaQ = x1 - v2copy1271(Q->y, Q->tb); // TbQ = y1 -} - - -static __inline void eccmadd(vpoint_precomp_t Q, vpoint_extproj_t P) -{ // Mixed point addition P = P+Q or P = P+P - // Inputs: P = (X1,Y1,Z1,Ta,Tb), where T1 = Ta*Tb, corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - // Q = (x2+y2,y2-x2,2dt2) corresponding to (X2:Y2:Z2:T2) in extended twisted Edwards coordinates, where Z2=1 - // Output: P = (Xfinal,Yfinal,Zfinal,Tafinal,Tbfinal), where Tfinal = Tafinal*Tbfinal, - // corresponding to (Xfinal:Yfinal:Zfinal:Tfinal) in extended twisted Edwards coordinates - v2elm_t t1, t2; - - v2mul1271(P->ta, P->tb, P->ta); // Ta = T1 - v2add1271(P->z, P->z, t1); // t1 = 2Z1 - v2mul1271(P->ta, Q->t2, P->ta); // Ta = 2dT1*t2 - v2add1271(P->x, P->y, P->z); // Z = (X1+Y1) - v2sub1271(P->y, P->x, P->tb); // Tb = (Y1-X1) - v2sub1271(t1, P->ta, t2); // t2 = theta - v2add1271(t1, P->ta, t1); // t1 = alpha - v2mul1271(Q->xy, P->z, P->ta); // Ta = (X1+Y1)(x2+y2) - v2mul1271(Q->yx, P->tb, P->x); // X = (Y1-X1)(y2-x2) - v2sub1271(P->ta, P->x, P->tb); // Tbfinal = beta - v2add1271(P->ta, P->x, P->ta); // Tafinal = omega - v2mul1271(t1, t2, P->z); // Zfinal = theta*alpha - v2mul1271(P->tb, t2, P->x); // Xfinal = beta*theta - v2mul1271(P->ta, t1, P->y); // Yfinal = alpha*omega -} - - -bool ecc_mul_fixed(digit_t* k, point_t Q) -{ // Fixed-base scalar multiplication Q = k*G, where G is the generator. FIXED_BASE_TABLE stores v*2^(w-1) = 80 multiples of G. - // Inputs: scalar "k" in [0, 2^256-1]. - // Output: Q = k*G in affine coordinates (x,y). - // The function is based on the modified LSB-set comb method, which converts the scalar to an odd signed representation - // with (bitlength(order)+w*v) digits. 
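The body below assembles each table-lookup index by reading one column of the recoded digit matrix from row w-1 down to row 1 (most significant bit first), while row 0 of the same column supplies the sign used by table_lookup_fixed_base(). A rough Rust sketch of that column read, with the matrix stored row-major (index = row*d + column) as in the C; the function name and single-column view are illustrative only and are not part of the Rust port in this patch:

    // Reads one comb column: rows w-1 .. 1 contribute the bits of the table
    // index (most significant first); row 0 of the same column holds the sign.
    fn comb_column_digit(digits: &[u32], d: usize, w: usize, col: usize) -> u32 {
        let mut digit = 0;
        for row in (1..w).rev() {
            digit = 2 * digit + digits[row * d + col];
        }
        digit
    }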
- unsigned int j, w = W_FIXEDBASE, v = V_FIXEDBASE, d = D_FIXEDBASE, e = E_FIXEDBASE; - unsigned int digit = 0, digits[NBITS_ORDER_PLUS_ONE+(W_FIXEDBASE*V_FIXEDBASE)-1] = {0}; - digit_t temp[NWORDS_ORDER]; - vpoint_t A; - vpoint_extproj_t R; - vpoint_precomp_t S; - int i, ii; - - modulo_order(k, temp); // temp = k mod (order) - conversion_to_odd(temp, temp); // Converting scalar to odd using the prime subgroup order - mLSB_set_recode((uint64_t*)temp, digits); // Scalar recoding - - // Extracting initial digit - digit = digits[w*d-1]; - for (i = (int)((w-1)*d-1); i >= (int)(2*d-1); i = i-d) - { - digit = 2*digit + digits[i]; - } - // Initialize R = (x+y,y-x,2dt) with a point from the table - table_lookup_fixed_base(((vpoint_precomp_t*)&FIXED_BASE_TABLE)+(v-1)*(1 << (w-1)), S, digit, digits[d-1]); - R5_to_R1(S, R); // Converting to representation (X:Y:1:Ta:Tb) - - for (j = 0; j < (v-1); j++) - { - digit = digits[w*d-(j+1)*e-1]; - for (i = (int)((w-1)*d-(j+1)*e-1); i >= (int)(2*d-(j+1)*e-1); i = i-d) - { - digit = 2*digit + digits[i]; - } - // Extract point in (x+y,y-x,2dt) representation - table_lookup_fixed_base(((vpoint_precomp_t*)&FIXED_BASE_TABLE)+(v-j-2)*(1 << (w-1)), S, digit, digits[d-(j+1)*e-1]); - eccmadd(S, R); // R = R+S using representations (X,Y,Z,Ta,Tb) <- (X,Y,Z,Ta,Tb) + (x+y,y-x,2dt) - } - - for (ii = (e-2); ii >= 0; ii--) - { - eccdouble(R); // R = 2*R using representations (X,Y,Z,Ta,Tb) <- 2*(X,Y,Z) - for (j = 0; j < v; j++) - { - digit = digits[w*d-j*e+ii-e]; - for (i = (int)((w-1)*d-j*e+ii-e); i >= (int)(2*d-j*e+ii-e); i = i-d) - { - digit = 2*digit + digits[i]; - } - // Extract point in (x+y,y-x,2dt) representation - table_lookup_fixed_base(((vpoint_precomp_t*)&FIXED_BASE_TABLE)+(v-j-1)*(1 << (w-1)), S, digit, digits[d-j*e+ii-e]); - eccmadd(S, R); // R = R+S using representations (X,Y,Z,Ta,Tb) <- (X,Y,Z,Ta,Tb) + (x+y,y-x,2dt) - } - } - eccnorm(R, A); // Conversion to affine coordinates (x,y) and modular correction. - from_ext_to_std(A->x, Q->x); - from_ext_to_std(A->y, Q->y); - - return true; -} - - -void mLSB_set_recode(uint64_t* scalar, unsigned int *digits) -{ // Computes the modified LSB-set representation of a scalar - // Inputs: scalar in [0, order-1], where the order of FourQ's subgroup is 246 bits. - // Output: digits, where the first "d" values (from index 0 to (d-1)) store the signs for the recoded values using the convention: -1 (negative), 0 (positive), and - // the remaining values (from index d to (l-1)) store the recoded values in mLSB-set representation, excluding their sign, - // where l = d*w and d = ceil(bitlength(order)/(w*v))*v. The values v and w are fixed and must be in the range [1, 10] (see FourQ.h); they determine the size - // of the precomputed table "FIXED_BASE_TABLE" used by ecc_mul_fixed(). 
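As a reference point for a Rust port, the recoding described above can be sketched on a single-word scalar; the deleted C below operates on the full multiprecision scalar and carries the +1 across words. The scalar is assumed to already be odd, as ecc_mul_fixed() guarantees via conversion_to_odd(). Function and variable names here are illustrative only:

    // Illustrative mLSB-set recoding of a small odd scalar. Returns l = d*w
    // digits: entries 0..d-1 are the sign row (0 = positive, !0 = negative),
    // entries d..l-1 are the 0/1 digit rows.
    fn mlsb_set_recode(mut k: u128, d: usize, l: usize) -> Vec<u32> {
        let mut digits = vec![0u32; l]; // digits[d-1] stays 0 (top sign is positive)
        k >>= 1;
        for i in 0..(d - 1) {
            digits[i] = ((k as u32) & 1).wrapping_sub(1); // bit 1 -> 0, bit 0 -> 0xFFFFFFFF
            k >>= 1;
        }
        for i in d..l {
            digits[i] = (k as u32) & 1; // digit_i = k mod 2; its sign lives in row 0
            k >>= 1;
            // If this column's sign is negative and the digit is 1, add the
            // borrow back into the remaining scalar: k = floor(k/2) + 1.
            if digits[i % d] != 0 && digits[i] == 1 {
                k += 1;
            }
        }
        digits
    }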
- unsigned int i, j, d = D_FIXEDBASE, l = L_FIXEDBASE; - uint64_t temp, carry; - - digits[d-1] = 0; - - // Shift scalar to the right by 1 - for (j = 0; j < (NWORDS64_ORDER-1); j++) { - SHIFTR(scalar[j+1], scalar[j], 1, scalar[j], RADIX64); - } - scalar[NWORDS64_ORDER-1] >>= 1; - - for (i = 0; i < (d-1); i++) - { - digits[i] = (unsigned int)((scalar[0] & 1) - 1); // Convention for the "sign" row: - // if scalar_(i+1) = 0 then digit_i = -1 (negative), else if scalar_(i+1) = 1 then digit_i = 0 (positive) - // Shift scalar to the right by 1 - for (j = 0; j < (NWORDS64_ORDER-1); j++) { - SHIFTR(scalar[j+1], scalar[j], 1, scalar[j], RADIX64); - } - scalar[NWORDS64_ORDER-1] >>= 1; - } - - for (i = d; i < l; i++) - { - digits[i] = (unsigned int)(scalar[0] & 1); // digits_i = k mod 2. Sign is determined by the "sign" row - - // Shift scalar to the right by 1 - for (j = 0; j < (NWORDS64_ORDER-1); j++) { - SHIFTR(scalar[j+1], scalar[j], 1, scalar[j], RADIX64); - } - scalar[NWORDS64_ORDER-1] >>= 1; - - temp = (0 - digits[i-(i/d)*d]) & digits[i]; // if (digits_i=0 \/ 1) then temp = 0, else if (digits_i=-1) then temp = 1 - - // floor(scalar/2) + temp - scalar[0] = scalar[0] + temp; - carry = (temp & (uint64_t)is_digit_zero_ct((digit_t)scalar[0])); // carry = (scalar[0] < temp); - for (j = 1; j < NWORDS64_ORDER; j++) - { - scalar[j] = scalar[j] + carry; - carry = (carry & (uint64_t)is_digit_zero_ct((digit_t)scalar[j])); // carry = (scalar[j] < temp); - } - } - return; -} - - -static __inline void eccneg_extproj_precomp(vpoint_extproj_precomp_t P, vpoint_extproj_precomp_t Q) -{ // Point negation - // Input : point P in coordinates (X+Y,Y-X,2Z,2dT) - // Output: point Q = -P = (Y-X,X+Y,2Z,-2dT) - v2copy1271(P->t2, Q->t2); - v2copy1271(P->xy, Q->yx); - v2copy1271(P->yx, Q->xy); - v2copy1271(P->z2, Q->z2); - v2neg1271(Q->t2); -} - - -static __inline void eccneg_precomp(vpoint_precomp_t P, vpoint_precomp_t Q) -{ // Point negation - // Input : point P in coordinates (x+y,y-x,2dt) - // Output: point Q = -P = (y-x,x+y,-2dt) - v2copy1271(P->t2, Q->t2); - v2copy1271(P->xy, Q->yx); - v2copy1271(P->yx, Q->xy); - v2neg1271(Q->t2); -} - - -bool ecc_mul_double(digit_t* k, point_t Q, digit_t* l, point_t R) -{ // Double scalar multiplication R = k*G + l*Q, where the G is the generator. Uses DOUBLE_SCALAR_TABLE, which contains multiples of G, Phi(G), Psi(G) and Phi(Psi(G)). - // Inputs: point Q in affine coordinates, - // Scalars "k" and "l" in [0, 2^256-1]. - // Output: R = k*G + l*Q in affine coordinates (x,y). - // The function uses wNAF with interleaving. - vpoint_t A; - - // SECURITY NOTE: this function is intended for a non-constant-time operation such as signature verification. 
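Each of the eight sub-scalars below is recoded with wNAF_recode(), defined later in this file. For reference while porting, width-w NAF recoding of a single-word scalar can be sketched in Rust as follows; this only illustrates the digit set and spacing, it is not the port itself:

    // Width-w NAF recoding of a 64-bit scalar: digits lie in
    // {0, +-1, +-3, ..., +-(2^(w-1) - 1)} and any two nonzero digits are
    // separated by at least w-1 zeros, which keeps the interleaved additions sparse.
    fn wnaf_recode(mut k: u64, w: u32) -> Vec<i32> {
        let val1 = (1i64 << (w - 1)) - 1; // 2^(w-1) - 1
        let val2 = 1i64 << w;             // 2^w
        let mask = (val2 - 1) as u64;     // 2^w - 1
        let mut digits = Vec::new();
        while k != 0 {
            if k & 1 == 0 {
                digits.push(0);
                k >>= 1;
            } else {
                let mut digit = (k & mask) as i64; // k mod 2^w, odd
                k >>= w;
                if digit > val1 {
                    digit -= val2; // take the negative representative...
                    k += 1;        // ...and compensate in the remaining scalar
                }
                digits.push(digit as i32);
                if k != 0 {
                    for _ in 0..(w - 1) {
                        digits.push(0); // the next w-1 digits are guaranteed zero
                    }
                }
            }
        }
        digits
    }

Reconstructing sum(digits[i] * 2^i) gives back the original scalar, which makes for an easy property test when porting.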
- -#if (USE_ENDO == true) - unsigned int position; - int i, digits_k1[65] = {0}, digits_k2[65] = {0}, digits_k3[65] = {0}, digits_k4[65] = {0}; - int digits_l1[65] = {0}, digits_l2[65] = {0}, digits_l3[65] = {0}, digits_l4[65] = {0}; - vpoint_precomp_t V; - vpoint_extproj_t Q1, Q2, Q3, Q4, T; - vpoint_extproj_precomp_t U, Q_table1[NPOINTS_DOUBLEMUL_WQ], Q_table2[NPOINTS_DOUBLEMUL_WQ], Q_table3[NPOINTS_DOUBLEMUL_WQ], Q_table4[NPOINTS_DOUBLEMUL_WQ]; - uint64_t k_scalars[4], l_scalars[4]; - - point_setup(Q, Q1); // Convert to representation (X,Y,1,Ta,Tb) - - if (ecc_point_validate(Q1) == false) { // Check if point lies on the curve - return false; - } - - // Computing endomorphisms over point Q - ecccopy(Q1, Q2); - ecc_phi(Q2); - ecccopy(Q1, Q3); - ecc_psi(Q3); - ecccopy(Q2, Q4); - ecc_psi(Q4); - - decompose((uint64_t*)k, k_scalars); // Scalar decomposition - decompose((uint64_t*)l, l_scalars); - wNAF_recode(k_scalars[0], WP_DOUBLEBASE, digits_k1); // Scalar recoding - wNAF_recode(k_scalars[1], WP_DOUBLEBASE, digits_k2); - wNAF_recode(k_scalars[2], WP_DOUBLEBASE, digits_k3); - wNAF_recode(k_scalars[3], WP_DOUBLEBASE, digits_k4); - wNAF_recode(l_scalars[0], WQ_DOUBLEBASE, digits_l1); - wNAF_recode(l_scalars[1], WQ_DOUBLEBASE, digits_l2); - wNAF_recode(l_scalars[2], WQ_DOUBLEBASE, digits_l3); - wNAF_recode(l_scalars[3], WQ_DOUBLEBASE, digits_l4); - ecc_precomp_double(Q1, Q_table1, NPOINTS_DOUBLEMUL_WQ); // Precomputation - ecc_precomp_double(Q2, Q_table2, NPOINTS_DOUBLEMUL_WQ); - ecc_precomp_double(Q3, Q_table3, NPOINTS_DOUBLEMUL_WQ); - ecc_precomp_double(Q4, Q_table4, NPOINTS_DOUBLEMUL_WQ); - - v2zero1271(T->x); // Initialize T as the neutral point (0:1:1) - v2zero1271(T->y); T->y[0] = 1; - v2zero1271(T->z); T->z[0] = 1; - - for (i = 64; i >= 0; i--) - { - eccdouble(T); // Double (X_T,Y_T,Z_T,Ta_T,Tb_T) = 2(X_T,Y_T,Z_T,Ta_T,Tb_T) - if (digits_l1[i] < 0) { - position = (-digits_l1[i])/2; - eccneg_extproj_precomp(Q_table1[position], U); // Load and negate U = (X_U,Y_U,Z_U,Td_U) <- -(X+Y,Y-X,2Z,2dT) from a point in the precomputed table - eccadd(U, T); // T = T+U = (X_T,Y_T,Z_T,Ta_T,Tb_T) = (X_T,Y_T,Z_T,Ta_T,Tb_T) + (X_U,Y_U,Z_U,Td_U) - } else if (digits_l1[i] > 0) { - position = (digits_l1[i])/2; // Take U = (X_U,Y_U,Z_U,Td_U) <- (X+Y,Y-X,2Z,2dT) from a point in the precomputed table - eccadd(Q_table1[position], T); // T = T+U = (X_T,Y_T,Z_T,Ta_T,Tb_T) = (X_T,Y_T,Z_T,Ta_T,Tb_T) + (X_U,Y_U,Z_U,Td_U) - } - if (digits_l2[i] < 0) { - position = (-digits_l2[i])/2; - eccneg_extproj_precomp(Q_table2[position], U); - eccadd(U, T); - } else if (digits_l2[i] > 0) { - position = (digits_l2[i])/2; - eccadd(Q_table2[position], T); - } - if (digits_l3[i] < 0) { - position = (-digits_l3[i])/2; - eccneg_extproj_precomp(Q_table3[position], U); - eccadd(U, T); - } else if (digits_l3[i] > 0) { - position = (digits_l3[i])/2; - eccadd(Q_table3[position], T); - } - if (digits_l4[i] < 0) { - position = (-digits_l4[i])/2; - eccneg_extproj_precomp(Q_table4[position], U); - eccadd(U, T); - } else if (digits_l4[i] > 0) { - position = (digits_l4[i])/2; - eccadd(Q_table4[position], T); - } - - if (digits_k1[i] < 0) { - position = (-digits_k1[i])/2; - eccneg_precomp(((vpoint_precomp_t*)&DOUBLE_SCALAR_TABLE)[position], V); // Load and negate V = (X_V,Y_V,Z_V,Td_V) <- -(x+y,y-x,2dt) from a point in the precomputed table - eccmadd(V, T); // T = T+V = (X_T,Y_T,Z_T,Ta_T,Tb_T) = (X_T,Y_T,Z_T,Ta_T,Tb_T) + (X_V,Y_V,Z_V,Td_V) - } else if (digits_k1[i] > 0) { - position = (digits_k1[i])/2; // Take V = (X_V,Y_V,Z_V,Td_V) <- 
(x+y,y-x,2dt) from a point in the precomputed table - eccmadd(((vpoint_precomp_t*)&DOUBLE_SCALAR_TABLE)[position], T); // T = T+V = (X_T,Y_T,Z_T,Ta_T,Tb_T) = (X_T,Y_T,Z_T,Ta_T,Tb_T) + (X_V,Y_V,Z_V,Td_V) - } - if (digits_k2[i] < 0) { - position = (-digits_k2[i])/2; - eccneg_precomp(((vpoint_precomp_t*)&DOUBLE_SCALAR_TABLE)[NPOINTS_DOUBLEMUL_WP+position], V); - eccmadd(V, T); - } else if (digits_k2[i] > 0) { - position = (digits_k2[i])/2; - eccmadd(((vpoint_precomp_t*)&DOUBLE_SCALAR_TABLE)[NPOINTS_DOUBLEMUL_WP+position], T); - } - if (digits_k3[i] < 0) { - position = (-digits_k3[i])/2; - eccneg_precomp(((vpoint_precomp_t*)&DOUBLE_SCALAR_TABLE)[2*NPOINTS_DOUBLEMUL_WP+position], V); - eccmadd(V, T); - } else if (digits_k3[i] > 0) { - position = (digits_k3[i])/2; - eccmadd(((vpoint_precomp_t*)&DOUBLE_SCALAR_TABLE)[2*NPOINTS_DOUBLEMUL_WP+position], T); - } - if (digits_k4[i] < 0) { - position = (-digits_k4[i])/2; - eccneg_precomp(((vpoint_precomp_t*)&DOUBLE_SCALAR_TABLE)[3*NPOINTS_DOUBLEMUL_WP+position], V); - eccmadd(V, T); - } else if (digits_k4[i] > 0) { - position = (digits_k4[i])/2; - eccmadd(((vpoint_precomp_t*)&DOUBLE_SCALAR_TABLE)[3*NPOINTS_DOUBLEMUL_WP+position], T); - } - } - -#else - point_t B; - vpoint_extproj_t T; - vpoint_extproj_precomp_t S; - - if (ecc_mul(Q, l, B, false) == false) { - return false; - } - point_setup(B, T); - R1_to_R2(T, S); - - ecc_mul_fixed(k, B); - point_setup(B, T); - eccadd(S, T); -#endif - eccnorm(T, A); // Conversion to affine coordinates (x,y) and modular correction. - from_ext_to_std(A->x, R->x); - from_ext_to_std(A->y, R->y); - - return true; -} - - -void ecc_precomp_double(vpoint_extproj_t P, vpoint_extproj_precomp_t* Table, unsigned int npoints) -{ // Generation of the precomputation table used internally by the double scalar multiplication function ecc_mul_double(). - // Inputs: point P in representation (X,Y,Z,Ta,Tb), - // Table with storage for npoints, - // number of points "npoints". - // Output: Table containing multiples of the base point P using representation (X+Y,Y-X,2Z,2dT). 
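The body below fills the table with the odd multiples P, 3P, 5P, ..., (2*npoints-1)P via Table[i] = Table[i-1] + 2P, which is what makes the position = digit/2 indexing in ecc_mul_double() above work for positive odd wNAF digits. With integers standing in for points, a throwaway Rust check of that bookkeeping (illustrative only):

    // Integers stand in for points: table[0] = p, table[i] = table[i-1] + 2p
    // yields the odd multiples, so a positive odd digit d lives at index d/2.
    fn odd_multiples(p: i64, npoints: usize) -> Vec<i64> {
        let p2 = 2 * p;
        let mut table = vec![p];
        for i in 1..npoints {
            let next = table[i - 1] + p2;
            table.push(next);
        }
        table
    }

    #[test]
    fn wnaf_digit_indexing() {
        let table = odd_multiples(1, 8);
        assert_eq!(table, vec![1, 3, 5, 7, 9, 11, 13, 15]);
        let digit = 5usize;              // a positive odd wNAF digit
        assert_eq!(table[digit / 2], 5); // position = digit/2, as in ecc_mul_double()
    }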
- vpoint_extproj_t Q; - vpoint_extproj_precomp_t PP; - unsigned int i; - - R1_to_R2(P, Table[0]); // Precomputed point Table[0] = P in coordinates (X+Y,Y-X,2Z,2dT) - eccdouble(P); // A = 2*P in (X,Y,Z,Ta,Tb) - R1_to_R3(P, PP); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,Z,T) - - for (i = 1; i < npoints; i++) { - eccadd_core(Table[i-1], PP, Q); // Table[i] = Table[i-1]+2P using the representations (X,Y,Z,Ta,Tb) <- (X+Y,Y-X,2Z,2dT) + (X+Y,Y-X,Z,T) - R1_to_R2(Q, Table[i]); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT) - } - - return; -} - - -void wNAF_recode(uint64_t scalar, unsigned int w, int* digits) -{ // Computes wNAF recoding of a scalar, where digits are in set {0,+-1,+-3,...,+-(2^(w-1)-1)} - unsigned int i; - int digit, index = 0; - int val1 = (int)(1 << (w-1)) - 1; // 2^(w-1) - 1 - int val2 = (int)(1 << w); // 2^w; - uint64_t k = scalar, mask = (uint64_t)val2 - 1; // 2^w - 1 - - while (k != 0) - { - digit = (int)(k & 1); - - if (digit == 0) { - k >>= 1; // Shift scalar to the right by 1 - digits[index] = 0; - } else { - digit = (int)(k & mask); - k >>= w; // Shift scalar to the right by w - - if (digit > val1) { - digit -= val2; - } - if (digit < 0) { // scalar + 1 - k += 1; - } - digits[index] = digit; - - if (k != 0) { // Check if scalar != 0 - for (i = 0; i < (w-1); i++) - { - index++; - digits[index] = 0; - } - } - } - index++; - } - return; -} - diff --git a/ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c b/ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c deleted file mode 100644 index 2cc7a3e..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/eccp2_no_endo.c +++ /dev/null @@ -1,157 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: ECC operations over GF(p^2) without exploiting endomorphisms -* -* This code is based on the paper "FourQ: four-dimensional decompositions on a -* Q-curve over the Mersenne prime" by Craig Costello and Patrick Longa, in Advances -* in Cryptology - ASIACRYPT, 2015. -* Preprint available at http://eprint.iacr.org/2015/565. -************************************************************************************/ - -#include "FourQ_internal.h" - - -#if (USE_ENDO == false) - -/***********************************************/ -/********** CURVE/SCALAR FUNCTIONS ***********/ - -void fixed_window_recode(uint64_t* scalar, unsigned int* digits, unsigned int* sign_masks) -{ // Converting scalar to the fixed window representation used by the variable-base scalar multiplication - // Inputs: scalar in [0, order-1], where the order of FourQ's subgroup is 246 bits. - // Outputs: "digits" array with (t_VARBASE+1) nonzero entries. Each entry is in the range [0, 7], corresponding to one entry in the precomputed table. - // where t_VARBASE+1 = ((bitlength(order)+w-1)/(w-1))+1 represents the fixed length of the recoded scalar using window width w. - // The value of w is fixed to W_VARBASE = 5, which corresponds to a precomputed table with 2^(W_VARBASE-2) = 8 entries (see FourQ.h) - // used by the variable base scalar multiplication ecc_mul(). - // "sign_masks" array with (t_VARBASE+1) entries storing the signs for their corresponding digits in "digits". - // Notation: if the corresponding digit > 0 then sign_mask = 0xFF...FF, else if digit < 0 then sign_mask = 0. 
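A compact Rust rendering of this signed fixed-window recoding, simplified to a single-word odd scalar; the deleted C below works on the full 256-bit scalar, runs for a fixed t_VARBASE iterations, and returns each digit as a magnitude plus sign mask rather than a signed integer. Names and the simplified termination condition are illustrative only:

    // Signed fixed-window recoding of an odd scalar with window width w:
    // every digit is odd, lies in [-2^(w-1), 2^(w-1) - 1], and the scalar is
    // recovered as sum(digits[i] * 2^(i*(w-1))).
    fn fixed_window_recode(mut k: u64, w: u32) -> Vec<i64> {
        assert!(k & 1 == 1, "convert the scalar to odd first (conversion_to_odd)");
        let val2: i64 = 1 << (w - 1); // 2^(w-1)
        let mask: u64 = (1 << w) - 1; // 2^w - 1
        let mut digits = Vec::new();
        while k >= val2 as u64 {
            let t = (k & mask) as i64 - val2;                // ki = (k mod 2^w) - 2^(w-1), odd
            digits.push(t);
            k = ((k as i128 - t as i128) >> (w - 1)) as u64; // k = (k - ki)/2^(w-1), stays odd
        }
        digits.push(k as i64); // final digit
        digits
    }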
- unsigned int val1, val2, i, j; - uint64_t res, borrow; - int64_t temp; - - val1 = (1 << W_VARBASE) - 1; - val2 = (1 << (W_VARBASE-1)); - - for (i = 0; i < t_VARBASE; i++) - { - temp = (scalar[0] & val1) - val2; // ki = (k mod 2^w)/2^(w-1) - sign_masks[i] = ~((unsigned int)(temp >> (RADIX64-1))); - digits[i] = ((sign_masks[i] & (unsigned int)(temp ^ -temp)) ^ (unsigned int)-temp) >> 1; - - res = scalar[0] - temp; // k = (k - ki) / 2^(w-1) - borrow = ((temp >> (RADIX64-1)) - 1) & (uint64_t)is_digit_lessthan_ct((digit_t)scalar[0], (digit_t)temp); - scalar[0] = res; - - for (j = 1; j < NWORDS64_ORDER; j++) - { - res = scalar[j]; - scalar[j] = res - borrow; - borrow = (uint64_t)is_digit_lessthan_ct((digit_t)res, (digit_t)borrow); - } - - for (j = 0; j < (NWORDS64_ORDER-1); j++) { - SHIFTR(scalar[j+1], scalar[j], (W_VARBASE-1), scalar[j], RADIX64); - } - scalar[NWORDS64_ORDER-1] = scalar[NWORDS64_ORDER-1] >> (W_VARBASE-1); - - } - sign_masks[t_VARBASE] = ~((unsigned int)(scalar[0] >> (RADIX64-1))); - digits[t_VARBASE] = ((sign_masks[t_VARBASE] & (unsigned int)(scalar[0] ^ (0-scalar[0]))) ^ (unsigned int)(0-scalar[0])) >> 1; // kt = k (t_VARBASE+1 digits) -} - - -void ecc_precomp(vpoint_extproj_t P, vpoint_extproj_precomp_t *T) -{ // Generation of the precomputation table used by the variable-base scalar multiplication ecc_mul(). - // Input: P = (X1,Y1,Z1,Ta,Tb), where T1 = Ta*Tb, corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates. - // Output: table T containing NPOINTS_VARBASE points: P, 3P, 5P, ... , (2*NPOINTS_VARBASE-1)P. NPOINTS_VARBASE is fixed to 8 (see FourQ.h). - // Precomputed points use the representation (X+Y,Y-X,2Z,2dT) corresponding to (X:Y:Z:T) in extended twisted Edwards coordinates. - vpoint_extproj_precomp_t P2; - vpoint_extproj_t Q; - unsigned int i; - - // Generating P2 = 2(X1,Y1,Z1,T1a,T1b) = (XP2+YP2,Y2P-X2P,ZP2,TP2) and T[0] = P = (X1+Y1,Y1-X1,2*Z1,2*d*T1) - ecccopy(P, Q); - R1_to_R2(P, T[0]); - eccdouble(Q); - R1_to_R3(Q, P2); - - for (i = 1; i < NPOINTS_VARBASE; i++) { - // T[i] = 2P+T[i-1] = (2*i+1)P = (XP2+YP2,Y2P-X2P,ZP2,TP2) + (X_(2*i-1)+Y_(2*i-1), Y_(2*i-1)-X_(2*i-1), 2Z_(2*i-1), 2T_(2*i-1)) = (X_(2*i+1)+Y_(2*i+1), Y_(2*i+1)-X_(2*i+1), 2Z_(2*i+1), 2dT_(2*i+1)) - eccadd_core(P2, T[i-1], Q); - R1_to_R2(Q, T[i]); - } -} - - -void cofactor_clearing(vpoint_extproj_t P) -{ // Co-factor clearing - // Input: P = (X1,Y1,Z1,Ta,Tb), where T1 = Ta*Tb, corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - // Output: P = 392*P = (Xfinal,Yfinal,Zfinal,Tafinal,Tbfinal), where Tfinal = Tafinal*Tbfinal, - // corresponding to (Xfinal:Yfinal:Zfinal:Tfinal) in extended twisted Edwards coordinates - vpoint_extproj_precomp_t Q; - - R1_to_R2(P, Q); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT) - eccdouble(P); // P = 2*P using representations (X,Y,Z,Ta,Tb) <- 2*(X,Y,Z) - eccadd(Q, P); // P = P+Q using representations (X,Y,Z,Ta,Tb) <- (X,Y,Z,Ta,Tb) + (X+Y,Y-X,2Z,2dT) - eccdouble(P); - eccdouble(P); - eccdouble(P); - eccdouble(P); - eccadd(Q, P); - eccdouble(P); - eccdouble(P); - eccdouble(P); -} - - -bool ecc_mul(point_t P, digit_t* k, point_t Q, bool clear_cofactor) -{ // Scalar multiplication Q = k*P - // Inputs: scalar "k" in [0, 2^256-1], - // point P = (x,y) in affine coordinates, - // clear_cofactor = 1 (TRUE) or 0 (FALSE) whether cofactor clearing is required or not, respectively. - // Output: Q = k*P in affine coordinates (x,y). - // This function performs point validation and (if selected) cofactor clearing. 
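As the comment above notes, ecc_mul() optionally clears the cofactor with cofactor_clearing(), whose double/add chain (double, add P, four doubles, add P, three doubles) multiplies by 392 = 2^3 * 7^2, the cofactor being cleared. Replaying the chain on plain integers is a cheap sanity check worth carrying over to a Rust port; the names below are illustrative only:

    // Replay the cofactor-clearing addition chain with an integer standing in
    // for the point: the result must be 392 * k.
    fn cofactor_chain(k: i64) -> i64 {
        let q = k;         // Q = P
        let mut p = 2 * k; // eccdouble
        p += q;            // eccadd
        for _ in 0..4 { p *= 2; }
        p += q;
        for _ in 0..3 { p *= 2; }
        p
    }

    #[test]
    fn chain_multiplies_by_392() {
        assert_eq!(cofactor_chain(1), 392);
        assert_eq!(cofactor_chain(7), 7 * 392);
    }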
- vpoint_t A; - vpoint_extproj_t R; - vpoint_extproj_precomp_t S, Table[NPOINTS_VARBASE]; - unsigned int digits[t_VARBASE+1] = {0}, sign_masks[t_VARBASE+1] = {0}; - digit_t k_odd[NWORDS_ORDER]; - int i; - - point_setup(P, R); // Convert to representation (X,Y,1,Ta,Tb) - - if (ecc_point_validate(R) == false) { // Check if point lies on the curve - return false; - } - - if (clear_cofactor == true) { - cofactor_clearing(R); - } - - modulo_order(k, k_odd); // k_odd = k mod (order) - conversion_to_odd(k_odd, k_odd); // Converting scalar to odd using the prime subgroup order - ecc_precomp(R, Table); // Precomputation of points T[0],...,T[npoints-1] - fixed_window_recode((uint64_t*)k_odd, digits, sign_masks); // Scalar recoding - table_lookup_1x8(Table, S, digits[t_VARBASE], sign_masks[t_VARBASE]); - R2_to_R4(S, R); // Conversion to representation (2X,2Y,2Z) - - for (i = (t_VARBASE-1); i >= 0; i--) - { - eccdouble(R); - table_lookup_1x8(Table, S, digits[i], sign_masks[i]); // Extract point in (X+Y,Y-X,2Z,2dT) representation - eccdouble(R); - eccdouble(R); - eccdouble(R); // P = 2*P using representations (X,Y,Z,Ta,Tb) <- 2*(X,Y,Z) - eccadd(S, R); // P = P+S using representations (X,Y,Z,Ta,Tb) <- (X,Y,Z,Ta,Tb) + (X+Y,Y-X,2Z,2dT) - } - eccnorm(R, A); // Conversion to affine coordinates (x,y) and modular correction. - from_ext_to_std(A->x, Q->x); - from_ext_to_std(A->y, Q->y); - - return true; -} - -#endif \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_32bit/generic/fp.h b/ffi-deps/FourQlib/FourQ_32bit/generic/fp.h deleted file mode 100644 index 765f7e6..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/generic/fp.h +++ /dev/null @@ -1,523 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. 
-* -* Abstract: modular arithmetic and other low-level operations for 32-bit platforms -************************************************************************************/ - -#ifndef __FP_H__ -#define __FP_H__ - - -// For C++ -#ifdef __cplusplus -extern "C" { -#endif - - -#include "../table_lookup.h" -#include "../FourQ_params.h" -#if (TARGET == TARGET_x86) && (COMPILER == COMPILER_VC) - #include "intrin.h" -#endif - -#define mask_26 (((uint32_t)1 << 26) - 1) -#define mask_23 (((uint32_t)1 << 23) - 1) - - -void digit_x_digit(digit_t a, digit_t b, digit_t* c) -{ // Digit multiplication, digit * digit -> 2-digit result - register digit_t al, ah, bl, bh, temp; - digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; - digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t)*4), mask_high = (digit_t)(-1) << (sizeof(digit_t)*4); - - al = a & mask_low; // Low part - ah = a >> (sizeof(digit_t) * 4); // High part - bl = b & mask_low; - bh = b >> (sizeof(digit_t) * 4); - - albl = al*bl; - albh = al*bh; - ahbl = ah*bl; - ahbh = ah*bh; - c[0] = albl & mask_low; // C00 - - res1 = albl >> (sizeof(digit_t) * 4); - res2 = ahbl & mask_low; - res3 = albh & mask_low; - temp = res1 + res2 + res3; - carry = temp >> (sizeof(digit_t) * 4); - c[0] ^= temp << (sizeof(digit_t) * 4); // C01 - - res1 = ahbl >> (sizeof(digit_t) * 4); - res2 = albh >> (sizeof(digit_t) * 4); - res3 = ahbh & mask_low; - temp = res1 + res2 + res3 + carry; - c[1] = temp & mask_low; // C10 - carry = temp & mask_high; - c[1] ^= (ahbh & mask_high) + carry; // C11 -} - - -static __inline void fpcopy1271(felm_t a, felm_t c) -{ // Copy of a field element, c = a - unsigned int i; - - for (i = 0; i < NWORDS_FIELD; i++) - c[i] = a[i]; -} - - -void vadd1271(velm_t a, velm_t b, velm_t c) -{ // Field addition over GF(2^127-1) - // Redundant representation: 23/26/26/26/26-bit - - c[0] = a[0] + b[0]; - c[1] = a[1] + b[1]; - c[2] = a[2] + b[2]; - c[3] = a[3] + b[3]; - c[4] = a[4] + b[4]; -} - - -void vsub1271(velm_t a, velm_t b, velm_t c) -{ // Field subtraction over GF(2^127-1) - // Redundant representation: 23/26/26/26/26-bit - - c[0] = a[0] - b[0]; - c[1] = a[1] - b[1]; - c[2] = a[2] - b[2]; - c[3] = a[3] - b[3]; - c[4] = a[4] - b[4]; -} - - -void vneg1271(velm_t a) -{ // Field negation over GF(2^127-1) - // Redundant representation: 23/26/26/26/26-bit - - a[0] = mask_26 - a[0]; - a[1] = mask_26 - a[1]; - a[2] = mask_26 - a[2]; - a[3] = mask_26 - a[3]; - a[4] = mask_23 - a[4]; -} - - -void vmul1271(velm_t a, velm_t b, velm_t c) -{ // Field multiplication, c = a*b mod p - int32_t r0, r1, a0, a1, a2, a3, a4, b0, b1, b2, b3, b4; - int64_t c0, c1, c2, c3, c4; - -#if (TARGET == TARGET_x86) && (COMPILER == COMPILER_VC) - a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; a4 = a[4]; - b0 = b[0]; b1 = b[1]; b2 = b[2]; b3 = b[3]; b4 = b[4]; - - c0 = __emul((int)a0, (int)b0) + (__emul((int)a1, (int)b4) << 3) + (__emul((int)a4, (int)b1) << 3) + (__emul((int)a2, (int)b3) << 3) + (__emul((int)a3, (int)b2) << 3); - c1 = __emul((int)a0, (int)b1) + __emul((int)a1, (int)b0) + (__emul((int)a2, (int)b4) << 3) + (__emul((int)a4, (int)b2) << 3) + (__emul((int)a3, (int)b3) << 3); - c2 = __emul((int)a0, (int)b2) + __emul((int)a2, (int)b0) + __emul((int)a1, (int)b1) + (__emul((int)a3, (int)b4) << 3) + (__emul((int)a4, (int)b3) << 3); - c3 = __emul((int)a0, (int)b3) + __emul((int)a3, (int)b0) + __emul((int)a1, (int)b2) + __emul((int)a2, (int)b1) + (__emul((int)a4, (int)b4) << 3); - c4 = __emul((int)a0, (int)b4) + __emul((int)a4, (int)b0) + __emul((int)a1, (int)b3) + __emul((int)a3, 
(int)b1) + __emul((int)a2, (int)b2); -#else - int64_t t1, t2, t3, t4; - - a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; a4 = a[4]; - b0 = b[0]; b1 = b[1]; b2 = b[2]; b3 = b[3]; b4 = b[4]; - - t1 = (int64_t)a1 << 3; - t2 = (int64_t)a2 << 3; - t3 = (int64_t)a3 << 3; - t4 = (int64_t)a4 << 3; - - c0 = (int64_t)a0*b0 + (int64_t)t1*b4 + (int64_t)t4*b1 + (int64_t)t2*b3 + (int64_t)t3*b2; - c1 = (int64_t)a0*b1 + (int64_t)a1*b0 + (int64_t)t2*b4 + (int64_t)t4*b2 + (int64_t)t3*b3; - c2 = (int64_t)a0*b2 + (int64_t)a2*b0 + (int64_t)a1*b1 + (int64_t)t3*b4 + (int64_t)t4*b3; - c3 = (int64_t)a0*b3 + (int64_t)a3*b0 + (int64_t)a1*b2 + (int64_t)a2*b1 + (int64_t)t4*b4; - c4 = (int64_t)a0*b4 + (int64_t)a4*b0 + (int64_t)a1*b3 + (int64_t)a3*b1 + (int64_t)a2*b2; -#endif - - r0 = c0 & mask_26; - c1 += c0 >> 26; r1 = c1 & mask_26; - c2 += c1 >> 26; c[2] = c2 & mask_26; - c3 += c2 >> 26; c[3] = c3 & mask_26; - c4 += c3 >> 26; c[4] = c4 & mask_23; -// c4 += c3 >> 26; c[4] = c4 & mask_26; - - c0 = r0 + (c4 >> 23); -// c0 = r0 + ((c4 >> 26) << 3); - c[0] = (int32_t)c0 & mask_26; - c[1] = r1 + (int32_t)(c0 >> 26); -} - - -void vsqr1271(velm_t a, velm_t c) -{ // Field squaring, c = a*b mod p - int32_t r0, r1, a0, a1, a2, a3, a4; - int64_t c0, c1, c2, c3, c4; - -#if (TARGET == TARGET_x86) && (COMPILER == COMPILER_VC) - a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; a4 = a[4]; - - c0 = __emul((int)a0, (int)a0) + (__emul((int)a4, (int)a1) << 4) + (__emul((int)a2, (int)a3) << 4); - c1 = (__emul((int)a0, (int)a1) << 1) + (__emul((int)a3, (int)a3) << 3) + (__emul((int)a4, (int)a2) << 4); - c2 = (__emul((int)a0, (int)a2) << 1) + __emul((int)a1, (int)a1) + (__emul((int)a4, (int)a3) << 4); - c3 = (__emul((int)a0, (int)a3) << 1) + (__emul((int)a1, (int)a2) << 1) + (__emul((int)a4, (int)a4) << 3); - c4 = (__emul((int)a0, (int)a4) << 1) + (__emul((int)a1, (int)a3) << 1) + __emul((int)a2, (int)a2); - -#else - int64_t t0, t1, t2, t3, t4; - - a0 = a[0]; a1 = a[1]; a2 = a[2]; a3 = a[3]; a4 = a[4]; - - t0 = (int64_t)a0 << 1; - t1 = (int64_t)a1 << 1; - t2 = (int64_t)a2 << 4; - t3 = (int64_t)a3 << 3; - t4 = (int64_t)a4 << 4; - - c0 = (int64_t)a0*a0 + (int64_t)t4*a1 + (int64_t)t2*a3; - c1 = (int64_t)t0*a1 + (int64_t)t3*a3 + (int64_t)t4*a2; - c2 = (int64_t)t0*a2 + (int64_t)a1*a1 + (int64_t)t4*a3; - c3 = (int64_t)t0*a3 + (int64_t)t1*a2 + ((int64_t)a4 << 3)*a4; - c4 = (int64_t)t0*a4 + (int64_t)t1*a3 + (int64_t)a2*a2; -#endif - - r0 = c0 & mask_26; - c1 += c0 >> 26; r1 = c1 & mask_26; - c2 += c1 >> 26; c[2] = c2 & mask_26; - c3 += c2 >> 26; c[3] = c3 & mask_26; - c4 += c3 >> 26; c[4] = c4 & mask_23; -// c4 += c3 >> 26; c[4] = c4 & mask_26; - - c0 = r0 + (c4 >> 23); -// c0 = r0 + ((c4 >> 26) << 3); - c[0] = (int32_t)c0 & mask_26; - c[1] = r1 + (int32_t)(c0 >> 26); -} - - -void vmod1271_incomplete(velm_t a, velm_t c) -{ // Reduce field element modulo 2^127-1 - // Redundant representation: 23/26/26/26/26-bit - // Output is in the range [0, 2^127-1] - int32_t t0, t1, t2, t3, t4; - uint32_t rem; - - t0 = a[0]; t1 = a[1]; t2 = a[2]; t3 = a[3]; t4 = a[4]; - - // Carry propagation - t1 += (t0 >> 26); t0 &= mask_26; - t2 += (t1 >> 26); t1 &= mask_26; - t3 += (t2 >> 26); t2 &= mask_26; - t4 += (t3 >> 26); t3 &= mask_26; - rem = (t4 >> 23); t4 &= mask_23; - - // Correction - t0 += rem; - t1 += (t0 >> 26); t0 &= mask_26; - t2 += (t1 >> 26); t1 &= mask_26; - t3 += (t2 >> 26); t2 &= mask_26; - t4 += (t3 >> 26); t3 &= mask_26; - rem = (t4 >> 23); t4 &= mask_23; - t0 += rem; - - c[0] = t0; c[1] = t1; c[2] = t2; c[3] = t3; c[4] = t4; -} - - -void vmod1271(velm_t a, 
velm_t c) -{ // Reduce field element modulo 2^127-1 - // Output is in the range [0, 2^127-2] - // Redundant representation: 23/26/26/26/26-bit - int32_t t0, t1, t2, t3, t4; - uint32_t mask, rem; - - t0 = a[0]; t1 = a[1]; t2 = a[2]; t3 = a[3]; t4 = a[4]; - - // First carry propagation - t1 += (t0 >> 26); t0 &= mask_26; - t2 += (t1 >> 26); t1 &= mask_26; - t3 += (t2 >> 26); t2 &= mask_26; - t4 += (t3 >> 26); t3 &= mask_26; - rem = (t4 >> 23); t4 &= mask_23; - - // First correction adding rem+1 - t0 += rem + 1; - t1 += (t0 >> 26); t0 &= mask_26; - t2 += (t1 >> 26); t1 &= mask_26; - t3 += (t2 >> 26); t2 &= mask_26; - t4 += (t3 >> 26); t3 &= mask_26; - rem = (t4 >> 23); t4 &= mask_23; - - // If final carry = 0 then subtract 1 - mask = rem - 1; - t0 -= (mask & 1); - t1 += (t0 >> 26); t0 &= mask_26; - t2 += (t1 >> 26); t1 &= mask_26; - t3 += (t2 >> 26); t2 &= mask_26; - t4 += (t3 >> 26); t3 &= mask_26; - - c[0] = t0; c[1] = t1; c[2] = t2; c[3] = t3; c[4] = t4; -} - - -__inline void vexp1251(felm_t a, felm_t af) -{ // Exponentiation over GF(p), af = a^(125-1) - int i; - velm_t t1, t2, t3, t4, t5; - - vsqr1271(a, t2); - vmul1271(a, t2, t2); - vsqr1271(t2, t3); - vsqr1271(t3, t3); - vmul1271(t2, t3, t3); - vsqr1271(t3, t4); - vsqr1271(t4, t4); - vsqr1271(t4, t4); - vsqr1271(t4, t4); - vmul1271(t3, t4, t4); - vsqr1271(t4, t5); - for (i = 0; i<7; i++) vsqr1271(t5, t5); - vmul1271(t4, t5, t5); - vsqr1271(t5, t2); - for (i = 0; i<15; i++) vsqr1271(t2, t2); - vmul1271(t5, t2, t2); - vsqr1271(t2, t1); - for (i = 0; i<31; i++) vsqr1271(t1, t1); - vmul1271(t2, t1, t1); - for (i = 0; i<32; i++) vsqr1271(t1, t1); - vmul1271(t1, t2, t1); - for (i = 0; i<16; i++) vsqr1271(t1, t1); - vmul1271(t5, t1, t1); - for (i = 0; i<8; i++) vsqr1271(t1, t1); - vmul1271(t4, t1, t1); - for (i = 0; i<4; i++) vsqr1271(t1, t1); - vmul1271(t3, t1, t1); - vsqr1271(t1, t1); - vmul1271(a, t1, af); -} - - -void vinv1271(felm_t a) -{ // Field inversion, af = a^-1 = a^(p-2) mod p - // Hardcoded for p = 2^127-1 - velm_t t; - - vexp1251(a, t); - vsqr1271(t, t); - vsqr1271(t, t); - vmul1271(a, t, a); -} - - -void from_std_to_ext(f2elm_t a, v2elm_t c) -{ // Expand GF(p^2) element represented with two 4 32-bit digits to 23/26/26/26/26/23/26/26/26/26-bit vector representation - // Assumes fully reduced input in [0, 2^127-1] - const uint32_t mask_8 = ((uint32_t)1 << 8) - 1; - const uint32_t mask_14 = ((uint32_t)1 << 14) - 1; - const uint32_t mask_20 = ((uint32_t)1 << 20) - 1; - - c[0] = a[0][0] & mask_26; - c[1] = (a[0][0] >> 26) | ((a[0][1] & mask_20) << 6); - c[2] = (a[0][1] >> 20) | ((a[0][2] & mask_14) << 12); - c[3] = (a[0][2] >> 14) | ((a[0][3] & mask_8 ) << 18); - c[4] = (a[0][3] >> 8) & mask_23; - - c[5] = a[1][0] & mask_26; - c[6] = (a[1][0] >> 26) | ((a[1][1] & mask_20) << 6); - c[7] = (a[1][1] >> 20) | ((a[1][2] & mask_14) << 12); - c[8] = (a[1][2] >> 14) | ((a[1][3] & mask_8 ) << 18); - c[9] = (a[1][3] >> 8) & mask_23; -} - - -void from_ext_to_std(v2elm_t a, f2elm_t c) -{ // Contract GF(p^2) element in 23/26/26/26/26/23/26/26/26/26-bit vector representation to two 4 32-bit digits - // Assumes fully reduced input in [0, 2^127-1] - - c[0][0] = (a[1] << 26) | a[0]; - c[0][1] = (a[2] << 20) | (a[1] >> 6); - c[0][2] = (a[3] << 14) | (a[2] >> 12); - c[0][3] = (a[4] << 8) | (a[3] >> 18); - - c[1][0] = (a[6] << 26) | a[5]; - c[1][1] = (a[7] << 20) | (a[6] >> 6); - c[1][2] = (a[8] << 14) | (a[7] >> 12); - c[1][3] = (a[9] << 8) | (a[8] >> 18); -} - - -void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) -{ 
// Schoolbook multiprecision multiply, c = a*b - unsigned int i, j; - digit_t u, v, UV[2]; - unsigned int carry = 0; - - for (i = 0; i < (2*nwords); i++) c[i] = 0; - - for (i = 0; i < nwords; i++) { - u = 0; - for (j = 0; j < nwords; j++) { - MUL(a[i], b[j], UV+1, UV[0]); - ADDC(0, UV[0], u, carry, v); - u = UV[1] + carry; - ADDC(0, c[i+j], v, carry, v); - u = u + carry; - c[i+j] = v; - } - c[nwords+i] = u; - } -} - - -unsigned int mp_add(digit_t* a, digit_t* b, digit_t* c, unsigned int nwords) -{ // Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit - unsigned int i, carry = 0; - - for (i = 0; i < nwords; i++) { - ADDC(carry, a[i], b[i], carry, c[i]); - } - - return carry; -} - - -static __inline void multiply(const digit_t* a, const digit_t* b, digit_t* c) -{ // Schoolbook multiprecision multiply, c = a*b - - mp_mul(a, b, c, NWORDS_ORDER); -} - - -static __inline unsigned int add(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) -{ // Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit - unsigned int i, carry = 0; - - for (i = 0; i < nwords; i++) { - ADDC(carry, a[i], b[i], carry, c[i]); - } - - return carry; -} - - -unsigned int subtract(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) -{ // Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit - unsigned int i, borrow = 0; - - for (i = 0; i < nwords; i++) { - SUBC(borrow, a[i], b[i], borrow, c[i]); - } - - return borrow; -} - - -void subtract_mod_order(const digit_t* a, const digit_t* b, digit_t* c) -{ // Subtraction modulo the curve order, c = a-b mod order - digit_t mask, carry = 0; - digit_t* order = (digit_t*)curve_order; - unsigned int i, bout; - - bout = subtract(a, b, c, NWORDS_ORDER); // (bout, c) = a - b - mask = 0 - (digit_t)bout; // if bout = 0 then mask = 0x00..0, else if bout = 1 then mask = 0xFF..F - - for (i = 0; i < NWORDS_ORDER; i++) { // c = c + (mask & order) - ADDC(carry, c[i], mask & order[i], carry, c[i]); - } -} - - -void add_mod_order(const digit_t* a, const digit_t* b, digit_t* c) -{ // Addition modulo the curve order, c = a+b mod order - - add(a, b, c, NWORDS_ORDER); // c = a + b - subtract_mod_order(c, (digit_t*)&curve_order, c); // if c >= order then c = c - order -} - - -void Montgomery_multiply_mod_order(const digit_t* ma, const digit_t* mb, digit_t* mc) -{ // 256-bit Montgomery multiplication modulo the curve order, mc = ma*mb*r' mod order, where ma,mb,mc in [0, order-1] - // ma, mb and mc are assumed to be in Montgomery representation - // The Montgomery constant r' = -r^(-1) mod 2^(log_2(r)) is the global value "Montgomery_rprime", where r is the order - unsigned int i; - digit_t mask, P[2*NWORDS_ORDER], Q[2*NWORDS_ORDER], temp[2*NWORDS_ORDER]; - digit_t* order = (digit_t*)curve_order; - unsigned int cout = 0, bout = 0; - - multiply(ma, mb, P); // P = ma * mb - multiply(P, (digit_t*)&Montgomery_rprime, Q); // Q = P * r' mod 2^(log_2(r)) - multiply(Q, (digit_t*)&curve_order, temp); // temp = Q * r - cout = add(P, temp, temp, 2*NWORDS_ORDER); // (cout, temp) = P + Q * r - - for (i = 0; i < NWORDS_ORDER; i++) { // (cout, mc) = (P + Q * r)/2^(log_2(r)) - mc[i] = temp[NWORDS_ORDER + i]; - } - - // Final, constant-time subtraction - bout = subtract(mc, (digit_t*)&curve_order, mc, NWORDS_ORDER); // (cout, mc) = (cout, mc) - r - mask = (digit_t)cout - (digit_t)bout; // if (cout, mc) >= 0 then mask = 0x00..0, else if (cout, mc) < 0 then mask = 0xFF..F - 
- for (i = 0; i < NWORDS_ORDER; i++) { // temp = mask & r - temp[i] = (order[i] & mask); - } - add(mc, temp, mc, NWORDS_ORDER); // mc = mc + (mask & r) - - return; -} - - -void modulo_order(digit_t* a, digit_t* c) -{ // Reduction modulo the order using Montgomery arithmetic - // ma = a*Montgomery_Rprime mod r, where a,ma in [0, r-1], a,ma,r < 2^256 - // c = ma*1*Montgomery_Rprime^(-1) mod r, where ma,c in [0, r-1], ma,c,r < 2^256 - digit_t ma[NWORDS_ORDER], one[NWORDS_ORDER] = {0}; - - one[0] = 1; - Montgomery_multiply_mod_order(a, (digit_t*)&Montgomery_Rprime, ma); - Montgomery_multiply_mod_order(ma, one, c); -} - - -void conversion_to_odd(digit_t* k, digit_t* k_odd) -{ // Convert scalar to odd if even using the prime subgroup order r - digit_t mask; - digit_t* order = (digit_t*)curve_order; - unsigned int i, carry = 0; - - mask = ~(0 - (k[0] & 1)); - - for (i = 0; i < NWORDS_ORDER; i++) { // If (k is odd) then k_odd = k else k_odd = k + r - ADDC(carry, order[i] & mask, k[i], carry, k_odd[i]); - } -} - - -__inline void vdiv1271(uint32_t* a) -{ // GF(p) division by two, c = a/2 mod p - // Redundant representation: 23/26/26/26/26-bit - digit_t mask; - - mask = (0 - (a[0] & 1)) >> 6; // if a[0] is odd then mask = 2^26-1, else mask = 0 - - a[0] += mask; - a[1] += mask; - a[2] += mask; - a[3] += mask; - a[4] += (mask >> 3); - - a[0] = ((sdigit_t)a[0] >> 1) + ((a[1] & 1) << 25); - a[1] = ((sdigit_t)a[1] >> 1) + ((a[2] & 1) << 25); - a[2] = ((sdigit_t)a[2] >> 1) + ((a[3] & 1) << 25); - a[3] = ((sdigit_t)a[3] >> 1) + ((a[4] & 1) << 25); - a[4] = ((sdigit_t)a[4] >> 1); -} - - -#ifdef __cplusplus -} -#endif - - -#endif diff --git a/ffi-deps/FourQlib/FourQ_32bit/kex.c b/ffi-deps/FourQlib/FourQ_32bit/kex.c deleted file mode 100644 index e4a03cf..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/kex.c +++ /dev/null @@ -1,181 +0,0 @@ -/******************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: Diffie-Hellman key exchange based on FourQ -* option 1: co-factor ecdh using compressed 32-byte public keys, -* (see https://datatracker.ietf.org/doc/draft-ladd-cfrg-4q/). -* option 2: co-factor ecdh using uncompressed, 64-byte public keys. -*********************************************************************************/ - -#include "FourQ_internal.h" -#include "FourQ_params.h" -#include "../random/random.h" -#include - - -static __inline bool is_neutral_point(point_t P) -{ // Is P the neutral point (0,1)? - // SECURITY NOTE: this function does not run in constant time (input point P is assumed to be public). - - if (is_zero_ct((digit_t*)P->x, 2*NWORDS_FIELD) && is_zero_ct(&((digit_t*)P->y)[1], 2*NWORDS_FIELD-1) && is_digit_zero_ct(P->y[0][0] - 1)) { - return true; - } - return false; -} - - -/*************** ECDH USING COMPRESSED, 32-BYTE PUBLIC KEYS ***************/ - -ECCRYPTO_STATUS CompressedPublicKeyGeneration(const unsigned char* SecretKey, unsigned char* PublicKey) -{ // Compressed public key generation for key exchange - // It produces a public key PublicKey, which is the encoding of P = SecretKey*G (G is the generator). 
- // Input: 32-byte SecretKey - // Output: 32-byte PublicKey - point_t P; - - ecc_mul_fixed((digit_t*)SecretKey, P); // Compute public key - encode(P, PublicKey); // Encode public key - - return ECCRYPTO_SUCCESS; -} - - -ECCRYPTO_STATUS CompressedKeyGeneration(unsigned char* SecretKey, unsigned char* PublicKey) -{ // Keypair generation for key exchange. Public key is compressed to 32 bytes - // It produces a private key SecretKey and a public key PublicKey, which is the encoding of P = SecretKey*G (G is the generator). - // Outputs: 32-byte SecretKey and 32-byte PublicKey - ECCRYPTO_STATUS Status = ECCRYPTO_ERROR_UNKNOWN; - - Status = RandomBytesFunction(SecretKey, 32); - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - Status = CompressedPublicKeyGeneration(SecretKey, PublicKey); - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - return ECCRYPTO_SUCCESS; - -cleanup: - clear_words((unsigned int*)SecretKey, 256/(sizeof(unsigned int)*8)); - clear_words((unsigned int*)PublicKey, 256/(sizeof(unsigned int)*8)); - - return Status; -} - - -ECCRYPTO_STATUS CompressedSecretAgreement(const unsigned char* SecretKey, const unsigned char* PublicKey, unsigned char* SharedSecret) -{ // Secret agreement computation for key exchange using a compressed, 32-byte public key - // The output is the y-coordinate of SecretKey*A, where A is the decoding of the public key PublicKey. - // Inputs: 32-byte SecretKey and 32-byte PublicKey - // Output: 32-byte SharedSecret - point_t A; - ECCRYPTO_STATUS Status = ECCRYPTO_ERROR_UNKNOWN; - - if ((PublicKey[15] & 0x80) != 0) { // Is bit128(PublicKey) = 0? - Status = ECCRYPTO_ERROR_INVALID_PARAMETER; - goto cleanup; - } - - Status = decode(PublicKey, A); // Also verifies that A is on the curve. If it is not, it fails - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - Status = ecc_mul(A, (digit_t*)SecretKey, A, true); - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - if (is_neutral_point(A)) { // Is output = neutral point (0,1)? - Status = ECCRYPTO_ERROR_SHARED_KEY; - goto cleanup; - } - - memmove(SharedSecret, (unsigned char*)A->y, 32); - - return ECCRYPTO_SUCCESS; - -cleanup: - clear_words((unsigned int*)SharedSecret, 256/(sizeof(unsigned int)*8)); - - return Status; -} - - -/*************** ECDH USING UNCOMPRESSED PUBLIC KEYS ***************/ - -ECCRYPTO_STATUS PublicKeyGeneration(const unsigned char* SecretKey, unsigned char* PublicKey) -{ // Public key generation for key exchange - // It produces the public key PublicKey = SecretKey*G, where G is the generator. - // Input: 32-byte SecretKey - // Output: 64-byte PublicKey - - ecc_mul_fixed((digit_t*)SecretKey, (point_affine*)PublicKey); // Compute public key - - return ECCRYPTO_SUCCESS; -} - - -ECCRYPTO_STATUS KeyGeneration(unsigned char* SecretKey, unsigned char* PublicKey) -{ // Keypair generation for key exchange - // It produces a private key SecretKey and computes the public key PublicKey = SecretKey*G, where G is the generator. 
- // Outputs: 32-byte SecretKey and 64-byte PublicKey - ECCRYPTO_STATUS Status = ECCRYPTO_ERROR_UNKNOWN; - - Status = RandomBytesFunction(SecretKey, 32); - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - Status = PublicKeyGeneration(SecretKey, PublicKey); - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - return ECCRYPTO_SUCCESS; - -cleanup: - clear_words((unsigned int*)SecretKey, 256/(sizeof(unsigned int)*8)); - clear_words((unsigned int*)PublicKey, 512/(sizeof(unsigned int)*8)); - - return Status; -} - - -ECCRYPTO_STATUS SecretAgreement(const unsigned char* SecretKey, const unsigned char* PublicKey, unsigned char* SharedSecret) -{ // Secret agreement computation for key exchange - // The output is the y-coordinate of SecretKey*PublicKey. - // Inputs: 32-byte SecretKey and 64-byte PublicKey - // Output: 32-byte SharedSecret - point_t A; - ECCRYPTO_STATUS Status = ECCRYPTO_ERROR_UNKNOWN; - - if (((PublicKey[15] & 0x80) != 0) || ((PublicKey[31] & 0x80) != 0) || ((PublicKey[47] & 0x80) != 0) || ((PublicKey[63] & 0x80) != 0)) { // Are PublicKey_x[i] and PublicKey_y[i] < 2^127? - Status = ECCRYPTO_ERROR_INVALID_PARAMETER; - goto cleanup; - } - - Status = ecc_mul((point_affine*)PublicKey, (digit_t*)SecretKey, A, true); // Also verifies that PublicKey is a point on the curve. If it is not, it fails - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - if (is_neutral_point(A)) { // Is output = neutral point (0,1)? - Status = ECCRYPTO_ERROR_SHARED_KEY; - goto cleanup; - } - - memmove(SharedSecret, (unsigned char*)A->y, 32); - - return ECCRYPTO_SUCCESS; - -cleanup: - clear_words((unsigned int*)SharedSecret, 256/(sizeof(unsigned int)*8)); - - return Status; -} \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_32bit/makefile b/ffi-deps/FourQlib/FourQ_32bit/makefile deleted file mode 100644 index c74367a..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/makefile +++ /dev/null @@ -1,98 +0,0 @@ -#### Makefile for compilation using GNU GCC or clang on 32-bit processors #### - -OPT=-O3 # Optimization option by default - -CC=gcc -ifeq "$(CC)" "gcc" - COMPILER=gcc -else ifeq "$(CC)" "clang" - COMPILER=clang -endif - -ifeq "$(ARCH)" "x86" - ARCHITECTURE=_X86_ -else ifeq "$(ARCH)" "ARM" - ARCHITECTURE=_ARM_ -endif - -ADDITIONAL_SETTINGS=-fwrapv -fomit-frame-pointer -funroll-loops -march=native -ifeq "$(EXTENDED_SET)" "FALSE" - ADDITIONAL_SETTINGS= -endif - -ifeq "$(ARCH)" "ARM" - ARM_SETTING=-lrt -endif - -USE_ENDOMORPHISMS=-D USE_ENDO -ifeq "$(USE_ENDO)" "FALSE" - USE_ENDOMORPHISMS= -endif - -INLINING_SETTINGS= -ifeq "$(CC)" "gcc" - INLINING_SETTINGS=-finline-functions -finline-limit=100 -endif - -MEM=-D _CACHE_MEM_ -ifeq "$(CACHE_MEM)" "FALSE" - MEM=-D _NO_CACHE_MEM_ -endif - -CFLAGS=-c $(OPT) -D $(ARCHITECTURE) $(ADDITIONAL_SETTINGS) -D __LINUX__ $(USE_ENDOMORPHISMS) $(MEM) $(INLINING_SETTINGS) -LDFLAGS= -OBJECTS=eccp2.o eccp2_no_endo.o crypto_util.o schnorrq.o kex.o sha512.o random.o -OBJECTS_ECC_TEST=ecc_tests.o test_extras.o $(OBJECTS) -OBJECTS_FP_TEST=$(OBJECTS) fp_tests.o test_extras.o -OBJECTS_CRYPTO_TEST=crypto_tests.o $(OBJECTS) test_extras.o -OBJECTS_ALL=$(OBJECTS) $(OBJECTS_ECC_TEST) $(OBJECTS_FP_TEST) $(OBJECTS_CRYPTO_TEST) - -all: crypto_test ecc_test fp_test - -crypto_test: $(OBJECTS_CRYPTO_TEST) - $(CC) -o crypto_test $(OBJECTS_CRYPTO_TEST) $(ARM_SETTING) - -ecc_test: $(OBJECTS_ECC_TEST) - $(CC) -o ecc_test $(OBJECTS_ECC_TEST) $(ARM_SETTING) - -fp_test: $(OBJECTS_FP_TEST) - $(CC) -o fp_test $(OBJECTS_FP_TEST) $(ARM_SETTING) - -eccp2.o: eccp2.c - $(CC) 
$(CFLAGS) eccp2.c - -eccp2_no_endo.o: eccp2_no_endo.c - $(CC) $(CFLAGS) eccp2_no_endo.c - -schnorrq.o: schnorrq.c - $(CC) $(CFLAGS) schnorrq.c - -kex.o: kex.c - $(CC) $(CFLAGS) kex.c - -crypto_util.o: crypto_util.c - $(CC) $(CFLAGS) crypto_util.c - -sha512.o: ../sha512/sha512.c - $(CC) $(CFLAGS) ../sha512/sha512.c - -random.o: ../random/random.c - $(CC) $(CFLAGS) ../random/random.c - -test_extras.o: tests/test_extras.c - $(CC) $(CFLAGS) tests/test_extras.c - -crypto_tests.o: tests/crypto_tests.c - $(CC) $(CFLAGS) tests/crypto_tests.c - -ecc_tests.o: tests/ecc_tests.c - $(CC) $(CFLAGS) tests/ecc_tests.c - -fp_tests.o: tests/fp_tests.c - $(CC) $(CFLAGS) tests/fp_tests.c - -.PHONY: clean - -clean: - rm -f crypto_test ecc_test fp_test $(OBJECTS_ALL) - diff --git a/ffi-deps/FourQlib/FourQ_32bit/schnorrq.c b/ffi-deps/FourQlib/FourQ_32bit/schnorrq.c deleted file mode 100644 index 9905350..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/schnorrq.c +++ /dev/null @@ -1,191 +0,0 @@ -/********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: digital signature SchnorrQ -* -* See "SchnorrQ: Schnorr signatures on FourQ" by Craig Costello and Patrick Longa, -* MSR Technical Report, 2016. Available at: -* https://www.microsoft.com/en-us/research/wp-content/uploads/2016/07/SchnorrQ.pdf. -***********************************************************************************/ - -#include "FourQ_internal.h" -#include "FourQ_params.h" -#include "../random/random.h" -#include "../sha512/sha512.h" -#include -#include - -extern int KangarooTwelveCryptoHashFunction(const unsigned char* input, const unsigned int inputByteLen, unsigned char* output); -#define CryptoHashFunction KangarooTwelveCryptoHashFunction - -ECCRYPTO_STATUS SchnorrQ_KeyGeneration(const unsigned char* SecretKey, unsigned char* PublicKey) -{ // SchnorrQ public key generation - // It produces a public key PublicKey, which is the encoding of P = s*G, where G is the generator and - // s is the output of hashing SecretKey and taking the least significant 32 bytes of the result. - // Input: 32-byte SecretKey - // Output: 32-byte PublicKey - point_t P; - unsigned char k[64]; - ECCRYPTO_STATUS Status = ECCRYPTO_ERROR_UNKNOWN; - - if (CryptoHashFunction(SecretKey, 32, k) != 0) { - Status = ECCRYPTO_ERROR; - goto cleanup; - } - - ecc_mul_fixed((digit_t*)k, P); // Compute public key - encode(P, PublicKey); // Encode public key - - return ECCRYPTO_SUCCESS; - -cleanup: - clear_words((unsigned int*)k, 512/(sizeof(unsigned int)*8)); - clear_words((unsigned int*)PublicKey, 256/(sizeof(unsigned int)*8)); - - return Status; -} - - -ECCRYPTO_STATUS SchnorrQ_FullKeyGeneration(unsigned char* SecretKey, unsigned char* PublicKey) -{ // SchnorrQ keypair generation - // It produces a private key SecretKey and computes the public key PublicKey, which is the encoding of P = s*G, - // where G is the generator and s is the output of hashing SecretKey and taking the least significant 32 bytes of the result. 
- // Outputs: 32-byte SecretKey and 32-byte PublicKey - ECCRYPTO_STATUS Status = ECCRYPTO_ERROR_UNKNOWN; - - Status = RandomBytesFunction(SecretKey, 32); - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - Status = SchnorrQ_KeyGeneration(SecretKey, PublicKey); - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - return ECCRYPTO_SUCCESS; - -cleanup: - clear_words((unsigned int*)SecretKey, 256/(sizeof(unsigned int)*8)); - clear_words((unsigned int*)PublicKey, 256/(sizeof(unsigned int)*8)); - - return Status; -} - -ECCRYPTO_STATUS SchnorrQ_Sign(const unsigned char* SecretKey, const unsigned char* PublicKey, const unsigned char* Message, const unsigned int SizeMessage, unsigned char* Signature) -{ // SchnorrQ signature generation - // It produces the signature Signature of a message Message of size SizeMessage in bytes - // Inputs: 32-byte SecretKey, 32-byte PublicKey, and Message of size SizeMessage in bytes - // Output: 64-byte Signature - point_t R; - unsigned char k[64], r[64], h[64], *temp = NULL; - digit_t* H = (digit_t*)h; - digit_t* S = (digit_t*)(Signature+32); - ECCRYPTO_STATUS Status = ECCRYPTO_ERROR_UNKNOWN; - - if (CryptoHashFunction(SecretKey, 32, k) != 0) { - Status = ECCRYPTO_ERROR; - goto cleanup; - } - - temp = (unsigned char*)calloc(1, SizeMessage+64); - if (temp == NULL) { - Status = ECCRYPTO_ERROR_NO_MEMORY; - goto cleanup; - } - - memmove(temp+32, k+32, 32); - memmove(temp+64, Message, SizeMessage); - - if (CryptoHashFunction(temp+32, SizeMessage+32, r) != 0) { - Status = ECCRYPTO_ERROR; - goto cleanup; - } - - ecc_mul_fixed((digit_t*)r, R); - encode(R, Signature); // Encode lowest 32 bytes of signature - memmove(temp, Signature, 32); - memmove(temp+32, PublicKey, 32); - - if (CryptoHashFunction(temp, SizeMessage+64, h) != 0) { - Status = ECCRYPTO_ERROR; - goto cleanup; - } - modulo_order((digit_t*)r, (digit_t*)r); - modulo_order(H, H); - to_Montgomery((digit_t*)k, S); // Converting to Montgomery representation - to_Montgomery(H, H); // Converting to Montgomery representation - Montgomery_multiply_mod_order(S, H, S); - from_Montgomery(S, S); // Converting back to standard representation - subtract_mod_order((digit_t*)r, S, S); - Status = ECCRYPTO_SUCCESS; - -cleanup: - if (temp != NULL) - free(temp); - clear_words((unsigned int*)k, 512/(sizeof(unsigned int)*8)); - clear_words((unsigned int*)r, 512/(sizeof(unsigned int)*8)); - - return Status; -} - - -ECCRYPTO_STATUS SchnorrQ_Verify(const unsigned char* PublicKey, const unsigned char* Message, const unsigned int SizeMessage, const unsigned char* Signature, unsigned int* valid) -{ // SchnorrQ signature verification - // It verifies the signature Signature of a message Message of size SizeMessage in bytes - // Inputs: 32-byte PublicKey, 64-byte Signature, and Message of size SizeMessage in bytes - // Output: true (valid signature) or false (invalid signature) - point_t A; - unsigned char *temp, h[64]; - unsigned int i; - ECCRYPTO_STATUS Status = ECCRYPTO_ERROR_UNKNOWN; - - *valid = false; - - temp = (unsigned char*)calloc(1, SizeMessage+64); - if (temp == NULL) { - Status = ECCRYPTO_ERROR_NO_MEMORY; - goto cleanup; - } - - if (((PublicKey[15] & 0x80) != 0) || ((Signature[15] & 0x80) != 0) || (Signature[63] != 0) || ((Signature[62] & 0xC0) != 0)) { // Are bit128(PublicKey) = bit128(Signature) = 0 and Signature+32 < 2^246? - Status = ECCRYPTO_ERROR_INVALID_PARAMETER; - goto cleanup; - } - - Status = decode(PublicKey, A); // Also verifies that A is on the curve. 
If it is not, it fails - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - memmove(temp, Signature, 32); - memmove(temp+32, PublicKey, 32); - memmove(temp+64, Message, SizeMessage); - - if (CryptoHashFunction(temp, SizeMessage+64, h) != 0) { - Status = ECCRYPTO_ERROR; - goto cleanup; - } - - Status = ecc_mul_double((digit_t*)(Signature+32), A, (digit_t*)h, A); - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - encode(A, (unsigned char*)A); - - for (i = 0; i < NWORDS_ORDER; i++) { - if (((digit_t*)A)[i] != ((digit_t*)Signature)[i]) { - goto cleanup; - } - } - *valid = true; - -cleanup: - if (temp != NULL) - free(temp); - - return Status; -} \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_32bit/table_lookup.h b/ffi-deps/FourQlib/FourQ_32bit/table_lookup.h deleted file mode 100644 index 4a888e4..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/table_lookup.h +++ /dev/null @@ -1,167 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: table lookup functions -************************************************************************************/ - -#ifndef __TABLE_LOOKUP_H__ -#define __TABLE_LOOKUP_H__ - - -// For C++ -#ifdef __cplusplus -extern "C" { -#endif - - -#include "FourQ_internal.h" - - -void table_lookup_1x8(vpoint_extproj_precomp_t* table, vpoint_extproj_precomp_t P, unsigned int digit, unsigned int sign_mask) -{ // Constant-time table lookup to extract a point represented as (X+Y,Y-X,2Z,2dT) corresponding to extended twisted Edwards coordinates (X:Y:Z:T) - // Inputs: sign_mask, digit, table containing 8 points - // Output: P = sign*table[digit], where sign=1 if sign_mask=0xFF...FF and sign=-1 if sign_mask=0 - -#if defined(NO_CACHE_MEM) - vpoint_extproj_precomp_t temp_point[2]; - - ecccopy_precomp(table[digit], temp_point[1]); // temp_point[1] = table[digit] - v2copy1271(temp_point[1]->xy, temp_point[0]->yx); // temp_point[0] = -table[digit], i.e., coordinates (y-x,x+y,2*z,-2dt) - v2copy1271(temp_point[1]->yx, temp_point[0]->xy); - v2copy1271(temp_point[1]->t2, temp_point[0]->t2); - v2copy1271(temp_point[1]->z2, temp_point[0]->z2); - vneg1271(&temp_point[0]->t2[0]); - vneg1271(&temp_point[0]->t2[VWORDS_FIELD]); - ecccopy_precomp(temp_point[sign_mask & 1], P); - -#else - vpoint_extproj_precomp_t point, temp_point; - unsigned int i, j; - digit_t mask; - - ecccopy_precomp(table[0], point); // point = table[0] - - for (i = 1; i < 8; i++) - { - digit--; - // While digit>=0 mask = 0xFF...F else sign = 0x00...0 - mask = ((digit_t)digit >> (RADIX-1)) - 1; - ecccopy_precomp(table[i], temp_point); // temp_point = table[i] - // If mask = 0x00...0 then point = point, else if mask = 0xFF...F then point = temp_point - point->xy[0] = (mask & (point->xy[0] ^ temp_point->xy[0])) ^ point->xy[0]; - point->xy[1] = (mask & (point->xy[1] ^ temp_point->xy[1])) ^ point->xy[1]; - point->xy[2] = (mask & (point->xy[2] ^ temp_point->xy[2])) ^ point->xy[2]; - point->xy[3] = (mask & (point->xy[3] ^ temp_point->xy[3])) ^ point->xy[3]; - point->xy[4] = (mask & (point->xy[4] ^ temp_point->xy[4])) ^ point->xy[4]; - point->xy[5] = (mask & (point->xy[5] ^ temp_point->xy[5])) ^ point->xy[5]; - point->xy[6] = (mask & (point->xy[6] ^ temp_point->xy[6])) ^ point->xy[6]; - point->xy[7] = (mask & (point->xy[7] ^ temp_point->xy[7])) ^ point->xy[7]; - point->xy[8] = (mask & (point->xy[8] ^ 
temp_point->xy[8])) ^ point->xy[8]; - point->xy[9] = (mask & (point->xy[9] ^ temp_point->xy[9])) ^ point->xy[9]; - point->yx[0] = (mask & (point->yx[0] ^ temp_point->yx[0])) ^ point->yx[0]; - point->yx[1] = (mask & (point->yx[1] ^ temp_point->yx[1])) ^ point->yx[1]; - point->yx[2] = (mask & (point->yx[2] ^ temp_point->yx[2])) ^ point->yx[2]; - point->yx[3] = (mask & (point->yx[3] ^ temp_point->yx[3])) ^ point->yx[3]; - point->yx[4] = (mask & (point->yx[4] ^ temp_point->yx[4])) ^ point->yx[4]; - point->yx[5] = (mask & (point->yx[5] ^ temp_point->yx[5])) ^ point->yx[5]; - point->yx[6] = (mask & (point->yx[6] ^ temp_point->yx[6])) ^ point->yx[6]; - point->yx[7] = (mask & (point->yx[7] ^ temp_point->yx[7])) ^ point->yx[7]; - point->yx[8] = (mask & (point->yx[8] ^ temp_point->yx[8])) ^ point->yx[8]; - point->yx[9] = (mask & (point->yx[9] ^ temp_point->yx[9])) ^ point->yx[9]; - point->z2[0] = (mask & (point->z2[0] ^ temp_point->z2[0])) ^ point->z2[0]; - point->z2[1] = (mask & (point->z2[1] ^ temp_point->z2[1])) ^ point->z2[1]; - point->z2[2] = (mask & (point->z2[2] ^ temp_point->z2[2])) ^ point->z2[2]; - point->z2[3] = (mask & (point->z2[3] ^ temp_point->z2[3])) ^ point->z2[3]; - point->z2[4] = (mask & (point->z2[4] ^ temp_point->z2[4])) ^ point->z2[4]; - point->z2[5] = (mask & (point->z2[5] ^ temp_point->z2[5])) ^ point->z2[5]; - point->z2[6] = (mask & (point->z2[6] ^ temp_point->z2[6])) ^ point->z2[6]; - point->z2[7] = (mask & (point->z2[7] ^ temp_point->z2[7])) ^ point->z2[7]; - point->z2[8] = (mask & (point->z2[8] ^ temp_point->z2[8])) ^ point->z2[8]; - point->z2[9] = (mask & (point->z2[9] ^ temp_point->z2[9])) ^ point->z2[9]; - point->t2[0] = (mask & (point->t2[0] ^ temp_point->t2[0])) ^ point->t2[0]; - point->t2[1] = (mask & (point->t2[1] ^ temp_point->t2[1])) ^ point->t2[1]; - point->t2[2] = (mask & (point->t2[2] ^ temp_point->t2[2])) ^ point->t2[2]; - point->t2[3] = (mask & (point->t2[3] ^ temp_point->t2[3])) ^ point->t2[3]; - point->t2[4] = (mask & (point->t2[4] ^ temp_point->t2[4])) ^ point->t2[4]; - point->t2[5] = (mask & (point->t2[5] ^ temp_point->t2[5])) ^ point->t2[5]; - point->t2[6] = (mask & (point->t2[6] ^ temp_point->t2[6])) ^ point->t2[6]; - point->t2[7] = (mask & (point->t2[7] ^ temp_point->t2[7])) ^ point->t2[7]; - point->t2[8] = (mask & (point->t2[8] ^ temp_point->t2[8])) ^ point->t2[8]; - point->t2[9] = (mask & (point->t2[9] ^ temp_point->t2[9])) ^ point->t2[9]; - } - - v2copy1271(point->t2, temp_point->t2); - v2copy1271(point->xy, temp_point->yx); // point: x+y,y-x,2dt coordinate, temp_point: y-x,x+y,-2dt coordinate - v2copy1271(point->yx, temp_point->xy); - vneg1271(&temp_point->t2[0]); // Negate 2dt coordinate - vneg1271(&temp_point->t2[VWORDS_FIELD]); - for (j = 0; j < 2*VWORDS_FIELD; j++) { // If sign_mask = 0 then choose negative of the point - point->xy[j] = ((digit_t)((int)sign_mask) & (point->xy[j] ^ temp_point->xy[j])) ^ temp_point->xy[j]; - point->yx[j] = ((digit_t)((int)sign_mask) & (point->yx[j] ^ temp_point->yx[j])) ^ temp_point->yx[j]; - point->t2[j] = ((digit_t)((int)sign_mask) & (point->t2[j] ^ temp_point->t2[j])) ^ temp_point->t2[j]; - } - ecccopy_precomp(point, P); -#endif -} - - -void table_lookup_fixed_base(vpoint_precomp_t* table, vpoint_precomp_t P, unsigned int digit, unsigned int sign) -{ // Constant-time table lookup to extract a point represented as (x+y,y-x,2t) corresponding to extended twisted Edwards coordinates (X:Y:Z:T) with Z=1 - // Inputs: sign, digit, table containing VPOINTS_FIXEDBASE = 2^(W_FIXEDBASE-1) points - // Output: if sign=0 
then P = table[digit], else if (sign=-1) then P = -table[digit] - -#if defined(NO_CACHE_MEM) - vpoint_precomp_t temp_point[2]; - - ecccopy_precomp_fixed_base(table[digit], temp_point[0]); // temp_point[0] = table[digit] - v2copy1271(temp_point[0]->xy, temp_point[1]->yx); // temp_point[1] = -table[digit], i.e., coordinates (y-x,x+y,2*z,-2dt) - v2copy1271(temp_point[0]->yx, temp_point[1]->xy); - v2copy1271(temp_point[0]->t2, temp_point[1]->t2); - vneg1271(&temp_point[1]->t2[0]); - vneg1271(&temp_point[1]->t2[VWORDS_FIELD]); - ecccopy_precomp_fixed_base(temp_point[sign & 1], P); - -#else - vpoint_precomp_t point, temp_point; - unsigned int i, j; - digit_t mask; - - ecccopy_precomp_fixed_base(table[0], point); // point = table[0] - - for (i = 1; i < VPOINTS_FIXEDBASE; i++) - { - digit--; - // While digit>=0 mask = 0xFF...F else sign = 0x00...0 - mask = ((digit_t)digit >> (RADIX-1)) - 1; - ecccopy_precomp_fixed_base(table[i], temp_point); // temp_point = table[i] - // If mask = 0x00...0 then point = point, else if mask = 0xFF...F then point = temp_point - for (j = 0; j < 2*VWORDS_FIELD; j++) { - point->xy[j] = (mask & (point->xy[j] ^ temp_point->xy[j])) ^ point->xy[j]; - point->yx[j] = (mask & (point->yx[j] ^ temp_point->yx[j])) ^ point->yx[j]; - point->t2[j] = (mask & (point->t2[j] ^ temp_point->t2[j])) ^ point->t2[j]; - } - } - - v2copy1271(point->t2, temp_point->t2); - v2copy1271(point->xy, temp_point->yx); // point: x+y,y-x,2dt coordinate, temp_point: y-x,x+y,-2dt coordinate - v2copy1271(point->yx, temp_point->xy); - vneg1271(&temp_point->t2[0]); // Negate 2dt coordinate - vneg1271(&temp_point->t2[VWORDS_FIELD]); - for (j = 0; j < 2*VWORDS_FIELD; j++) { // If sign = 0xFF...F then choose negative of the point - point->xy[j] = ((digit_t)((int)sign) & (point->xy[j] ^ temp_point->xy[j])) ^ point->xy[j]; - point->yx[j] = ((digit_t)((int)sign) & (point->yx[j] ^ temp_point->yx[j])) ^ point->yx[j]; - point->t2[j] = ((digit_t)((int)sign) & (point->t2[j] ^ temp_point->t2[j])) ^ point->t2[j]; - } - ecccopy_precomp_fixed_base(point, P); -#endif -} - - -#ifdef __cplusplus -} -#endif - - -#endif diff --git a/ffi-deps/FourQlib/FourQ_32bit/tests/crypto_tests.c b/ffi-deps/FourQlib/FourQ_32bit/tests/crypto_tests.c deleted file mode 100644 index a4a83db..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/tests/crypto_tests.c +++ /dev/null @@ -1,368 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. 
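/* [Editorial sketch -- not part of the original FourQlib sources or of this patch]
 * The lookups in table_lookup.h above avoid secret-dependent branches and addressing by turning
 * "have we passed the wanted index?" into an all-ones/all-zeros mask and blending every table
 * entry with XOR/AND. The same idea over plain 32-bit words (a standalone illustration; names
 * are illustrative and the word size is fixed to 32 bits here):
 */
#include <stdint.h>

static uint32_t ct_select_u32(const uint32_t table[8], unsigned int digit)
{   /* returns table[digit] for digit in [0,7], reading all 8 entries regardless of digit */
    uint32_t result = table[0];
    for (unsigned int i = 1; i < 8; i++) {
        digit--;                                         /* wraps around once i exceeds the wanted index */
        uint32_t mask = ((uint32_t)digit >> 31) - 1;     /* all ones while digit has not wrapped, else 0 */
        result = (mask & (result ^ table[i])) ^ result;  /* mask = 1s: take table[i]; mask = 0: keep result */
    }
    return result;
}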
-* -* Abstract: testing code for cryptographic functions based on FourQ -************************************************************************************/ - -#include "../FourQ_api.h" -#include "../FourQ_params.h" -#include "test_extras.h" -#include - - -// Benchmark and test parameters -#if defined(GENERIC_IMPLEMENTATION) - #define BENCH_LOOPS 100 // Number of iterations per bench - #define TEST_LOOPS 100 // Number of iterations per test -#else - #define BENCH_LOOPS 10000 - #define TEST_LOOPS 1000 -#endif - - -ECCRYPTO_STATUS SchnorrQ_test() -{ // Test the SchnorrQ digital signature scheme - int n, passed; - void *msg = NULL; - unsigned int len, valid = false; - unsigned char SecretKey[32], PublicKey[32], Signature[64]; - ECCRYPTO_STATUS Status = ECCRYPTO_SUCCESS; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Testing the SchnorrQ signature scheme: \n\n"); - - passed = 1; - for (n = 0; n < TEST_LOOPS; n++) - { - // Signature key generation - Status = SchnorrQ_FullKeyGeneration(SecretKey, PublicKey); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - - // Signature computation - msg = "a"; - len = 1; - Status = SchnorrQ_Sign(SecretKey, PublicKey, msg, len, Signature); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - - // Valid signature test - Status = SchnorrQ_Verify(PublicKey, msg, len, Signature, &valid); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - if (valid == false) { - passed = 0; - break; - } - - // Invalid signature test (flipping one bit of the message) - msg = "b"; - Status = SchnorrQ_Verify(PublicKey, msg, len, Signature, &valid); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - if (valid == true) { - passed = 0; - break; - } - } - if (passed==1) printf(" Signature tests.................................................................. PASSED"); - else { printf(" Signature tests... FAILED"); printf("\n"); Status = ECCRYPTO_ERROR_SIGNATURE_VERIFICATION; } - printf("\n"); - - return Status; -} - - -ECCRYPTO_STATUS SchnorrQ_run() -{ // Benchmark the SchnorrQ digital signature scheme - int n; - unsigned long long nsec, nsec1, nsec2; - void *msg = NULL; - unsigned int len = 0, valid = false; - unsigned char SecretKey[32], PublicKey[32], Signature[64]; - ECCRYPTO_STATUS Status = ECCRYPTO_SUCCESS; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Benchmarking the SchnorrQ signature scheme: \n\n"); - - nsec = 0; - for (n = 0; n < BENCH_LOOPS; n++) - { - nsec1 = cpu_nseconds(); - Status = SchnorrQ_FullKeyGeneration(SecretKey, PublicKey); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - nsec2 = cpu_nseconds(); - nsec = nsec+(nsec2-nsec1); - } - printf(" SchnorrQ's key generation runs in ............................................... %8lld ", nsec/BENCH_LOOPS); print_unit; - printf("\n"); - - nsec = 0; - for (n = 0; n < BENCH_LOOPS; n++) - { - nsec1 = cpu_nseconds(); - Status = SchnorrQ_Sign(SecretKey, PublicKey, msg, len, Signature); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - nsec2 = cpu_nseconds(); - nsec = nsec+(nsec2-nsec1); - } - printf(" SchnorrQ's signing runs in ...................................................... 
%8lld ", nsec/BENCH_LOOPS); print_unit; - printf("\n"); - - nsec = 0; - for (n = 0; n < BENCH_LOOPS; n++) - { - nsec1 = cpu_nseconds(); - Status = SchnorrQ_Verify(PublicKey, msg, len, Signature, &valid); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - nsec2 = cpu_nseconds(); - nsec = nsec+(nsec2-nsec1); - } - printf(" SchnorrQ's verification runs in ................................................. %8lld ", nsec/BENCH_LOOPS); print_unit; - printf("\n"); - - return Status; -} - - -ECCRYPTO_STATUS compressedkex_test() -{ // Test ECDH key exchange based on FourQ - int n, passed; - unsigned int i; - unsigned char SecretKeyA[32], PublicKeyA[32], SecretAgreementA[32]; - unsigned char SecretKeyB[32], PublicKeyB[32], SecretAgreementB[32]; - ECCRYPTO_STATUS Status = ECCRYPTO_SUCCESS; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Testing DH key exchange using compressed, 32-byte public keys: \n\n"); - - passed = 1; - for (n = 0; n < TEST_LOOPS; n++) - { - // Alice's keypair generation - Status = CompressedKeyGeneration(SecretKeyA, PublicKeyA); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - // Bob's keypair generation - Status = CompressedKeyGeneration(SecretKeyB, PublicKeyB); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - - // Alice's shared secret computation - Status = CompressedSecretAgreement(SecretKeyA, PublicKeyB, SecretAgreementA); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - // Bob's shared secret computation - Status = CompressedSecretAgreement(SecretKeyB, PublicKeyA, SecretAgreementB); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - - for (i = 0; i < 32; i++) { - if (SecretAgreementA[i] != SecretAgreementB[i]) { - passed = 0; - break; - } - } - } - if (passed==1) printf(" DH key exchange tests............................................................ PASSED"); - else { printf(" DH key exchange tests... FAILED"); printf("\n"); Status = ECCRYPTO_ERROR_SHARED_KEY; } - printf("\n"); - - return Status; -} - - -ECCRYPTO_STATUS compressedkex_run() -{ // Benchmark ECDH key exchange based on FourQ - int n; - unsigned long long nsec, nsec1, nsec2; - unsigned char SecretKeyA[32], PublicKeyA[32], SecretAgreementA[32]; - unsigned char SecretKeyB[32], PublicKeyB[32]; - ECCRYPTO_STATUS Status = ECCRYPTO_SUCCESS; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Benchmarking DH key exchange using compressed, 32-byte public keys: \n\n"); - - nsec = 0; - for (n = 0; n < BENCH_LOOPS; n++) - { - nsec1 = cpu_nseconds(); - Status = CompressedKeyGeneration(SecretKeyA, PublicKeyA); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - nsec2 = cpu_nseconds(); - nsec = nsec + (nsec2 - nsec1); - } - printf(" Keypair generation runs in ...................................................... %8lld ", nsec/BENCH_LOOPS); print_unit; - printf("\n"); - - Status = CompressedKeyGeneration(SecretKeyB, PublicKeyB); - nsec = 0; - for (n = 0; n < BENCH_LOOPS; n++) - { - nsec1 = cpu_nseconds(); - Status = CompressedSecretAgreement(SecretKeyA, PublicKeyB, SecretAgreementA); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - nsec2 = cpu_nseconds(); - nsec = nsec + (nsec2 - nsec1); - } - printf(" Secret agreement runs in ........................................................ 
%8lld ", nsec/BENCH_LOOPS); print_unit; - printf("\n"); - - return Status; -} - - -ECCRYPTO_STATUS kex_test() -{ // Test ECDH key exchange based on FourQ - int n, passed; - unsigned int i; - unsigned char SecretKeyA[32], PublicKeyA[64], SecretAgreementA[32]; - unsigned char SecretKeyB[32], PublicKeyB[64], SecretAgreementB[32]; - ECCRYPTO_STATUS Status = ECCRYPTO_SUCCESS; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Testing DH key exchange using uncompressed, 64-byte public keys: \n\n"); - - passed = 1; - for (n = 0; n < TEST_LOOPS; n++) - { - // Alice's keypair generation - Status = KeyGeneration(SecretKeyA, PublicKeyA); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - // Bob's keypair generation - Status = KeyGeneration(SecretKeyB, PublicKeyB); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - - // Alice's shared secret computation - Status = SecretAgreement(SecretKeyA, PublicKeyB, SecretAgreementA); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - // Bob's shared secret computation - Status = SecretAgreement(SecretKeyB, PublicKeyA, SecretAgreementB); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - - for (i = 0; i < 32; i++) { - if (SecretAgreementA[i] != SecretAgreementB[i]) { - passed = 0; - break; - } - } - } - if (passed==1) printf(" DH key exchange tests............................................................ PASSED"); - else { printf(" DH key exchange tests... FAILED"); printf("\n"); Status = ECCRYPTO_ERROR_SHARED_KEY; } - printf("\n"); - - return Status; -} - - -ECCRYPTO_STATUS kex_run() -{ // Benchmark ECDH key exchange based on FourQ - int n; - unsigned long long nsec, nsec1, nsec2; - unsigned char SecretKeyA[32], PublicKeyA[64], SecretAgreementA[32]; - unsigned char SecretKeyB[32], PublicKeyB[64]; - ECCRYPTO_STATUS Status = ECCRYPTO_SUCCESS; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Benchmarking DH key exchange using uncompressed, 64-byte public keys: \n\n"); - - nsec = 0; - for (n = 0; n < BENCH_LOOPS; n++) - { - nsec1 = cpu_nseconds(); - Status = KeyGeneration(SecretKeyA, PublicKeyA); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - nsec2 = cpu_nseconds(); - nsec = nsec + (nsec2 - nsec1); - } - printf(" Keypair generation runs in ...................................................... %8lld ", nsec/BENCH_LOOPS); print_unit; - printf("\n"); - - Status = KeyGeneration(SecretKeyB, PublicKeyB); - nsec = 0; - for (n = 0; n < BENCH_LOOPS; n++) - { - nsec1 = cpu_nseconds(); - Status = SecretAgreement(SecretKeyA, PublicKeyB, SecretAgreementA); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - nsec2 = cpu_nseconds(); - nsec = nsec + (nsec2 - nsec1); - } - printf(" Secret agreement runs in ........................................................ 
%8lld ", nsec/BENCH_LOOPS); print_unit; - printf("\n"); - - return Status; -} - - -int main() -{ - ECCRYPTO_STATUS Status = ECCRYPTO_SUCCESS; - - Status = SchnorrQ_test(); // Test SchnorrQ signature scheme - if (Status != ECCRYPTO_SUCCESS) { - printf("\n\n Error detected: %s \n\n", FourQ_get_error_message(Status)); - return false; - } - Status = SchnorrQ_run(); // Benchmark SchnorrQ signature scheme - if (Status != ECCRYPTO_SUCCESS) { - printf("\n\n Error detected: %s \n\n", FourQ_get_error_message(Status)); - return false; - } - - Status = compressedkex_test(); // Test Diffie-Hellman key exchange using compressed public keys - if (Status != ECCRYPTO_SUCCESS) { - printf("\n\n Error detected: %s \n\n", FourQ_get_error_message(Status)); - return false; - } - Status = compressedkex_run(); // Benchmark Diffie-Hellman key exchange using compressed public keys - if (Status != ECCRYPTO_SUCCESS) { - printf("\n\n Error detected: %s \n\n", FourQ_get_error_message(Status)); - return false; - } - - Status = kex_test(); // Test Diffie-Hellman key exchange using uncompressed public keys - if (Status != ECCRYPTO_SUCCESS) { - printf("\n\n Error detected: %s \n\n", FourQ_get_error_message(Status)); - return false; - } - Status = kex_run(); // Benchmark Diffie-Hellman key exchange using uncompressed public keys - if (Status != ECCRYPTO_SUCCESS) { - printf("\n\n Error detected: %s \n\n", FourQ_get_error_message(Status)); - return false; - } - - return true; -} \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_32bit/tests/ecc_tests.c b/ffi-deps/FourQlib/FourQ_32bit/tests/ecc_tests.c deleted file mode 100644 index f3a411d..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/tests/ecc_tests.c +++ /dev/null @@ -1,656 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: testing code for FourQ's curve arithmetic -************************************************************************************/ - -#include "../FourQ_api.h" -#include "../FourQ_params.h" -#include "../FourQ_tables.h" -#include "test_extras.h" -#include - - -// Benchmark and test parameters -#define BENCH_LOOPS 1000 // Number of iterations per bench -#define SHORT_BENCH_LOOPS 100 // Number of iterations per bench (for expensive operations) -#define TEST_LOOPS 1000 // Number of iterations per test - - -bool ecc_test() -{ - bool clear_cofactor, OK = true; - unsigned int n; - int passed; - point_t A; - vpoint_t VA; - vpoint_extproj_t VP; - vpoint_extproj_precomp_t VQ; - v2elm_t t1; - uint64_t scalar[4], res_x[4], res_y[4]; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Testing FourQ's curve arithmetic: \n\n"); - - // Point doubling - passed = 1; - eccset(A); - point_setup(A, VP); - - for (n=0; nx, A->x); - from_ext_to_std(VA->y, A->y); - - // Result - res_x[0] = 0xC9099C54855859D6; res_x[1] = 0x2C3FD8822C82270F; res_x[2] = 0xA7B3F6E2043E8E68; res_x[3] = 0x4DA5B9E83AA7A1B2; - res_y[0] = 0x3EE089F0EB49AA14; res_y[1] = 0x2001EB3A57688396; res_y[2] = 0x1FEE5617A7E954CD; res_y[3] = 0x0FFDB0D761421F50; - - if (fp2compare64((uint64_t*)A->x, res_x)!=0 || fp2compare64((uint64_t*)A->y, res_y)!=0) passed=0; - if (passed==1) printf(" Point doubling tests .................................................................... 
PASSED"); - else { printf(" Point doubling tests ... FAILED"); printf("\n"); return false; } - printf("\n"); - - // Point addition - eccset(A); - point_setup(A, VP); - - for (n=0; nta, t1); // d*ta - v2add1271(t1, t1, t1); // 2*d*ta - v2mul1271(t1, VP->tb, VQ->t2); // 2*d*t - v2add1271(VP->x, VP->y, VQ->xy); // x+y - v2sub1271(VP->y, VP->x, VQ->yx); // y-x - v2copy1271(VP->z, VQ->z2); - v2add1271(VQ->z2, VQ->z2, VQ->z2); // 2*z - eccadd(VQ, VP); // 2*P - } - eccnorm(VP, VA); - from_ext_to_std(VA->x, A->x); - from_ext_to_std(VA->y, A->y); - - // Result - res_x[0] = 0xC9099C54855859D6; res_x[1] = 0x2C3FD8822C82270F; res_x[2] = 0xA7B3F6E2043E8E68; res_x[3] = 0x4DA5B9E83AA7A1B2; - res_y[0] = 0x3EE089F0EB49AA14; res_y[1] = 0x2001EB3A57688396; res_y[2] = 0x1FEE5617A7E954CD; res_y[3] = 0x0FFDB0D761421F50; - - if (fp2compare64((uint64_t*)A->x, res_x)!=0 || fp2compare64((uint64_t*)A->y, res_y)!=0) passed=0; - - eccset(A); - point_setup(A, VP); - v2mul1271((uint32_t*)&PARAMETER_d, VP->x, t1); // d*x - v2add1271(t1, t1, t1); // 2*d*x - v2mul1271(t1, VP->y, VQ->t2); // 2*d*t - v2add1271(VP->x, VP->y, VQ->xy); // x+y - v2sub1271(VP->y, VP->x, VQ->yx); // y-x - v2zero1271(VQ->z2); VQ->z2[0] = 2; // 2*z - eccdouble(VP); // P = 2P - - for (n=0; nx, A->x); - from_ext_to_std(VA->y, A->y); - - // Result - res_x[0] = 0x6480B1EF0A151DB0; res_x[1] = 0x3E243958590C4D90; res_x[2] = 0xAA270F644A65D473; res_x[3] = 0x5327AF7D84238CD0; - res_y[0] = 0x5E06003D73C43EB1; res_y[1] = 0x3EF69A49CB7E0237; res_y[2] = 0x4E752648AC2EF0AB; res_y[3] = 0x293EB1E26DD23B4E; - - if (fp2compare64((uint64_t*)A->x, res_x)!=0 || fp2compare64((uint64_t*)A->y, res_y)!=0) passed=0; - - if (passed==1) printf(" Point addition tests .................................................................... PASSED"); - else { printf(" Point addition tests ... FAILED"); printf("\n"); return false; } - printf("\n"); - -#if (USE_ENDO == true) - // Psi endomorphism - eccset(A); - point_setup(A, VP); - - for (n=0; nx, A->x); - from_ext_to_std(VA->y, A->y); - - // Result - res_x[0] = 0xD8F3C8C24A2BC7E2; res_x[1] = 0x75AF54EDB41A2B93; res_x[2] = 0x4DE2466701F009A9; res_x[3] = 0x065249F9EDE0C798; - res_y[0] = 0x1C6E119ADD608104; res_y[1] = 0x06DBB85BFFB7C21E; res_y[2] = 0xFD234D6C4CFA3EC1; res_y[3] = 0x060A30903424BF13; - - if (fp2compare64((uint64_t*)A->x, res_x)!=0 || fp2compare64((uint64_t*)A->y, res_y)!=0) passed=0; - - if (passed==1) printf(" Psi endomorphism tests .................................................................. PASSED"); - else { printf(" Psi endomorphism tests ... FAILED"); printf("\n"); return false; } - printf("\n"); - - // Phi endomorphism - eccset(A); - point_setup(A, VP); - - for (n=0; nx, A->x); - from_ext_to_std(VA->y, A->y); - point_setup(A, VP); - } - - // Result - res_x[0] = 0xD5B5A3061287DB16; res_x[1] = 0x5550AAB9E7A620EE; res_x[2] = 0xEC321E6CF33610FC; res_x[3] = 0x3E61EBB9A1CB0210; - res_y[0] = 0x7E2851D5A8E83FB9; res_y[1] = 0x5474BF8EC55603AE; res_y[2] = 0xA5077613491788D5; res_y[3] = 0x5476093DBF8BF6BF; - - if (fp2compare64((uint64_t*)A->x, res_x)!=0 || fp2compare64((uint64_t*)A->y, res_y)!=0) passed=0; - if (passed==1) printf(" Phi endomorphism tests .................................................................. PASSED"); - else { printf(" Phi endomorphism tests ... 
FAILED"); printf("\n"); return false; } - printf("\n"); - - // Scalar decomposition and recoding - { - uint64_t acc1, acc2, acc3, acc4, scalars[4]; - unsigned int digits[65], sign_masks[65]; - uint64_t k[4]; - int i; - - for (n=0; n= 0; i--) - { - acc1 = 2*acc1; acc2 = 2*acc2; acc3 = 2*acc3; acc4 = 2*acc4; - if (sign_masks[i] == (unsigned int)-1) { - acc1 += 1; - acc2 += (digits[i] & 1); - acc3 += ((digits[i] >> 1) & 1); - acc4 += ((digits[i] >> 2) & 1); - } else if (sign_masks[i] == 0) { - acc1 -= 1; - acc2 -= (digits[i] & 1); - acc3 -= ((digits[i] >> 1) & 1); - acc4 -= ((digits[i] >> 2) & 1); - } - } - if (scalar[0] != acc1 || scalar[1] != acc2 || scalar[2] != acc3 || scalar[3] != acc4) { passed=0; break; } - } - - if (passed==1) printf(" Recoding and decomposition tests ........................................................ PASSED"); - else { printf(" Recoding and decomposition tests ... FAILED"); printf("\n"); return false; } - printf("\n"); - } -#endif - - // Scalar multiplication - eccset(A); - clear_cofactor = false; - scalar[0] = 0x3AD457AB55456230; scalar[1] = 0x3A8B3C2C6FD86E0C; scalar[2] = 0x7E38F7C9CFBB9166; scalar[3] = 0x0028FD6CBDA458F0; - - for (n=0; nx, res_x)!=0 || fp2compare64((uint64_t*)A->y, res_y)!=0) passed=0; - - eccset(A); - clear_cofactor = true; - scalar[0] = 0x3AD457AB55456230; scalar[1] = 0x3A8B3C2C6FD86E0C; scalar[2] = 0x7E38F7C9CFBB9166; scalar[3] = 0x0028FD6CBDA458F0; - - for (n=0; nx, res_x)!=0 || fp2compare64((uint64_t*)A->y, res_y)!=0) passed=0; - - if (passed==1) printf(" Scalar multiplication tests ............................................................. PASSED"); - else { printf(" Scalar multiplication tests ... FAILED"); printf("\n"); return false; } - printf("\n"); - - { - point_t A, B, C; - unsigned int j, w, v, e, d; - uint64_t k[4]; - unsigned int digits_fixed[NBITS_ORDER_PLUS_ONE+(W_FIXEDBASE*V_FIXEDBASE)-1] = {0}; - - // Scalar recoding using the mLSB-set representation - w = W_FIXEDBASE; - v = V_FIXEDBASE; - e = E_FIXEDBASE; - d = D_FIXEDBASE; - - for (n=0; nx,(uint64_t*)C->x)!=0 || fp2compare64((uint64_t*)B->y,(uint64_t*)C->y)!=0) { passed=0; break; } - } - - if (passed==1) printf(" Fixed-base scalar multiplication tests .................................................. PASSED"); - else { printf(" Fixed-base scalar multiplication tests ... FAILED"); printf("\n"); return false; } - printf("\n"); - } - - { - point_t PP, QQ, RR, UU, TT; - vpoint_extproj_t VS, VT; - vpoint_extproj_precomp_t VR; - uint64_t k[4], l[4], scalar[4]; - - // Double scalar multiplication - eccset(QQ); - eccset(PP); - - for (n=0; nx, VT->x); - from_std_to_ext(TT->y, VT->y); - v2add1271(VT->x, VT->y, VR->xy); - v2sub1271(VT->y, VT->x, VR->yx); - v2zero1271(VR->z2); VR->z2[0] = 2; - v2mul1271(VT->x, VT->y, VR->t2); - v2add1271(VR->t2, VR->t2, VR->t2); - v2mul1271(VR->t2, (digit_t*)&PARAMETER_d, VR->t2); - - eccadd(VR, VS); - eccnorm(VS, VA); - from_ext_to_std(VA->x, A->x); - from_ext_to_std(VA->y, A->y); - - if (fp2compare64((uint64_t*)A->x,(uint64_t*)RR->x)!=0 || fp2compare64((uint64_t*)A->y,(uint64_t*)RR->y)!=0) { passed=0; break; } - } - - if (passed==1) printf(" Double scalar multiplication tests ...................................................... PASSED"); - else { printf(" Double scalar multiplication tests ... 
FAILED"); printf("\n"); return false; } - printf("\n"); - } - - return OK; -} - - -bool ecc_run() -{ - bool OK = true; - unsigned int n, i, sign_mask=-1, digit=1; - unsigned long long nsec, nsec1, nsec2; - point_t A, B; - vpoint_extproj_t VP; - vpoint_extproj_precomp_t VQ, Table[8]; - v2elm_t t1; - uint64_t scalar[4]; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Benchmarking FourQ's curve arithmetic \n\n"); - - // Point doubling - eccset(A); - point_setup(A, VP); - - nsec = 0; - for (n=0; nx, t1); // d*x - v2add1271(t1, t1, t1); // 2*d*x - v2mul1271(t1, VP->y, VQ->t2); // 2*d*t - v2add1271(VP->x, VP->y, VQ->xy); // x+y - v2sub1271(VP->y, VP->x, VQ->yx); // y-x - v2zero1271(VQ->z2); VQ->z2[0] = 2; // 2*z - eccdouble(VP); // P = 2P - - nsec = 0; - for (n=0; n - - -// Benchmark and test parameters -#define BENCH_LOOPS 10000 // Number of iterations per bench -#define SHORT_BENCH_LOOPS 1000 // Number of iterations per bench (for expensive operations) -#define TEST_LOOPS 1000 // Number of iterations per test - - -bool fp_test() -{ // Tests for the quadratic extension field arithmetic - bool OK = true; - int n, i, passed; - velm_t va, vb, vc, vd, ve, vf, vz = {0}, vo = {0}; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Testing field arithmetic over GF(2^127-1): \n\n"); - - // Field multiplication using p = 2^127-1 - passed = 1; - for (n=0; n - #include -#endif -#if (OS_TARGET == OS_LINUX) && (TARGET == TARGET_ARM) - #include -#endif -#include -#include -#include - - -int64_t cpu_nseconds(void) -{ // Access system counter for benchmarking -#if (OS_TARGET == OS_WIN) && (TARGET == TARGET_x86) - return __rdtsc(); -#elif (OS_TARGET == OS_WIN) && (TARGET == TARGET_ARM) - return __rdpmccntr64(); -#elif (OS_TARGET == OS_LINUX) && (TARGET == TARGET_x86) - unsigned int hi, lo; - - asm volatile ("rdtsc\n\t" : "=a" (lo), "=d"(hi)); - return ((int64_t)lo) | (((int64_t)hi) << 32); -#elif (OS_TARGET == OS_LINUX) && (TARGET == TARGET_ARM) - struct timespec time; - - clock_gettime(CLOCK_REALTIME, &time); - return (int64_t)(time.tv_sec*1e9 + time.tv_nsec); -#else - return 0; -#endif -} - - -int vcompare32(uint32_t* a, uint32_t* b) -{ // Comparing uint32_t digits of two field elements, a=b? : (0) equal, (1) unequal - // NOTE: this function does not have constant-time execution. TO BE USED FOR TESTING ONLY. - unsigned int i; - - for (i = 0; i < VWORDS_FIELD; i++) { - if (a[i] != b[i]) return 1; - } - - return 0; -} - - -int v2compare32(uint32_t* a, uint32_t* b) -{ // Comparing uint32_t digits of two quadratic extension field elements, a=b? : (0) equal, (1) unequal - // NOTE: this function does not have constant-time execution. TO BE USED FOR TESTING ONLY. - unsigned int i; - - for (i = 0; i < 2*VWORDS_FIELD; i++) { - if (a[i] != b[i]) return 1; - } - - return 0; -} - - -int fpcompare64(uint64_t* a, uint64_t* b) -{ // Comparing uint64_t digits of two field elements, a=b? : (0) equal, (1) unequal - // NOTE: this function does not have constant-time execution. TO BE USED FOR TESTING ONLY. - unsigned int i; - - for (i = 0; i < (NWORDS64_FIELD); i++) { - if (a[i] != b[i]) return 1; - } - - return 0; -} - - -int fp2compare64(uint64_t* a, uint64_t* b) -{ // Comparing uint64_t digits of two quadratic extension field elements, ai=bi? : (0) equal, (1) unequal - // NOTE: this function does not have constant-time execution. 
TO BE USED FOR TESTING ONLY. - unsigned int i; - - for (i = 0; i < (2*NWORDS64_FIELD); i++) { - if (a[i] != b[i]) return 1; - } - - return 0; -} - - -void random_scalar_test(uint64_t* a) -{ // Generating a pseudo-random scalar value in [0, 2^256-1] - // NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY. - unsigned char* string = (unsigned char*)&a[0]; - unsigned int i; - - for (i = 0; i < (sizeof(uint64_t)*NWORDS64_ORDER); i++) { - string[i] = (unsigned char)rand(); - } -} - - -void fp2random1271_test(f2elm_t a) -{ // Generating a pseudo-random GF(p^2) element a+b*i, where a,b in [0, 2^127-1] - // NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY. - digit_t mask_7fff = (digit_t)-1 >> 1; - - random_scalar_test((uint64_t*)&a[0]); - a[0][NWORDS_FIELD-1] &= mask_7fff; - a[1][NWORDS_FIELD-1] &= mask_7fff; -} - - -void vrandom1271_test(velm_t a) -{ // Generating a vectorized pseudo-random GF(p) element in [0, 2^127-1] - // NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY. - v2elm_t temp; - - v2random1271_test(temp); - memmove((unsigned char*)a, (unsigned char*)&temp[0], 20); -} - - -void v2random1271_test(v2elm_t a) -{ // Generating a vectorized pseudo-random GF(p^2) element a+b*i, where a,b in [0, 2^127-1] - // NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY. - digit_t mask_7fff = (digit_t)-1 >> 1; - f2elm_t temp; - - random_scalar_test((uint64_t*)&temp[0]); - temp[0][NWORDS_FIELD-1] &= mask_7fff; - temp[1][NWORDS_FIELD-1] &= mask_7fff; - from_std_to_ext(temp, a); -} - - -bool verify_mLSB_recoding(uint64_t* scalar, int* digits) -{ // Verification of the mLSB-set's recoding algorithm used in fixed-base scalar multiplication - unsigned int j, l = L_FIXEDBASE, d = D_FIXEDBASE; - uint64_t temp, temp2, carry, borrow, generated_scalar[NWORDS64_ORDER] = {0}; - int i, digit; - - for (i = (l-1); i >= 0; i--) - { - // Shift generated scalar to the left by 1 (multiply by 2) - temp = ((generated_scalar[0] >> (RADIX64-1)) & 1) ; - generated_scalar[0] = generated_scalar[0] << 1; - - for (j = 1; j < NWORDS64_ORDER; j++) { - temp2 = ((generated_scalar[j] >> (RADIX64-1)) & 1) ; - generated_scalar[j] = (generated_scalar[j] << 1) | temp; - temp = temp2; - } - - // generated scalar + digit_i - if (i < (int)d) { - digit = digits[i] | 1; - if (digit >= 0) { - generated_scalar[0] = generated_scalar[0] + digit; - carry = (generated_scalar[0] < (unsigned int)digit); - for (j = 1; j < NWORDS64_ORDER; j++) - { - generated_scalar[j] = generated_scalar[j] + carry; - carry = (generated_scalar[j] < carry); - } - } else { - borrow = 0; - temp = (uint64_t)(-digit); - for (j = 0; j < NWORDS64_ORDER; j++) - { - temp2 = generated_scalar[j] - temp; - carry = (generated_scalar[j] < temp); - generated_scalar[j] = temp2 - borrow; - borrow = carry || (temp2 < borrow); - temp = 0; - } - } - } else { - digit = digits[i]*(digits[i-(i/d)*d] | 1); - if (digit >= 0) { - generated_scalar[0] = generated_scalar[0] + digit; - carry = (generated_scalar[0] < (unsigned int)digit); - for (j = 1; j < NWORDS64_ORDER; j++) - { - generated_scalar[j] = generated_scalar[j] + carry; - carry = (generated_scalar[j] < carry); - } - } else { - borrow = 0; - temp = (uint64_t)(-digit); - for (j = 0; j < NWORDS64_ORDER; j++) - { - temp2 = generated_scalar[j] - temp; - carry = (generated_scalar[j] < temp); - generated_scalar[j] = temp2 - borrow; - borrow = carry || (temp2 < borrow); - temp = 0; - } - } - } - } - - for (j = 0; j < NWORDS64_ORDER; j++) - { - if (scalar[j] != 
generated_scalar[j]) - return false; - } - - return true; -} diff --git a/ffi-deps/FourQlib/FourQ_32bit/tests/test_extras.h b/ffi-deps/FourQlib/FourQ_32bit/tests/test_extras.h deleted file mode 100644 index 42d7605..0000000 --- a/ffi-deps/FourQlib/FourQ_32bit/tests/test_extras.h +++ /dev/null @@ -1,62 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: utility header file for tests -************************************************************************************/ - -#ifndef __TEST_EXTRAS_H__ -#define __TEST_EXTRAS_H__ - - -// For C++ -#ifdef __cplusplus -extern "C" { -#endif - - -#if TARGET == TARGET_ARM - #define print_unit printf("nsec"); -#else - #define print_unit printf("cycles"); -#endif - - -// Access system counter for benchmarking -int64_t cpu_nseconds(void); - -// Comparing uint32_t digits of two field elements, a=b? : (0) equal, (1) unequal -int vcompare32(uint32_t* a, uint32_t* b); - -// Comparing uint32_t digits of two quadratic extension field elements, ai=bi? : (0) equal, (1) unequal -int v2compare32(uint32_t* a, uint32_t* b); - -// Comparing uint64_t digits of two field elements, a=b? : (0) equal, (1) unequal -int fpcompare64(uint64_t* a, uint64_t* b); - -// Comparing uint64_t digits of two quadratic extension field elements, ai=bi? : (0) equal, (1) unequal -int fp2compare64(uint64_t* a, uint64_t* b); - -// Generating a pseudo-random scalar value in [0, 2^256-1] -void random_scalar_test(uint64_t* a); - -// Generating a vectorized pseudo-random GF(p) element in [0, 2^127-1] -void vrandom1271_test(velm_t a); - -// Generating a pseudo-random GF(p^2) element a+b*i, where a,b in [0, 2^127-1] -void fp2random1271_test(f2elm_t a); - -// Generating a vectorized pseudo-random GF(p^2) element a+b*i, where a,b in [0, 2^127-1] -void v2random1271_test(v2elm_t a); - -// Verification of the mLSB-set's recoding algorithm used in fixed-base scalar multiplication -bool verify_mLSB_recoding(uint64_t* scalar, int* digits); - - -#ifdef __cplusplus -} -#endif - - -#endif \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/AMD64/consts.c b/ffi-deps/FourQlib/FourQ_64bit_and_portable/AMD64/consts.c deleted file mode 100644 index 531dfa7..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/AMD64/consts.c +++ /dev/null @@ -1,15 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. 
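/* [Editorial sketch -- not part of the original FourQlib sources or of this patch]
 * The assembly below (fp2mul1271_a) multiplies in GF((2^127-1)^2) schoolbook-style:
 * c0 = a0*b0 - a1*b1 and c1 = a0*b1 + a1*b0, with reduction done by folding the bits above
 * position 127 back into the low part (2^127 == 1 mod p). The same structure over the smaller
 * Mersenne prime 2^61-1, chosen here only so that products fit in 128 bits, in plain C
 * (requires the GCC/Clang 128-bit integer extension):
 */
#include <stdint.h>

typedef unsigned __int128 uint128;
#define P61 ((UINT64_C(1) << 61) - 1)

static uint64_t fpmul61(uint64_t a, uint64_t b)
{   /* a*b mod 2^61-1, inputs fully reduced */
    uint128 t = (uint128)a * b;
    uint64_t r = (uint64_t)(t & P61) + (uint64_t)(t >> 61);   /* fold: 2^61 == 1 (mod p) */
    return (r >= P61) ? r - P61 : r;
}

static uint64_t fpadd61(uint64_t a, uint64_t b)
{
    uint64_t r = a + b;
    r = (r & P61) + (r >> 61);
    return (r >= P61) ? r - P61 : r;
}

static uint64_t fpsub61(uint64_t a, uint64_t b)
{
    return (a >= b) ? a - b : a + P61 - b;
}

static void fp2mul61(const uint64_t a[2], const uint64_t b[2], uint64_t c[2])
{   /* (a0 + a1*i)*(b0 + b1*i) with i^2 = -1, mirroring the T0/T1 schoolbook layout of the assembly */
    uint64_t t0 = fpmul61(a[0], b[0]);
    uint64_t t1 = fpmul61(a[1], b[1]);
    c[0] = fpsub61(t0, t1);                                    /* c0 = a0*b0 - a1*b1 */
    c[1] = fpadd61(fpmul61(a[0], b[1]), fpmul61(a[1], b[0]));  /* c1 = a0*b1 + a1*b0 */
}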
-* -* Abstract: constants for assembly implementation -************************************************************************************/ - -#include - - -uint32_t ONEx8[8] = {1,1,1,1,1,1,1,1}; -uint32_t TWOx8[8] = {2,2,2,2,2,2,2,2}; -uint64_t PRIME1271[4] = {0xFFFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF}; - diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/AMD64/fp2_1271.S b/ffi-deps/FourQlib/FourQ_64bit_and_portable/AMD64/fp2_1271.S deleted file mode 100644 index 454fbea..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/AMD64/fp2_1271.S +++ /dev/null @@ -1,354 +0,0 @@ -//*********************************************************************************** -// FourQlib: a high-performance crypto library based on the elliptic curve FourQ -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// Abstract: arithmetic over GF(p^2) using x64 assembly for Linux -//*********************************************************************************** - -.intel_syntax noprefix - -// Registers that are used for parameter passing: -#define reg_p1 rdi -#define reg_p2 rsi -#define reg_p3 rdx -#define reg_p4 rcx - - -.text -//************************************************************************** -// Quadratic extension field multiplication using lazy reduction -// Based on schoolbook method -// Operation: c [reg_p3] = a [reg_p1] * b [reg_p2] in GF(p^2), p = 2^127-1 -// NOTE: only a=c is allowed for fp2mul1271_a(a, b, c) -//************************************************************************** -.global fp2mul1271_a -fp2mul1271_a: - push r15 -#if defined(PUSH_SET) - push r12 - push r14 - push r13 -#endif - mov rcx, reg_p3 - - // T0 = a0 * b0, (r11, r10, r9, r8) <- [reg_p1_0-8] * [reg_p2_0-8] - mov rax, [reg_p1] - mov r11, [reg_p2] - mul r11 -#if !defined(PUSH_SET) - push r12 -#endif - xor r10, r10 - mov r8, rax - mov r9, rdx - - mov r12, [reg_p2+8] - mov rax, [reg_p1] - mul r12 - add r9, rax -#if !defined(PUSH_SET) - push r14 -#endif - adc r10, rdx - - mov rax, [reg_p1+8] - mul r11 - add r9, rax -#if !defined(PUSH_SET) - push r13 -#endif - adc r10, rdx - - mov rax, [reg_p1+8] - mul r12 - add r10, rax - mov r11, 0 - adc r11, rdx - - // T1 = a1 * b1, (r15, r14, r13, r12) <- [reg_p1_16-24] * [reg_p2_16-24] - xor r14, r14 - mov rax, [reg_p1+16] - mov r15, [reg_p2+16] - mul r15 - mov r12, rax - mov rax, [reg_p2+24] - mov r13, rdx - - mov rdx, [reg_p1+16] - mul rdx - add r13, rax - mov rax, [reg_p1+24] - adc r14, rdx - - mul r15 - add r13, rax - adc r14, rdx - - mov r15, [reg_p2+24] - mov rax, [reg_p1+24] - mul r15 - mov r15, 0 - add r14, rax - adc r15, rdx - - // c0 = T0 - T1 = a0*b0 - a1*b1 - xor rax, rax - sub r8, r12 - sbb r9, r13 - sbb r10, r14 - sbb r11, r15 - adc rax, 0 - - shld r11, r10, 1 - shld r10, r9, 1 - mov r15, [reg_p2+16] - mov rax, [reg_p1] - btr r9, 63 - - // T0 = a0 * b1, (r15, r14, r13, r12) <- [reg_p1_0-8] * [reg_p2_16-24] - mul r15 - btr r11, 63 // Add prime if borrow=1 - sbb r10, 0 - sbb r11, 0 - xor r14, r14 - mov r12, rax - mov rax, [reg_p2+24] - mov r13, rdx - - mov rdx, [reg_p1] - mul rdx - add r13, rax - mov rax, [reg_p1+8] - adc r14, rdx - - mul r15 - xor r15, r15 - add r13, rax - mov rax, [reg_p1+8] - adc r14, rdx - - mul qword ptr [reg_p2+24] - add r8, r10 - adc r9, r11 - add r14, rax - adc r15, rdx - - // Reducing and storing c0 - btr r9, 63 - adc r8, 0 - mov r11, [reg_p2] - adc r9, 0 - - // T1 = a1 * b0, (r12, r11, r10, r9) <- [reg_p1_16-24] * [reg_p2_0-8] - mov rax, [reg_p1+16] - mul r11 - mov [rcx], r8 - 
mov [rcx+8], r9 - mov r8, rax - mov r9, rdx - - mov rax, [reg_p1+16] - mov rsi, [reg_p2+8] - mul rsi - xor r10, r10 - add r9, rax - adc r10, rdx - - mov rax, [reg_p1+24] - mul r11 - add r9, rax - adc r10, rdx - - xor r11, r11 - mov rax, [reg_p1+24] - mul rsi - add r10, rax - adc r11, rdx - - // c1 = T0 + T1 = a0*b1 + a1*b0 - add r8, r12 - adc r9, r13 - pop r13 - adc r10, r14 - pop r14 - pop r12 - adc r11, r15 - pop r15 - - // Reducing and storing c1 - shld r11, r10, 1 - shld r10, r9, 1 - btr r9, 63 - btr r11, 63 - adc r8, r10 - adc r9, r11 - btr r9, 63 - adc r8, 0 - adc r9, 0 - mov [rcx+16], r8 - mov [rcx+24], r9 - ret - - -//*********************************************************************** -// Quadratic extension field squaring -// Operation: c [reg_p2] = a^2 [reg_p1] in GF(p^2), p = 2^127-1 -// NOTE: a=c is not allowed for fp2sqr1271_a(a, c) -//*********************************************************************** -.global fp2sqr1271_a -fp2sqr1271_a: - push r14 - - // t0 = (r9, r8) = a0 + a1, (rcx, r14) <- a1 - mov r8, [reg_p1] - mov r14, [reg_p1+16] - add r8, r14 - mov r9, [reg_p1+8] - mov rcx, [reg_p1+24] - adc r9, rcx - - btr r9, 63 - push r12 - adc r8, 0 - adc r9, 0 - - // t1 = (r11, r10) = a0 - a1 - mov r10, [reg_p1] - sub r10, r14 - mov r11, [reg_p1+8] - sbb r11, rcx - - btr r11, 63 - sbb r10, 0 - push r13 - sbb r11, 0 - - // c0 = t0 * t1 = (a0 + a1)*(a0 - a1), (rcx, r14, r13, r12) <- (r9, r8) * (r11, r10) - xor r14, r14 - mov rax, r8 - mul r10 - mov r12, rax - mov rax, r11 - mov r13, rdx - - mul r8 - xor rcx, rcx - add r13, rax - adc r14, rdx - - mov rax, r9 - mul r10 - mov r8, [reg_p1] - add r13, rax - adc r14, rdx - - mov rax, r9 - mul r11 - mov r9, [reg_p1+8] - add r14, rax - adc rcx, rdx - - // t2 = (r9, r8) = 2*a0 - add r8, r8 - adc r9, r9 - - btr r9, 63 - adc r8, 0 - adc r9, 0 - - // Reducing and storing c0 - shld rcx, r14, 1 - shld r14, r13, 1 - btr r13, 63 - add r12, r14 - adc r13, rcx - btr r13, 63 - adc r12, 0 - adc r13, 0 - mov [reg_p2], r12 - mov [reg_p2+8], r13 - - // c1 = 2a0 * a1, (rcx, r14, r11, r10) <- (r9, r8) * [reg_p1_16-24] - mov rcx, [reg_p1+16] - mov rax, r8 - mul rcx - mov r10, rax - mov r11, rdx - - mov rax, [reg_p1+24] - xor r14, r14 - mul r8 - add r11, rax - adc r14, rdx - - mov rax, rcx - mul r9 - add r11, rax - adc r14, rdx - - mov rax, [reg_p1+24] - mul r9 - xor rcx, rcx - add r14, rax - pop r13 - adc rcx, rdx - - // Reducing and storing c1 - shld rcx, r14, 1 - shld r14, r11, 1 - btr r11, 63 - add r10, r14 - pop r12 - adc r11, rcx - btr r11, 63 - adc r10, 0 - pop r14 - adc r11, 0 - mov [reg_p2+16], r10 - mov [reg_p2+24], r11 - ret - - -//*************************************************************************** -// Quadratic extension field addition/subtraction -// Operation: c [reg_p3] = 2*a [reg_p1] - b [reg_p2] in GF(p^2), p = 2^127-1 -//*************************************************************************** -.global fp2addsub1271_a -fp2addsub1271_a: - mov r8, [reg_p1] - mov r9, [reg_p1+8] - add r8, r8 - adc r9, r9 - btr r9, 63 - adc r8, 0 - adc r9, 0 - - mov r10, [reg_p2] - sub r8, r10 - mov r10, [reg_p2+8] - sbb r9, r10 - btr r9, 63 - sbb r8, 0 - mov [reg_p3], r8 - sbb r9, 0 - mov [reg_p3+8], r9 - - mov r8, [reg_p1+16] - mov r9, [reg_p1+24] - add r8, r8 - adc r9, r9 - btr r9, 63 - adc r8, 0 - adc r9, 0 - - mov r10, [reg_p2+16] - sub r8, r10 - mov r10, [reg_p2+24] - sbb r9, r10 - btr r9, 63 - sbb r8, 0 - mov [reg_p3+16], r8 - sbb r9, 0 - mov [reg_p3+24], r9 - ret diff --git 
a/ffi-deps/FourQlib/FourQ_64bit_and_portable/AMD64/fp2_1271_AVX2.S b/ffi-deps/FourQlib/FourQ_64bit_and_portable/AMD64/fp2_1271_AVX2.S deleted file mode 100644 index 8d582a1..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/AMD64/fp2_1271_AVX2.S +++ /dev/null @@ -1,446 +0,0 @@ -//*********************************************************************************** -// FourQlib: a high-performance crypto library based on the elliptic curve FourQ -// -// Copyright (c) Microsoft Corporation. All rights reserved. -// -// Abstract: arithmetic over GF(p^2) using x64 assembly for Linux with AVX2 support -//*********************************************************************************** - -#include "consts.s" - -.intel_syntax noprefix - -// Registers that are used for parameter passing: -#define reg_p1 rdi -#define reg_p2 rsi -#define reg_p3 rdx -#define reg_p4 rcx - - -.text -//************************************************************************** -// Quadratic extension field multiplication using lazy reduction -// Based on schoolbook method -// Operation: c [reg_p3] = a [reg_p1] * b [reg_p2] in GF(p^2), p = 2^127-1 -// NOTE: only a=c is allowed for fp2mul1271_a(a, b, c) -//************************************************************************** -.global fp2mul1271_a -fp2mul1271_a: - mov rcx, reg_p3 - - // T0 = a0 * b0, (r11, r10, r9, r8) <- [reg_p1_0-8] * [reg_p2_0-8] - mov rdx, [reg_p2] - mulx r9, r8, [reg_p1] - mulx rax, r10, [reg_p1+8] - push r15 - push r14 - add r9, r10 - mov rdx, [reg_p2+8] - mulx r11, r10, [reg_p1+8] - push r13 - adc r10, rax - push r12 - mulx rax, rdx, [reg_p1] - adc r11, 0 - add r9, rdx - - // T1 = a1 * b1, (r15, r14, r13, r12) <- [reg_p1_16-24] * [reg_p2_16-24] - mov rdx, [reg_p2+16] - mulx r13, r12, [reg_p1+16] - adc r10, rax - mulx rax, r14, [reg_p1+24] - adc r11, 0 - mov rdx, [reg_p2+24] - add r13, r14 - mulx r15, r14, [reg_p1+24] - adc r14, rax - adc r15, 0 - mulx rax, rdx, [reg_p1+16] - add r13, rdx - adc r14, rax - adc r15, 0 - - // c0 = T0 - T1 = a0*b0 - a1*b1 - xor rax, rax - sub r8, r12 - sbb r9, r13 - sbb r10, r14 - sbb r11, r15 - - shld r11, r10, 1 - shld r10, r9, 1 - mov rdx, [reg_p2+16] - btr r9, 63 - - // T0 = a0 * b1, (r15, r14, r13, r12) <- [reg_p1_0-8] * [reg_p2_16-24] - mulx r13, r12, [reg_p1] - btr r11, 63 // Add prime if borrow=1 - sbb r10, 0 - sbb r11, 0 - mulx rax, r14, [reg_p1+8] - add r13, r14 - mov rdx, [reg_p2+24] - mulx r15, r14, [reg_p1+8] - adc r14, rax - adc r15, 0 - mulx rax, rdx, [reg_p1] - add r13, rdx - adc r14, rax - adc r15, 0 - - // Reducing and storing c0 - add r10, r8 - adc r11, r9 - btr r11, 63 - adc r10, 0 - adc r11, 0 - - // T1 = a1 * b0, (r12, r11, r10, r9) <- [reg_p1_16-24] * [reg_p2_0-8] - mov rdx, [reg_p2] - mulx r9, r8, [reg_p1+16] - mov [rcx], r10 - mulx rax, r10, [reg_p1+24] - mov [rcx+8], r11 - add r9, r10 - mov rdx, [reg_p2+8] - mulx r11, r10, [reg_p1+24] - adc r10, rax - adc r11, 0 - mulx rax, rdx, [reg_p1+16] - add r9, rdx - adc r10, rax - adc r11, 0 - - // c1 = T0 + T1 = a0*b1 + a1*b0 - add r8, r12 - pop r12 - adc r9, r13 - pop r13 - adc r10, r14 - pop r14 - adc r11, r15 - - // Reducing and storing c1 - shld r11, r10, 1 - shld r10, r9, 1 - btr r9, 63 - btr r11, 63 - adc r8, r10 - adc r9, r11 - btr r9, 63 - pop r15 - adc r8, 0 - adc r9, 0 - mov [rcx+16], r8 - mov [rcx+24], r9 - ret - - -//*********************************************************************** -// Quadratic extension field squaring -// Operation: c [reg_p2] = a^2 [reg_p1] in GF(p^2), p = 2^127-1 -// NOTE: a=c is not allowed for 
fp2sqr1271_a(a, c) -//*********************************************************************** -.global fp2sqr1271_a -fp2sqr1271_a: - - // t0 = (r9, r8) = a0 + a1, (rcx, r14) <- a1 - mov r10, [reg_p1] - push r14 - mov r14, [reg_p1+16] - sub r10, r14 - mov r11, [reg_p1+8] - mov rcx, [reg_p1+24] - sbb r11, rcx - - push r13 - btr r11, 63 - push r12 - sbb r10, 0 - - // t1 = (r11, r10) = a0 - a1 - mov rdx, r10 - mov r8, [reg_p1] - add r8, r14 - mov r9, [reg_p1+8] - adc r9, rcx - - // c0 = t0 * t1 = (a0 + a1)*(a0 - a1), (rcx, r14, r13, r12) <- (r9, r8) * (r11, r10) - mulx r13, r12, r8 - sbb r11, 0 - mulx rax, r14, r9 - mov rdx, r11 - add r13, r14 - mulx rcx, r14, r9 - mov r9, [reg_p1+8] - adc r14, rax - adc rcx, 0 - mulx rax, rdx, r8 - mov r8, [reg_p1] - add r13, rdx - adc r14, rax - adc rcx, 0 - - // t2 = (r9, r8) = 2*a0 - add r8, r8 - adc r9, r9 - - // Reducing and storing c0 - shld rcx, r14, 1 - shld r14, r13, 1 - btr r13, 63 - btr rcx, 63 - adc r12, r14 - adc r13, rcx - btr r13, 63 - adc r12, 0 - adc r13, 0 - mov [reg_p2], r12 - mov [reg_p2+8], r13 - - // c1 = 2a0 * a1, (rcx, r14, r11, r10) <- (r9, r8) * [reg_p1_16-24] - mov rdx, [reg_p1+16] - mulx r11, r10, r8 - pop r12 - mulx rax, r14, r9 - pop r13 - add r11, r14 - mov rdx, [reg_p1+24] - mulx rcx, r14, r9 - adc r14, rax - adc rcx, 0 - mulx rax, rdx, r8 - add r11, rdx - adc r14, rax - adc rcx, 0 - - // Reducing and storing c1 - shld rcx, r14, 1 - shld r14, r11, 1 - btr r11, 63 - btr rcx, 63 - adc r10, r14 - adc r11, rcx - btr r11, 63 - pop r14 - adc r10, 0 - adc r11, 0 - mov [reg_p2+16], r10 - mov [reg_p2+24], r11 - ret - - -//*************************************************************************** -// Quadratic extension field addition/subtraction -// Operation: c [reg_p3] = 2*a [reg_p1] - b [reg_p2] in GF(p^2), p = 2^127-1 -//*************************************************************************** -.global fp2addsub1271_a -fp2addsub1271_a: - mov r8, [reg_p1] - mov r9, [reg_p1+8] - add r8, r8 - adc r9, r9 - btr r9, 63 - adc r8, 0 - adc r9, 0 - - mov r10, [reg_p2] - sub r8, r10 - mov r10, [reg_p2+8] - sbb r9, r10 - btr r9, 63 - sbb r8, 0 - mov [reg_p3], r8 - sbb r9, 0 - mov [reg_p3+8], r9 - - mov r8, [reg_p1+16] - mov r9, [reg_p1+24] - add r8, r8 - adc r9, r9 - btr r9, 63 - adc r8, 0 - adc r9, 0 - - mov r10, [reg_p2+16] - sub r8, r10 - mov r10, [reg_p2+24] - sbb r9, r10 - btr r9, 63 - sbb r8, 0 - mov [reg_p3+16], r8 - sbb r9, 0 - mov [reg_p3+24], r9 - ret - - -//*********************************************************************************************** -// Constant-time table lookup to extract a point -// Inputs: sign_mask, digit, table containing 8 points -// Output: P = sign*table[digit], where sign=1 if sign_mask=0xFF...FF and sign=-1 if sign_mask=0 -//*********************************************************************************************** -.global table_lookup_1x8_a -table_lookup_1x8_a: - vpbroadcastd ymm4, DWORD PTR [reg_p3] - vpbroadcastd ymm14, DWORD PTR [reg_p4] - vmovdqu ymm5, [ONEx8+rip] - vmovdqu ymm11, [TWOx8+rip] - vmovdqu ymm0, YMMWORD PTR [reg_p1] - vmovdqu ymm1, YMMWORD PTR [reg_p1+32] - vmovdqu ymm2, YMMWORD PTR [reg_p1+64] - vmovdqu ymm3, YMMWORD PTR [reg_p1+96] - vmovdqu ymm10, ymm4 - -// While digit>=0 mask = 0x00...0 else mask = 0xFF...F -// If mask = 0xFF...F then point = point, else if mask = 0x00...0 then point = temp_point - vpsubd ymm4, ymm4, ymm5 - vpsubd ymm10, ymm10, ymm11 - vmovdqu ymm6, YMMWORD PTR [reg_p1+128] - vmovdqu ymm7, YMMWORD PTR [reg_p1+160] - vmovdqu ymm8, YMMWORD PTR [reg_p1+192] - 
vmovdqu ymm9, YMMWORD PTR [reg_p1+224] - vpsrad ymm15, ymm4, 31 - vpxor ymm0, ymm0, ymm6 - vpxor ymm1, ymm1, ymm7 - vpxor ymm2, ymm2, ymm8 - vpxor ymm3, ymm3, ymm9 - vpand ymm0, ymm0, ymm15 - vpand ymm1, ymm1, ymm15 - vpand ymm2, ymm2, ymm15 - vpand ymm3, ymm3, ymm15 - vpxor ymm0, ymm0, ymm6 - vpxor ymm1, ymm1, ymm7 - vpxor ymm2, ymm2, ymm8 - vpxor ymm3, ymm3, ymm9 - - vmovdqu ymm6, YMMWORD PTR [reg_p1+256] - vmovdqu ymm7, YMMWORD PTR [reg_p1+288] - vmovdqu ymm8, YMMWORD PTR [reg_p1+320] - vmovdqu ymm9, YMMWORD PTR [reg_p1+352] - vpsrad ymm15, ymm10, 31 - vpxor ymm0, ymm0, ymm6 - vpxor ymm1, ymm1, ymm7 - vpxor ymm2, ymm2, ymm8 - vpxor ymm3, ymm3, ymm9 - vpand ymm0, ymm0, ymm15 - vpand ymm1, ymm1, ymm15 - vpand ymm2, ymm2, ymm15 - vpand ymm3, ymm3, ymm15 - vpxor ymm0, ymm0, ymm6 - vpxor ymm1, ymm1, ymm7 - vpxor ymm2, ymm2, ymm8 - vpxor ymm3, ymm3, ymm9 - - vpsubd ymm4, ymm10, ymm5 - vpsubd ymm10, ymm10, ymm11 - vmovdqu ymm6, YMMWORD PTR [reg_p1+384] - vmovdqu ymm7, YMMWORD PTR [reg_p1+416] - vmovdqu ymm8, YMMWORD PTR [reg_p1+448] - vmovdqu ymm9, YMMWORD PTR [reg_p1+480] - vpsrad ymm15, ymm4, 31 - vpxor ymm0, ymm0, ymm6 - vpxor ymm1, ymm1, ymm7 - vpxor ymm2, ymm2, ymm8 - vpxor ymm3, ymm3, ymm9 - vpand ymm0, ymm0, ymm15 - vpand ymm1, ymm1, ymm15 - vpand ymm2, ymm2, ymm15 - vpand ymm3, ymm3, ymm15 - vpxor ymm0, ymm0, ymm6 - vpxor ymm1, ymm1, ymm7 - vpxor ymm2, ymm2, ymm8 - vpxor ymm3, ymm3, ymm9 - - vmovdqu ymm6, YMMWORD PTR [reg_p1+512] - vmovdqu ymm7, YMMWORD PTR [reg_p1+544] - vmovdqu ymm8, YMMWORD PTR [reg_p1+576] - vmovdqu ymm9, YMMWORD PTR [reg_p1+608] - vpsrad ymm15, ymm10, 31 - vpxor ymm0, ymm0, ymm6 - vpxor ymm1, ymm1, ymm7 - vpxor ymm2, ymm2, ymm8 - vpxor ymm3, ymm3, ymm9 - vpand ymm0, ymm0, ymm15 - vpand ymm1, ymm1, ymm15 - vpand ymm2, ymm2, ymm15 - vpand ymm3, ymm3, ymm15 - vpxor ymm0, ymm0, ymm6 - vpxor ymm1, ymm1, ymm7 - vpxor ymm2, ymm2, ymm8 - vpxor ymm3, ymm3, ymm9 - - vpsubd ymm4, ymm10, ymm5 - vpsubd ymm10, ymm10, ymm11 - vmovdqu ymm6, YMMWORD PTR [reg_p1+640] - vmovdqu ymm7, YMMWORD PTR [reg_p1+672] - vmovdqu ymm8, YMMWORD PTR [reg_p1+704] - vmovdqu ymm9, YMMWORD PTR [reg_p1+736] - vpsrad ymm15, ymm4, 31 - vpxor ymm0, ymm0, ymm6 - vpxor ymm1, ymm1, ymm7 - vpxor ymm2, ymm2, ymm8 - vpxor ymm3, ymm3, ymm9 - vpand ymm0, ymm0, ymm15 - vpand ymm1, ymm1, ymm15 - vpand ymm2, ymm2, ymm15 - vpand ymm3, ymm3, ymm15 - vpxor ymm0, ymm0, ymm6 - vpxor ymm1, ymm1, ymm7 - vpxor ymm2, ymm2, ymm8 - vpxor ymm3, ymm3, ymm9 - - vmovdqu ymm6, YMMWORD PTR [reg_p1+768] - vmovdqu ymm7, YMMWORD PTR [reg_p1+800] - vmovdqu ymm8, YMMWORD PTR [reg_p1+832] - vmovdqu ymm9, YMMWORD PTR [reg_p1+864] - vpsrad ymm15, ymm10, 31 - vpxor ymm0, ymm0, ymm6 - vpxor ymm1, ymm1, ymm7 - vpxor ymm2, ymm2, ymm8 - vpxor ymm3, ymm3, ymm9 - vpand ymm0, ymm0, ymm15 - vpand ymm1, ymm1, ymm15 - vpand ymm2, ymm2, ymm15 - vpand ymm3, ymm3, ymm15 - vpxor ymm0, ymm0, ymm6 - vpxor ymm1, ymm1, ymm7 - vpxor ymm2, ymm2, ymm8 - vpxor ymm3, ymm3, ymm9 - - vpsubd ymm4, ymm10, ymm5 - vmovdqu ymm6, YMMWORD PTR [reg_p1+896] - vmovdqu ymm7, YMMWORD PTR [reg_p1+928] - vmovdqu ymm8, YMMWORD PTR [reg_p1+960] - vmovdqu ymm9, YMMWORD PTR [reg_p1+992] - vpsrad ymm15, ymm4, 31 - vpxor ymm0, ymm0, ymm6 - vpxor ymm1, ymm1, ymm7 - vpxor ymm2, ymm2, ymm8 - vpxor ymm3, ymm3, ymm9 - vpand ymm0, ymm0, ymm15 - vpand ymm1, ymm1, ymm15 - vpand ymm2, ymm2, ymm15 - vpand ymm3, ymm3, ymm15 - vpxor ymm0, ymm0, ymm6 - vpxor ymm1, ymm1, ymm7 - vpxor ymm2, ymm2, ymm8 - vpxor ymm3, ymm3, ymm9 - -// point: x+y,y-x,2dt, temp_point: y-x,x+y,-2dt coordinate -// If 
sign_mask = 0 then choose negative of the point - vmovdqu ymm5, [PRIME1271+rip] - vmovdqu ymm6, ymm0 - vpsubq ymm7, ymm5, ymm3 // Negate 2dt coordinate - vpxor ymm10, ymm0, ymm1 - vpand ymm10, ymm10, ymm14 - vpxor ymm0, ymm1, ymm10 - vpxor ymm10, ymm6, ymm1 - vpand ymm10, ymm10, ymm14 - vpxor ymm1, ymm6, ymm10 - vpblendvb ymm3, ymm7, ymm3, ymm14 - - vmovdqu YMMWORD PTR [reg_p2], ymm0 - vmovdqu YMMWORD PTR [reg_p2+32], ymm1 - vmovdqu YMMWORD PTR [reg_p2+64], ymm2 - vmovdqu YMMWORD PTR [reg_p2+96], ymm3 - ret diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/AMD64/fp_x64.h b/ffi-deps/FourQlib/FourQ_64bit_and_portable/AMD64/fp_x64.h deleted file mode 100644 index 9a243c0..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/AMD64/fp_x64.h +++ /dev/null @@ -1,409 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: modular arithmetic and other low-level operations for x64 platforms -************************************************************************************/ - -#ifndef __FP_X64_H__ -#define __FP_X64_H__ - - -// For C++ -#ifdef __cplusplus -extern "C" { -#endif - - -#include "../table_lookup.h" -#include "../FourQ_params.h" - - -#if defined(UINT128_SUPPORT) - const uint128_t prime1271 = ((uint128_t)1 << 127) - 1; -#elif defined(SCALAR_INTRIN_SUPPORT) - const uint128_t prime1271 = {0xFFFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF}; -#endif -#define mask63 0x7FFFFFFFFFFFFFFF - - -void mod1271(felm_t a) -{ // Modular correction, a = a mod (2^127-1) - -#if defined(UINT128_SUPPORT) - uint128_t* r = (uint128_t*)&a[0]; - - *r = *r - prime1271; - *r = *r + (((uint128_t)0 - (*r >> 127)) & prime1271); -#elif defined(SCALAR_INTRIN_SUPPORT) - uint64_t mask; - uint128_t prime; - - prime[0] = prime1271[0]; - prime[1] = prime1271[1]; - - SUB128(a, prime1271, a); - mask = 0 - (a[1] >> 63); - prime[0] &= mask; prime[1] &= mask; - ADD128(a, prime, a); -#endif -} - - -__inline void fpcopy1271(felm_t a, felm_t c) -{ // Copy of a field element, c = a - c[0] = a[0]; - c[1] = a[1]; -} - - -static __inline void fpzero1271(felm_t a) -{ // Zeroing a field element, a = 0 - a[0] = 0; - a[1] = 0; -} - - -__inline void fpadd1271(felm_t a, felm_t b, felm_t c) -{ // Field addition, c = a+b mod (2^127-1) - -#if defined(UINT128_SUPPORT) - uint128_t* r = (uint128_t*)&a[0]; - uint128_t* s = (uint128_t*)&b[0]; - uint128_t* t = (uint128_t*)&c[0]; - - *t = *r + *s; - *t += (*t >> 127); - *t &= prime1271; -#elif defined(SCALAR_INTRIN_SUPPORT) - uint64_t temp; - unsigned char _carry; - - ADD128(a, b, c); - temp = __ull_rshift(c[1], 63); - c[1] &= mask63; - _carry = _addcarry_u64(0, c[0], temp, &c[0]); - _addcarry_u64(_carry, c[1], 0, &c[1]); -#endif -} - - -__inline void fpsub1271(felm_t a, felm_t b, felm_t c) -{ // Field subtraction, c = a-b mod (2^127-1) - -#if defined(UINT128_SUPPORT) - uint128_t* r = (uint128_t*)&a[0]; - uint128_t* s = (uint128_t*)&b[0]; - uint128_t* t = (uint128_t*)&c[0]; - - *t = *r - *s; - *t -= (*t >> 127); - *t &= prime1271; -#elif defined(SCALAR_INTRIN_SUPPORT) - uint64_t temp; - unsigned char _borrow; - - SUB128(a, b, c); - temp = __ull_rshift(c[1], 63); - c[1] &= mask63; - _borrow = _subborrow_u64(0, c[0], temp, &c[0]); - _subborrow_u64(_borrow, c[1], 0, &c[1]); -#endif -} - - -void fpneg1271(felm_t a) -{ // Field negation, a = -a mod (2^127-1) - -#if defined(UINT128_SUPPORT) - uint128_t* r = 
(uint128_t*)&a[0]; - - *r = prime1271 - *r; -#elif defined(SCALAR_INTRIN_SUPPORT) - SUB128(prime1271, a, a); -#endif -} - - -__inline void fpmul1271(felm_t a, felm_t b, felm_t c) -{ // Field multiplication, c = a*b mod (2^127-1) - uint128_t tt1, tt2, tt3 = {0}; - -#if defined(UINT128_SUPPORT) - tt1 = (uint128_t)a[0]*b[0]; - tt2 = (uint128_t)a[0]*b[1] + (uint128_t)a[1]*b[0] + (uint64_t)(tt1 >> 64); - tt3 = (uint128_t)a[1]*(b[1]*2) + ((uint128_t)tt2 >> 63); - tt1 = (uint64_t)tt1 | ((uint128_t)((uint64_t)tt2 & mask63) << 64); - tt1 += tt3; - tt1 = (tt1 >> 127) + (tt1 & prime1271); - c[0] = (uint64_t)tt1; - c[1] = (uint64_t)(tt1 >> 64); -#elif defined(SCALAR_INTRIN_SUPPORT) - uint128_t tt4; - - MUL128(a[0], b[0], tt1); - tt3[0] = tt1[1]; - MUL128(a[0], b[1], tt2); ADD128(tt2, tt3, tt2); - MUL128(a[1], b[0], tt3); ADD128(tt2, tt3, tt2); - MUL128(a[1], b[1], tt3); - SHIFTR128(tt2, 63, tt4); - SHIFTL128(tt3, 1, tt3); - ADD128(tt4, tt3, tt3); - tt1[1] = tt2[0] & mask63; - ADD128(tt1, tt3, tt1); - tt3[1] = 0; tt3[0] = __ull_rshift(tt1[1], 63); - tt1[1] &= mask63; - ADD128(tt1, tt3, c); -#endif -} - - -void fpsqr1271(felm_t a, felm_t c) -{ // Field squaring, c = a^2 mod (2^127-1) - uint128_t tt1, tt2, tt3 = {0}; - -#if defined(UINT128_SUPPORT) - tt1 = (uint128_t)a[0]*a[0]; - tt2 = (uint128_t)a[0]*(a[1]*2) + (uint64_t)(tt1 >> 64); - tt3 = (uint128_t)a[1]*(a[1]*2) + ((uint128_t)tt2 >> 63); - tt1 = (uint64_t)tt1 | ((uint128_t)((uint64_t)tt2 & mask63) << 64); - tt1 += tt3; - tt1 = (tt1 >> 127) + (tt1 & prime1271); - c[0] = (uint64_t)tt1; - c[1] = (uint64_t)(tt1 >> 64); -#elif defined(SCALAR_INTRIN_SUPPORT) - uint128_t tt4; - - MUL128(a[0], a[0], tt1); - tt3[0] = tt1[1]; - MUL128(a[0], a[1], tt2); ADD128(tt2, tt3, tt3); ADD128(tt2, tt3, tt2); - MUL128(a[1], a[1], tt3); - SHIFTR128(tt2, 63, tt4); - SHIFTL128(tt3, 1, tt3); - ADD128(tt4, tt3, tt3); - tt1[1] = tt2[0] & mask63; - ADD128(tt1, tt3, tt1); - tt3[1] = 0; tt3[0] = __ull_rshift(tt1[1], 63); - tt1[1] &= mask63; - ADD128(tt1, tt3, c); -#endif -} - - -__inline void fpexp1251(felm_t a, felm_t af) -{ // Exponentiation over GF(p), af = a^(125-1) - int i; - felm_t t1, t2, t3, t4, t5; - - fpsqr1271(a, t2); - fpmul1271(a, t2, t2); - fpsqr1271(t2, t3); - fpsqr1271(t3, t3); - fpmul1271(t2, t3, t3); - fpsqr1271(t3, t4); - fpsqr1271(t4, t4); - fpsqr1271(t4, t4); - fpsqr1271(t4, t4); - fpmul1271(t3, t4, t4); - fpsqr1271(t4, t5); - for (i=0; i<7; i++) fpsqr1271(t5, t5); - fpmul1271(t4, t5, t5); - fpsqr1271(t5, t2); - for (i=0; i<15; i++) fpsqr1271(t2, t2); - fpmul1271(t5, t2, t2); - fpsqr1271(t2, t1); - for (i=0; i<31; i++) fpsqr1271(t1, t1); - fpmul1271(t2, t1, t1); - for (i=0; i<32; i++) fpsqr1271(t1, t1); - fpmul1271(t1, t2, t1); - for (i=0; i<16; i++) fpsqr1271(t1, t1); - fpmul1271(t5, t1, t1); - for (i=0; i<8; i++) fpsqr1271(t1, t1); - fpmul1271(t4, t1, t1); - for (i=0; i<4; i++) fpsqr1271(t1, t1); - fpmul1271(t3, t1, t1); - fpsqr1271(t1, t1); - fpmul1271(a, t1, af); -} - - -void fpinv1271(felm_t a) -{ // Field inversion, af = a^-1 = a^(p-2) mod p - // Hardcoded for p = 2^127-1 - felm_t t; - - fpexp1251(a, t); - fpsqr1271(t, t); - fpsqr1271(t, t); - fpmul1271(a, t, a); -} - - -static __inline void multiply(const digit_t* a, const digit_t* b, digit_t* c) -{ // Schoolbook multiprecision multiply, c = a*b - unsigned int i, j; - digit_t u, v, UV[2]; - unsigned char carry = 0; - - for (i = 0; i < (2*NWORDS_ORDER); i++) c[i] = 0; - - for (i = 0; i < NWORDS_ORDER; i++) { - u = 0; - for (j = 0; j < NWORDS_ORDER; j++) { - MUL(a[i], b[j], UV+1, UV[0]); - ADDC(0, UV[0], 
u, carry, v); - u = UV[1] + carry; - ADDC(0, c[i+j], v, carry, v); - u = u + carry; - c[i+j] = v; - } - c[NWORDS_ORDER+i] = u; - } -} - - -static __inline unsigned char add(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) -{ // Multiprecision addition, c = a+b. Returns the carry bit - unsigned int i; - unsigned char carry = 0; - - for (i = 0; i < nwords; i++) { - ADDC(carry, a[i], b[i], carry, c[i]); - } - - return carry; -} - - -unsigned char subtract(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) -{ // Multiprecision subtraction, c = a-b. Returns the borrow bit - unsigned int i; - unsigned char borrow = 0; - - for (i = 0; i < nwords; i++) { - SUBC(borrow, a[i], b[i], borrow, c[i]); - } - - return borrow; -} - - -void subtract_mod_order(const digit_t* a, const digit_t* b, digit_t* c) -{ // Subtraction modulo the curve order, c = a-b mod order - digit_t mask, carry = 0; - digit_t* order = (digit_t*)curve_order; - unsigned int i, bout; - - bout = subtract(a, b, c, NWORDS_ORDER); // (bout, c) = a - b - mask = 0 - (digit_t)bout; // if bout = 0 then mask = 0x00..0, else if bout = 1 then mask = 0xFF..F - - for (i = 0; i < NWORDS_ORDER; i++) { // c = c + (mask & order) - ADDC(carry, c[i], mask & order[i], carry, c[i]); - } -} - - -void add_mod_order(const digit_t* a, const digit_t* b, digit_t* c) -{ // Addition modulo the curve order, c = a+b mod order - - add(a, b, c, NWORDS_ORDER); // c = a + b - subtract_mod_order(c, (digit_t*)&curve_order, c); // if c >= order then c = c - order -} - - -void Montgomery_multiply_mod_order(const digit_t* ma, const digit_t* mb, digit_t* mc) -{ // 256-bit Montgomery multiplication modulo the curve order, mc = ma*mb*r' mod order, where ma,mb,mc in [0, order-1] - // ma, mb and mc are assumed to be in Montgomery representation - // The Montgomery constant r' = -r^(-1) mod 2^(log_2(r)) is the global value "Montgomery_rprime", where r is the order - unsigned int i; - digit_t mask, P[2*NWORDS_ORDER], Q[2*NWORDS_ORDER], temp[2*NWORDS_ORDER]; - digit_t* order = (digit_t*)curve_order; - unsigned char cout = 0, bout = 0; - - multiply(ma, mb, P); // P = ma * mb - multiply(P, (digit_t*)&Montgomery_rprime, Q); // Q = P * r' mod 2^(log_2(r)) - multiply(Q, (digit_t*)&curve_order, temp); // temp = Q * r - cout = add(P, temp, temp, 2*NWORDS_ORDER); // (cout, temp) = P + Q * r - - for (i = 0; i < NWORDS_ORDER; i++) { // (cout, mc) = (P + Q * r)/2^(log_2(r)) - mc[i] = temp[NWORDS_ORDER + i]; - } - - // Final, constant-time subtraction - bout = subtract(mc, (digit_t*)&curve_order, mc, NWORDS_ORDER); // (cout, mc) = (cout, mc) - r - mask = (digit_t)(cout - bout); // if (cout, mc) >= 0 then mask = 0x00..0, else if (cout, mc) < 0 then mask = 0xFF..F - - for (i = 0; i < NWORDS_ORDER; i++) { // temp = mask & r - temp[i] = (order[i] & mask); - } - add(mc, temp, mc, NWORDS_ORDER); // mc = mc + (mask & r) - - return; -} - - -void modulo_order(digit_t* a, digit_t* c) -{ // Reduction modulo the order using Montgomery arithmetic - // ma = a*Montgomery_Rprime mod r, where a,ma in [0, r-1], a,ma,r < 2^256 - // c = ma*1*Montgomery_Rprime^(-1) mod r, where ma,c in [0, r-1], ma,c,r < 2^256 - digit_t ma[NWORDS_ORDER], one[NWORDS_ORDER] = {0}; - - one[0] = 1; - Montgomery_multiply_mod_order(a, (digit_t*)&Montgomery_Rprime, ma); - Montgomery_multiply_mod_order(ma, one, c); -} - - -void conversion_to_odd(digit_t* k, digit_t* k_odd) -{// Convert scalar to odd if even using the prime subgroup order r - digit_t i, mask; - digit_t* order = 
(digit_t*)curve_order; - unsigned char carry = 0; - - mask = ~(0 - (k[0] & 1)); - - for (i = 0; i < NWORDS_ORDER; i++) { // If (k is odd) then k_odd = k else k_odd = k + r - ADDC(carry, order[i] & mask, k[i], carry, k_odd[i]); - } -} - - -void fpdiv1271(felm_t a) -{ // Field division by two, c = a/2 mod p - digit_t mask, temp[2]; - unsigned char carry; - - mask = (0 - (1 & a[0])); - ADDC(0, a[0], mask, carry, temp[0]); - ADDC(carry, a[1], (mask >> 1), carry, temp[1]); - SHIFTR(temp[1], temp[0], 1, a[0], RADIX); - a[1] = (temp[1] >> 1); -} - - -void fp2div1271(f2elm_t a) -{ // GF(p^2) division by two c = a/2 mod p - digit_t mask, temp[2]; - unsigned char carry; - - mask = (0 - (1 & a[0][0])); - ADDC(0, a[0][0], mask, carry, temp[0]); - ADDC(carry, a[0][1], (mask >> 1), carry, temp[1]); - SHIFTR(temp[1], temp[0], 1, a[0][0], RADIX); - a[0][1] = (temp[1] >> 1); - - mask = (0 - (1 & a[1][0])); - ADDC(0, a[1][0], mask, carry, temp[0]); - ADDC(carry, a[1][1], (mask >> 1), carry, temp[1]); - SHIFTR(temp[1], temp[0], 1, a[1][0], RADIX); - a[1][1] = (temp[1] >> 1); -} - - -#ifdef __cplusplus -} -#endif - - -#endif diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/ARM64/fp_arm64.h b/ffi-deps/FourQlib/FourQ_64bit_and_portable/ARM64/fp_arm64.h deleted file mode 100644 index a14c16e..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/ARM64/fp_arm64.h +++ /dev/null @@ -1,327 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: modular arithmetic and other low-level operations for x64 platforms -************************************************************************************/ - -#ifndef __FP_ARM64_H__ -#define __FP_ARM64_H__ - - -// For C++ -#ifdef __cplusplus -extern "C" { -#endif - - -#include "../table_lookup.h" -#include "../FourQ_params.h" - - -const uint128_t prime1271 = ((uint128_t)1 << 127) - 1; -#define mask63 0x7FFFFFFFFFFFFFFF - - -void mod1271(felm_t a) -{ // Modular correction, a = a mod (2^127-1) - uint128_t* r = (uint128_t*)&a[0]; - - *r = *r - prime1271; - *r = *r + (((uint128_t)0 - (*r >> 127)) & prime1271); -} - - -__inline void fpcopy1271(felm_t a, felm_t c) -{ // Copy of a field element, c = a - c[0] = a[0]; - c[1] = a[1]; -} - - -static __inline void fpzero1271(felm_t a) -{ // Zeroing a field element, a = 0 - a[0] = 0; - a[1] = 0; -} - - -__inline void fpadd1271(felm_t a, felm_t b, felm_t c) -{ // Field addition, c = a+b mod (2^127-1) - uint128_t* r = (uint128_t*)&a[0]; - uint128_t* s = (uint128_t*)&b[0]; - uint128_t* t = (uint128_t*)&c[0]; - - *t = *r + *s; - *t += (*t >> 127); - *t &= prime1271; -} - - -__inline void fpsub1271(felm_t a, felm_t b, felm_t c) -{ // Field subtraction, c = a-b mod (2^127-1) - uint128_t* r = (uint128_t*)&a[0]; - uint128_t* s = (uint128_t*)&b[0]; - uint128_t* t = (uint128_t*)&c[0]; - - *t = *r - *s; - *t -= (*t >> 127); - *t &= prime1271; -} - - -void fpneg1271(felm_t a) -{ // Field negation, a = -a mod (2^127-1) - uint128_t* r = (uint128_t*)&a[0]; - - *r = prime1271 - *r; -} - - -__inline void fpmul1271(felm_t a, felm_t b, felm_t c) -{ // Field multiplication, c = a*b mod (2^127-1) - uint128_t tt1, tt2, tt3 = {0}; - - tt1 = (uint128_t)a[0]*b[0]; - tt2 = (uint128_t)a[0]*b[1] + (uint128_t)a[1]*b[0] + (uint64_t)(tt1 >> 64); - tt3 = (uint128_t)a[1]*(b[1]*2) + ((uint128_t)tt2 >> 63); - tt1 = (uint64_t)tt1 | ((uint128_t)((uint64_t)tt2 & mask63) 
<< 64); - tt1 += tt3; - tt1 = (tt1 >> 127) + (tt1 & prime1271); - c[0] = (uint64_t)tt1; - c[1] = (uint64_t)(tt1 >> 64); -} - - -void fpsqr1271(felm_t a, felm_t c) -{ // Field squaring, c = a^2 mod (2^127-1) - uint128_t tt1, tt2, tt3 = {0}; - - tt1 = (uint128_t)a[0]*a[0]; - tt2 = (uint128_t)a[0]*(a[1]*2) + (uint64_t)(tt1 >> 64); - tt3 = (uint128_t)a[1]*(a[1]*2) + ((uint128_t)tt2 >> 63); - tt1 = (uint64_t)tt1 | ((uint128_t)((uint64_t)tt2 & mask63) << 64); - tt1 += tt3; - tt1 = (tt1 >> 127) + (tt1 & prime1271); - c[0] = (uint64_t)tt1; - c[1] = (uint64_t)(tt1 >> 64); -} - - -__inline void fpexp1251(felm_t a, felm_t af) -{ // Exponentiation over GF(p), af = a^(125-1) - int i; - felm_t t1, t2, t3, t4, t5; - - fpsqr1271(a, t2); - fpmul1271(a, t2, t2); - fpsqr1271(t2, t3); - fpsqr1271(t3, t3); - fpmul1271(t2, t3, t3); - fpsqr1271(t3, t4); - fpsqr1271(t4, t4); - fpsqr1271(t4, t4); - fpsqr1271(t4, t4); - fpmul1271(t3, t4, t4); - fpsqr1271(t4, t5); - for (i=0; i<7; i++) fpsqr1271(t5, t5); - fpmul1271(t4, t5, t5); - fpsqr1271(t5, t2); - for (i=0; i<15; i++) fpsqr1271(t2, t2); - fpmul1271(t5, t2, t2); - fpsqr1271(t2, t1); - for (i=0; i<31; i++) fpsqr1271(t1, t1); - fpmul1271(t2, t1, t1); - for (i=0; i<32; i++) fpsqr1271(t1, t1); - fpmul1271(t1, t2, t1); - for (i=0; i<16; i++) fpsqr1271(t1, t1); - fpmul1271(t5, t1, t1); - for (i=0; i<8; i++) fpsqr1271(t1, t1); - fpmul1271(t4, t1, t1); - for (i=0; i<4; i++) fpsqr1271(t1, t1); - fpmul1271(t3, t1, t1); - fpsqr1271(t1, t1); - fpmul1271(a, t1, af); -} - - -void fpinv1271(felm_t a) -{ // Field inversion, af = a^-1 = a^(p-2) mod p - // Hardcoded for p = 2^127-1 - felm_t t; - - fpexp1251(a, t); - fpsqr1271(t, t); - fpsqr1271(t, t); - fpmul1271(a, t, a); -} - - -static __inline void multiply(const digit_t* a, const digit_t* b, digit_t* c) -{ // Schoolbook multiprecision multiply, c = a*b - unsigned int i, j; - digit_t u, v, UV[2]; - unsigned char carry = 0; - - for (i = 0; i < (2*NWORDS_ORDER); i++) c[i] = 0; - - for (i = 0; i < NWORDS_ORDER; i++) { - u = 0; - for (j = 0; j < NWORDS_ORDER; j++) { - MUL(a[i], b[j], UV+1, UV[0]); - ADDC(0, UV[0], u, carry, v); - u = UV[1] + carry; - ADDC(0, c[i+j], v, carry, v); - u = u + carry; - c[i+j] = v; - } - c[NWORDS_ORDER+i] = u; - } -} - - -static __inline unsigned char add(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) -{ // Multiprecision addition, c = a+b. Returns the carry bit - unsigned int i; - unsigned char carry = 0; - - for (i = 0; i < nwords; i++) { - ADDC(carry, a[i], b[i], carry, c[i]); - } - - return carry; -} - - -unsigned char subtract(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) -{ // Multiprecision subtraction, c = a-b. 
Returns the borrow bit - unsigned int i; - unsigned char borrow = 0; - - for (i = 0; i < nwords; i++) { - SUBC(borrow, a[i], b[i], borrow, c[i]); - } - - return borrow; -} - - -void subtract_mod_order(const digit_t* a, const digit_t* b, digit_t* c) -{ // Subtraction modulo the curve order, c = a-b mod order - digit_t mask, carry = 0; - digit_t* order = (digit_t*)curve_order; - unsigned int i, bout; - - bout = subtract(a, b, c, NWORDS_ORDER); // (bout, c) = a - b - mask = 0 - (digit_t)bout; // if bout = 0 then mask = 0x00..0, else if bout = 1 then mask = 0xFF..F - - for (i = 0; i < NWORDS_ORDER; i++) { // c = c + (mask & order) - ADDC(carry, c[i], mask & order[i], carry, c[i]); - } -} - - -void add_mod_order(const digit_t* a, const digit_t* b, digit_t* c) -{ // Addition modulo the curve order, c = a+b mod order - - add(a, b, c, NWORDS_ORDER); // c = a + b - subtract_mod_order(c, (digit_t*)&curve_order, c); // if c >= order then c = c - order -} - - -void Montgomery_multiply_mod_order(const digit_t* ma, const digit_t* mb, digit_t* mc) -{ // 256-bit Montgomery multiplication modulo the curve order, mc = ma*mb*r' mod order, where ma,mb,mc in [0, order-1] - // ma, mb and mc are assumed to be in Montgomery representation - // The Montgomery constant r' = -r^(-1) mod 2^(log_2(r)) is the global value "Montgomery_rprime", where r is the order - unsigned int i; - digit_t mask, P[2*NWORDS_ORDER], Q[2*NWORDS_ORDER], temp[2*NWORDS_ORDER]; - digit_t* order = (digit_t*)curve_order; - unsigned char cout = 0, bout = 0; - - multiply(ma, mb, P); // P = ma * mb - multiply(P, (digit_t*)&Montgomery_rprime, Q); // Q = P * r' mod 2^(log_2(r)) - multiply(Q, (digit_t*)&curve_order, temp); // temp = Q * r - cout = add(P, temp, temp, 2*NWORDS_ORDER); // (cout, temp) = P + Q * r - - for (i = 0; i < NWORDS_ORDER; i++) { // (cout, mc) = (P + Q * r)/2^(log_2(r)) - mc[i] = temp[NWORDS_ORDER + i]; - } - - // Final, constant-time subtraction - bout = subtract(mc, (digit_t*)&curve_order, mc, NWORDS_ORDER); // (cout, mc) = (cout, mc) - r - mask = (digit_t)(cout - bout); // if (cout, mc) >= 0 then mask = 0x00..0, else if (cout, mc) < 0 then mask = 0xFF..F - - for (i = 0; i < NWORDS_ORDER; i++) { // temp = mask & r - temp[i] = (order[i] & mask); - } - add(mc, temp, mc, NWORDS_ORDER); // mc = mc + (mask & r) - - return; -} - - -void modulo_order(digit_t* a, digit_t* c) -{ // Reduction modulo the order using Montgomery arithmetic - // ma = a*Montgomery_Rprime mod r, where a,ma in [0, r-1], a,ma,r < 2^256 - // c = ma*1*Montgomery_Rprime^(-1) mod r, where ma,c in [0, r-1], ma,c,r < 2^256 - digit_t ma[NWORDS_ORDER], one[NWORDS_ORDER] = {0}; - - one[0] = 1; - Montgomery_multiply_mod_order(a, (digit_t*)&Montgomery_Rprime, ma); - Montgomery_multiply_mod_order(ma, one, c); -} - - -void conversion_to_odd(digit_t* k, digit_t* k_odd) -{// Convert scalar to odd if even using the prime subgroup order r - digit_t i, mask; - digit_t* order = (digit_t*)curve_order; - unsigned char carry = 0; - - mask = ~(0 - (k[0] & 1)); - - for (i = 0; i < NWORDS_ORDER; i++) { // If (k is odd) then k_odd = k else k_odd = k + r - ADDC(carry, order[i] & mask, k[i], carry, k_odd[i]); - } -} - - -void fpdiv1271(felm_t a) -{ // Field division by two, c = a/2 mod p - digit_t mask, temp[2]; - unsigned char carry; - - mask = (0 - (1 & a[0])); - ADDC(0, a[0], mask, carry, temp[0]); - ADDC(carry, a[1], (mask >> 1), carry, temp[1]); - SHIFTR(temp[1], temp[0], 1, a[0], RADIX); - a[1] = (temp[1] >> 1); -} - - -void fp2div1271(f2elm_t a) -{ // GF(p^2) division by two c = 
a/2 mod p - digit_t mask, temp[2]; - unsigned char carry; - - mask = (0 - (1 & a[0][0])); - ADDC(0, a[0][0], mask, carry, temp[0]); - ADDC(carry, a[0][1], (mask >> 1), carry, temp[1]); - SHIFTR(temp[1], temp[0], 1, a[0][0], RADIX); - a[0][1] = (temp[1] >> 1); - - mask = (0 - (1 & a[1][0])); - ADDC(0, a[1][0], mask, carry, temp[0]); - ADDC(carry, a[1][1], (mask >> 1), carry, temp[1]); - SHIFTR(temp[1], temp[0], 1, a[1][0], RADIX); - a[1][1] = (temp[1] >> 1); -} - - -#ifdef __cplusplus -} -#endif - - -#endif diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ.h b/ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ.h deleted file mode 100644 index f3a453a..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ.h +++ /dev/null @@ -1,217 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: main header file -* -* This code is based on the paper "FourQ: four-dimensional decompositions on a -* Q-curve over the Mersenne prime" by Craig Costello and Patrick Longa, in Advances -* in Cryptology - ASIACRYPT, 2015. -* Preprint available at http://eprint.iacr.org/2015/565. -************************************************************************************/ - -#ifndef __FOURQ_H__ -#define __FOURQ_H__ - - -// For C++ -#ifdef __cplusplus -extern "C" { -#endif - - -#include -#include -#include - - -// Definition of operating system - -#define OS_WIN 1 -#define OS_LINUX 2 - -#if defined(__WINDOWS__) // Microsoft Windows OS - #define OS_TARGET OS_WIN -#elif defined(__LINUX__) // Linux OS - #define OS_TARGET OS_LINUX -#else - #error -- "Unsupported OS" -#endif - - -// Definition of compiler - -#define COMPILER_VC 1 -#define COMPILER_GCC 2 -#define COMPILER_CLANG 3 - -#if defined(_MSC_VER) // Microsoft Visual C compiler - #define COMPILER COMPILER_VC -#elif defined(__GNUC__) // GNU GCC compiler - #define COMPILER COMPILER_GCC -#elif defined(__clang__) // Clang compiler - #define COMPILER COMPILER_CLANG -#else - #error -- "Unsupported COMPILER" -#endif - - -// Definition of the targeted architecture and basic data types - -#define TARGET_AMD64 1 -#define TARGET_x86 2 -#define TARGET_ARM 3 -#define TARGET_ARM64 4 - -#if defined(_AMD64_) - #define TARGET TARGET_AMD64 - #define RADIX 64 - typedef uint64_t digit_t; // Unsigned 64-bit digit - typedef int64_t sdigit_t; // Signed 64-bit digit - #define NWORDS_FIELD 2 // Number of words of a field element - #define NWORDS_ORDER 4 // Number of words of an element in Z_r -#elif defined(_X86_) - #define TARGET TARGET_x86 - #define RADIX 32 - typedef uint32_t digit_t; // Unsigned 32-bit digit - typedef int32_t sdigit_t; // Signed 32-bit digit - #define NWORDS_FIELD 4 - #define NWORDS_ORDER 8 -#elif defined(_ARM_) - #define TARGET TARGET_ARM - #define RADIX 32 - typedef uint32_t digit_t; // Unsigned 32-bit digit - typedef int32_t sdigit_t; // Signed 32-bit digit - #define NWORDS_FIELD 4 - #define NWORDS_ORDER 8 -#elif defined(_ARM64_) - #define TARGET TARGET_ARM64 - #define RADIX 64 - typedef uint64_t digit_t; // Unsigned 64-bit digit - typedef int64_t sdigit_t; // Signed 64-bit digit - #define NWORDS_FIELD 2 - #define NWORDS_ORDER 4 -#else - #error -- "Unsupported ARCHITECTURE" -#endif - - -// Constants - -#define RADIX64 64 -#define NWORDS64_FIELD 2 // Number of 64-bit words of a field element -#define NWORDS64_ORDER 4 // Number of 64-bit words of an element 
in Z_r - - -// Instruction support - -#define NO_SIMD_SUPPORT 0 -#define AVX_SUPPORT 1 -#define AVX2_SUPPORT 2 - -#if defined(_AVX2_) - #define SIMD_SUPPORT AVX2_SUPPORT // AVX2 support selection -#elif defined(_AVX_) - #define SIMD_SUPPORT AVX_SUPPORT // AVX support selection -#else - #define SIMD_SUPPORT NO_SIMD_SUPPORT -#endif - -#if defined(_ASM_) // Assembly support selection - #define ASM_SUPPORT -#endif - -#if defined(_GENERIC_) // Selection of generic, portable implementation - #define GENERIC_IMPLEMENTATION -#endif - - -// Unsupported configurations - -#if defined(ASM_SUPPORT) && (OS_TARGET == OS_WIN) - #error -- "Assembly is not supported on this platform" -#endif - -#if defined(ASM_SUPPORT) && defined(GENERIC_IMPLEMENTATION) - #error -- "Unsupported configuration" -#endif - -#if (SIMD_SUPPORT != NO_SIMD_SUPPORT) && defined(GENERIC_IMPLEMENTATION) - #error -- "Unsupported configuration" -#endif - -#if (TARGET != TARGET_AMD64 && TARGET != TARGET_ARM64) && !defined(GENERIC_IMPLEMENTATION) - #error -- "Unsupported configuration" -#endif - - -// Definition of complementary cryptographic functions - -#define RandomBytesFunction random_bytes -#define CryptoHashFunction crypto_sha512 // Use SHA-512 by default - - -// Basic parameters for variable-base scalar multiplication (without using endomorphisms) -#define W_VARBASE 5 -#define NBITS_ORDER_PLUS_ONE 246+1 - - -// Basic parameters for fixed-base scalar multiplication -#define W_FIXEDBASE 5 // Memory requirement: 7.5KB (storage for 80 points). -#define V_FIXEDBASE 5 - - -// Basic parameters for double scalar multiplication -#define WP_DOUBLEBASE 8 // Memory requirement: 24KB (storage for 256 points). -#define WQ_DOUBLEBASE 4 - - -// FourQ's basic element definitions and point representations - -typedef digit_t felm_t[NWORDS_FIELD]; // Datatype for representing 128-bit field elements -typedef felm_t f2elm_t[2]; // Datatype for representing quadratic extension field elements - -typedef struct { f2elm_t x; f2elm_t y; } point_affine; // Point representation in affine coordinates. 
-typedef point_affine point_t[1]; - - -// Definitions of the error-handling type and error codes - -typedef enum { - ECCRYPTO_ERROR, // 0x00 - ECCRYPTO_SUCCESS, // 0x01 - ECCRYPTO_ERROR_DURING_TEST, // 0x02 - ECCRYPTO_ERROR_UNKNOWN, // 0x03 - ECCRYPTO_ERROR_NOT_IMPLEMENTED, // 0x04 - ECCRYPTO_ERROR_NO_MEMORY, // 0x05 - ECCRYPTO_ERROR_INVALID_PARAMETER, // 0x06 - ECCRYPTO_ERROR_SHARED_KEY, // 0x07 - ECCRYPTO_ERROR_SIGNATURE_VERIFICATION, // 0x08 - ECCRYPTO_ERROR_HASH_TO_CURVE, // 0x09 - ECCRYPTO_ERROR_END_OF_LIST -} ECCRYPTO_STATUS; - -#define ECCRYPTO_STATUS_TYPE_SIZE (ECCRYPTO_ERROR_END_OF_LIST) - - -// Error message definitions - -#define ECCRYPTO_MSG_ERROR "ECCRYPTO_ERROR" -#define ECCRYPTO_MSG_SUCCESS "ECCRYPTO_SUCCESS" -#define ECCRYPTO_MSG_ERROR_DURING_TEST "ECCRYPTO_ERROR_DURING_TEST" -#define ECCRYPTO_MSG_ERROR_UNKNOWN "ECCRYPTO_ERROR_UNKNOWN" -#define ECCRYPTO_MSG_ERROR_NOT_IMPLEMENTED "ECCRYPTO_ERROR_NOT_IMPLEMENTED" -#define ECCRYPTO_MSG_ERROR_NO_MEMORY "ECCRYPTO_ERROR_NO_MEMORY" -#define ECCRYPTO_MSG_ERROR_INVALID_PARAMETER "ECCRYPTO_ERROR_INVALID_PARAMETER" -#define ECCRYPTO_MSG_ERROR_SHARED_KEY "ECCRYPTO_ERROR_SHARED_KEY" -#define ECCRYPTO_MSG_ERROR_SIGNATURE_VERIFICATION "ECCRYPTO_ERROR_SIGNATURE_VERIFICATION" -#define ECCRYPTO_MSG_ERROR_HASH_TO_CURVE "ECCRYPTO_ERROR_HASH_TO_CURVE" - - -#ifdef __cplusplus -} -#endif - - -#endif diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ_api.h b/ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ_api.h deleted file mode 100644 index 3b046a3..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ_api.h +++ /dev/null @@ -1,147 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: API header file -* -* This code is based on the paper "FourQ: four-dimensional decompositions on a -* Q-curve over the Mersenne prime" by Craig Costello and Patrick Longa, in Advances -* in Cryptology - ASIACRYPT, 2015. -* Preprint available at http://eprint.iacr.org/2015/565. 
-************************************************************************************/ - -#ifndef __FOURQ_API_H__ -#define __FOURQ_API_H__ - - -// For C++ -#ifdef __cplusplus -extern "C" { -#endif - - -#include "FourQ.h" - - -/**************** Public ECC API ****************/ - -// Set generator G = (x,y) -void eccset(point_t G); - -// Variable-base scalar multiplication Q = k*P -bool ecc_mul(point_t P, digit_t* k, point_t Q, bool clear_cofactor); - -// Fixed-base scalar multiplication Q = k*G, where G is the generator -bool ecc_mul_fixed(digit_t* k, point_t Q); - -// Double scalar multiplication R = k*G + l*Q, where G is the generator -bool ecc_mul_double(digit_t* k, point_t Q, digit_t* l, point_t R); - - -/************* Public API for arithmetic functions modulo the curve order **************/ - -// Converting to Montgomery representation -void to_Montgomery(const digit_t* ma, digit_t* c); - -// Converting from Montgomery to standard representation -void from_Montgomery(const digit_t* a, digit_t* mc); - -// 256-bit Montgomery multiplication modulo the curve order -void Montgomery_multiply_mod_order(const digit_t* ma, const digit_t* mb, digit_t* mc); - -// (Non-constant time) Montgomery inversion modulo the curve order -void Montgomery_inversion_mod_order(const digit_t* ma, digit_t* mc); - -// Addition modulo the curve order, c = a+b mod order -void add_mod_order(const digit_t* a, const digit_t* b, digit_t* c); - -// Subtraction modulo the curve order, c = a-b mod order -void subtract_mod_order(const digit_t* a, const digit_t* b, digit_t* c); - -// Reduction modulo the order using Montgomery arithmetic internally -void modulo_order(digit_t* a, digit_t* c); - - -/**************** Public API for SchnorrQ ****************/ - -// SchnorrQ public key generation -// It produces a public key PublicKey, which is the encoding of P = s*G, where G is the generator and -// s is the output of hashing SecretKey and taking the least significant 32 bytes of the result. -// Input: 32-byte SecretKey -// Output: 32-byte PublicKey -ECCRYPTO_STATUS SchnorrQ_KeyGeneration(const unsigned char* SecretKey, unsigned char* PublicKey); - -// SchnorrQ keypair generation -// It produces a private key SecretKey and computes the public key PublicKey, which is the encoding of P = s*G, -// where G is the generator and s is the output of hashing SecretKey and taking the least significant 32 bytes of the result. 
-// Outputs: 32-byte SecretKey and 32-byte PublicKey -ECCRYPTO_STATUS SchnorrQ_FullKeyGeneration(unsigned char* SecretKey, unsigned char* PublicKey); - -// SchnorrQ signature generation -// It produces the signature Signature of a message Message of size SizeMessage in bytes -// Inputs: 32-byte SecretKey, 32-byte PublicKey, and Message of size SizeMessage in bytes -// Output: 64-byte Signature -ECCRYPTO_STATUS SchnorrQ_Sign(const unsigned char* SecretKey, const unsigned char* PublicKey, const unsigned char* Message, const unsigned int SizeMessage, unsigned char* Signature); - -// SchnorrQ signature verification -// It verifies the signature Signature of a message Message of size SizeMessage in bytes -// Inputs: 32-byte PublicKey, 64-byte Signature, and Message of size SizeMessage in bytes -// Output: true (valid signature) or false (invalid signature) -ECCRYPTO_STATUS SchnorrQ_Verify(const unsigned char* PublicKey, const unsigned char* Message, const unsigned int SizeMessage, const unsigned char* Signature, unsigned int* valid); - - -/**************** Public API for co-factor ECDH key exchange with compressed, 32-byte public keys ****************/ - -// Compressed public key generation for key exchange -// It produces a public key PublicKey, which is the encoding of P = SecretKey*G (G is the generator). -// Input: 32-byte SecretKey -// Output: 32-byte PublicKey -ECCRYPTO_STATUS CompressedPublicKeyGeneration(const unsigned char* SecretKey, unsigned char* PublicKey); - -// Keypair generation for key exchange. Public key is compressed to 32 bytes -// It produces a private key SecretKey and a public key PublicKey, which is the encoding of P = SecretKey*G (G is the generator). -// Outputs: 32-byte SecretKey and 32-byte PublicKey -ECCRYPTO_STATUS CompressedKeyGeneration(unsigned char* SecretKey, unsigned char* PublicKey); - -// Secret agreement computation for key exchange using a compressed, 32-byte public key -// The output is the y-coordinate of SecretKey*A, where A is the decoding of the public key PublicKey. -// Inputs: 32-byte SecretKey and 32-byte PublicKey -// Output: 32-byte SharedSecret -ECCRYPTO_STATUS CompressedSecretAgreement(const unsigned char* SecretKey, const unsigned char* PublicKey, unsigned char* SharedSecret); - - -/**************** Public API for co-factor ECDH key exchange with uncompressed, 64-byte public keys ****************/ - -// Public key generation for key exchange -// It produces the public key PublicKey = SecretKey*G, where G is the generator. -// Input: 32-byte SecretKey -// Output: 64-byte PublicKey -ECCRYPTO_STATUS PublicKeyGeneration(const unsigned char* SecretKey, unsigned char* PublicKey); - -// Keypair generation for key exchange -// It produces a private key SecretKey and computes the public key PublicKey = SecretKey*G, where G is the generator. -// Outputs: 32-byte SecretKey and 64-byte PublicKey -ECCRYPTO_STATUS KeyGeneration(unsigned char* SecretKey, unsigned char* PublicKey); - -// Secret agreement computation for key exchange -// The output is the y-coordinate of SecretKey*PublicKey. 
-// Inputs: 32-byte SecretKey and 64-byte PublicKey -// Output: 32-byte SharedSecret -ECCRYPTO_STATUS SecretAgreement(const unsigned char* SecretKey, const unsigned char* PublicKey, unsigned char* SharedSecret); - - -/**************** Public API for hashing to curve, 64-byte public keys ****************/ - -// Hash GF(p^2) element to a curve point -// Input: GF(p^2) element -// Output: point in affine coordinates with co-factor cleared -ECCRYPTO_STATUS HashToCurve(f2elm_t r, point_t P); - - -#ifdef __cplusplus -} -#endif - - -#endif diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ_internal.h b/ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ_internal.h deleted file mode 100644 index 30d4387..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ_internal.h +++ /dev/null @@ -1,418 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: internal header file -* -* This code is based on the paper "FourQ: four-dimensional decompositions on a -* Q-curve over the Mersenne prime" by Craig Costello and Patrick Longa, in Advances -* in Cryptology - ASIACRYPT, 2015. -* Preprint available at http://eprint.iacr.org/2015/565. -************************************************************************************/ - -#ifndef __FOURQ_INTERNAL_H__ -#define __FOURQ_INTERNAL_H__ - - -// For C++ -#ifdef __cplusplus -extern "C" { -#endif - - -#include "FourQ_api.h" - - -// Extended datatype support - -#if defined(GENERIC_IMPLEMENTATION) - typedef uint64_t uint128_t[2]; -#elif (TARGET == TARGET_AMD64 && OS_TARGET == OS_LINUX) && (COMPILER == COMPILER_GCC || COMPILER == COMPILER_CLANG) - #define UINT128_SUPPORT - typedef unsigned uint128_t __attribute__((mode(TI))); -#elif (TARGET == TARGET_ARM64 && OS_TARGET == OS_LINUX) && (COMPILER == COMPILER_GCC || COMPILER == COMPILER_CLANG) - #define UINT128_SUPPORT - typedef unsigned uint128_t __attribute__((mode(TI))); -#elif (TARGET == TARGET_AMD64) && (OS_TARGET == OS_WIN && COMPILER == COMPILER_VC) - #define SCALAR_INTRIN_SUPPORT - typedef uint64_t uint128_t[2]; -#else - #error -- "Unsupported configuration" -#endif - - -// Define if zeroing of temporaries in low-level functions is required -//#define TEMP_ZEROING - - -// Basic parameters for variable-base scalar multiplication (without using endomorphisms) -#define NPOINTS_VARBASE (1 << (W_VARBASE-2)) -#define t_VARBASE ((NBITS_ORDER_PLUS_ONE+W_VARBASE-2)/(W_VARBASE-1)) - - -// Basic parameters for fixed-base scalar multiplication -#define E_FIXEDBASE (NBITS_ORDER_PLUS_ONE + W_FIXEDBASE*V_FIXEDBASE - 1)/(W_FIXEDBASE*V_FIXEDBASE) -#define D_FIXEDBASE E_FIXEDBASE*V_FIXEDBASE -#define L_FIXEDBASE D_FIXEDBASE*W_FIXEDBASE -#define NPOINTS_FIXEDBASE V_FIXEDBASE*(1 << (W_FIXEDBASE-1)) -#define VPOINTS_FIXEDBASE (1 << (W_FIXEDBASE-1)) -#if (NBITS_ORDER_PLUS_ONE-L_FIXEDBASE == 0) // This parameter selection is not supported - #error -- "Unsupported parameter selection for fixed-base scalar multiplication" -#endif - - -// Basic parameters for double scalar multiplication -#define NPOINTS_DOUBLEMUL_WP (1 << (WP_DOUBLEBASE-2)) -#define NPOINTS_DOUBLEMUL_WQ (1 << (WQ_DOUBLEBASE-2)) - - -// FourQ's point representations - -typedef struct { f2elm_t x; f2elm_t y; f2elm_t z; f2elm_t ta; f2elm_t tb; } point_extproj; // Point representation in extended coordinates. 
-typedef point_extproj point_extproj_t[1]; -typedef struct { f2elm_t xy; f2elm_t yx; f2elm_t z2; f2elm_t t2; } point_extproj_precomp; // Point representation in extended coordinates (for precomputed points). -typedef point_extproj_precomp point_extproj_precomp_t[1]; -typedef struct { f2elm_t xy; f2elm_t yx; f2elm_t t2; } point_precomp; // Point representation in extended affine coordinates (for precomputed points). -typedef point_precomp point_precomp_t[1]; - - -/********************** Constant-time unsigned comparisons ***********************/ - -// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise - -static __inline unsigned int is_digit_nonzero_ct(digit_t x) -{ // Is x != 0? - return (unsigned int)((x | (0-x)) >> (RADIX-1)); -} - -static __inline unsigned int is_digit_zero_ct(digit_t x) -{ // Is x = 0? - return (unsigned int)(1 ^ is_digit_nonzero_ct(x)); -} - -static __inline unsigned int is_digit_lessthan_ct(digit_t x, digit_t y) -{ // Is x < y? - return (unsigned int)((x ^ ((x ^ y) | ((x - y) ^ y))) >> (RADIX-1)); -} - - -/********************** Macros for digit operations **********************/ - -#if defined(GENERIC_IMPLEMENTATION) - -// Digit multiplication -#define MUL(multiplier, multiplicand, hi, lo) \ - digit_x_digit((multiplier), (multiplicand), &(lo)); - -// Digit addition with carry -#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ - { digit_t tempReg = (addend1) + (digit_t)(carryIn); \ - (sumOut) = (addend2) + tempReg; \ - (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg)); } - -// Digit subtraction with borrow -#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ - { digit_t tempReg = (minuend) - (subtrahend); \ - unsigned int borrowReg = (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) & is_digit_zero_ct(tempReg))); \ - (differenceOut) = tempReg - (digit_t)(borrowIn); \ - (borrowOut) = borrowReg; } - -// Shift right with flexible datatype -#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (DigitSize - (shift))); - -// shift left with flexible datatype -#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> (DigitSize - (shift))); - -// 64x64-bit multiplication -#define MUL128(multiplier, multiplicand, product) \ - mp_mul((digit_t*)&(multiplier), (digit_t*)&(multiplicand), (digit_t*)&(product), NWORDS_FIELD/2); - -// 128-bit addition, inputs < 2^127 -#define ADD128(addend1, addend2, addition) \ - mp_add((digit_t*)(addend1), (digit_t*)(addend2), (digit_t*)(addition), NWORDS_FIELD); - -// 128-bit addition with output carry -#define ADC128(addend1, addend2, carry, addition) \ - (carry) = mp_add((digit_t*)(addend1), (digit_t*)(addend2), (digit_t*)(addition), NWORDS_FIELD); - -#elif (TARGET == TARGET_AMD64 && OS_TARGET == OS_WIN) - -// Digit multiplication -#define MUL(multiplier, multiplicand, hi, lo) \ - (lo) = _umul128((multiplier), (multiplicand), (hi)); - -// Digit addition with carry -#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ - (carryOut) = _addcarry_u64((carryIn), (addend1), (addend2), &(sumOut)); - -// Digit subtraction with borrow -#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ - (borrowOut) = _subborrow_u64((borrowIn), (minuend), (subtrahend), &(differenceOut)); - -// Digit shift right -#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = 
__shiftright128((lowIn), (highIn), (shift)); - -// Digit shift left -#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = __shiftleft128((lowIn), (highIn), (shift)); - -// 64x64-bit multiplication -#define MUL128(multiplier, multiplicand, product) \ - (product)[0] = _umul128((multiplier), (multiplicand), &(product)[1]); - -// 128-bit addition, inputs < 2^127 -#define ADD128(addend1, addend2, addition) \ - { unsigned char carry = _addcarry_u64(0, (addend1)[0], (addend2)[0], &(addition)[0]); \ - _addcarry_u64(carry, (addend1)[1], (addend2)[1], &(addition)[1]); } - -// 128-bit addition with output carry -#define ADC128(addend1, addend2, carry, addition) \ - (carry) = _addcarry_u64(0, (addend1)[0], (addend2)[0], &(addition)[0]); \ - (carry) = _addcarry_u64((carry), (addend1)[1], (addend2)[1], &(addition)[1]); - -// 128-bit subtraction, subtrahend < 2^127 -#define SUB128(minuend, subtrahend, difference) \ - { unsigned char borrow = _subborrow_u64(0, (minuend)[0], (subtrahend)[0], &(difference)[0]); \ - _subborrow_u64(borrow, (minuend)[1], (subtrahend)[1], &(difference)[1]); } - -// 128-bit right shift, max. shift value is 64 -#define SHIFTR128(Input, shift, shiftOut) \ - (shiftOut)[0] = __shiftright128((Input)[0], (Input)[1], (shift)); \ - (shiftOut)[1] = (Input)[1] >> (shift); - -// 128-bit left shift, max. shift value is 64 -#define SHIFTL128(Input, shift, shiftOut) \ - (shiftOut)[1] = __shiftleft128((Input)[0], (Input)[1], (shift)); \ - (shiftOut)[0] = (Input)[0] << (shift); - -#elif ((TARGET == TARGET_AMD64 || TARGET == TARGET_ARM64) && OS_TARGET == OS_LINUX) - -// Digit multiplication -#define MUL(multiplier, multiplicand, hi, lo) \ - { uint128_t tempReg = (uint128_t)(multiplier) * (uint128_t)(multiplicand); \ - *(hi) = (digit_t)(tempReg >> RADIX); \ - (lo) = (digit_t)tempReg; } - -// Digit addition with carry -#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ - { uint128_t tempReg = (uint128_t)(addend1) + (uint128_t)(addend2) + (uint128_t)(carryIn); \ - (carryOut) = (digit_t)(tempReg >> RADIX); \ - (sumOut) = (digit_t)tempReg; } - -// Digit subtraction with borrow -#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ - { uint128_t tempReg = (uint128_t)(minuend) - (uint128_t)(subtrahend) - (uint128_t)(borrowIn); \ - (borrowOut) = (digit_t)(tempReg >> (sizeof(uint128_t)*8 - 1)); \ - (differenceOut) = (digit_t)tempReg; } - -// Digit shift right -#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (RADIX - (shift))); - -// Digit shift left -#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \ - (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> (RADIX - (shift))); - -#endif - - -/**************** Function prototypes ****************/ - -/************* Multiprecision functions **************/ - -// Check if multiprecision element is zero -bool is_zero_ct(digit_t* a, unsigned int nwords); - -// Multiprecision addition, c = a+b. Returns the carry bit -unsigned int mp_add(digit_t* a, digit_t* b, digit_t* c, unsigned int nwords); - -// Schoolbook multiprecision multiply, c = a*b -void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); - -// Multiprecision subtraction, c = a-b. 
Returns the borrow bit -#if defined (GENERIC_IMPLEMENTATION) -unsigned int subtract(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); -#else -unsigned char subtract(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords); -#endif - -// Clear "nwords" integer-size digits from memory -extern void clear_words(void* mem, unsigned int nwords); - -/************ Field arithmetic functions *************/ - -// Copy of a field element, c = a -void fpcopy1271(felm_t a, felm_t c); - -// Field negation, a = -a mod p -void fpneg1271(felm_t a); - -// Modular correction, a = a mod p -void mod1271(felm_t a); - -// Field addition, c = a+b mod p -void fpadd1271(felm_t a, felm_t b, felm_t c); - -// Field subtraction, c = a-b mod p -void fpsub1271(felm_t a, felm_t b, felm_t c); - -// Field division by two, c = a/2 mod p -void fpdiv1271(felm_t a); - -// Field multiplication, c = a*b mod p -void fpmul1271(felm_t a, felm_t b, felm_t c); - -// Field squaring, c = a^2 mod p -void fpsqr1271(felm_t a, felm_t c); - -// Field inversion, af = a^-1 = a^(p-2) mod p -void fpinv1271(felm_t a); - -// Exponentiation over GF(p), af = a^(125-1) -void fpexp1251(felm_t a, felm_t af); - -/************ Quadratic extension field arithmetic functions *************/ - -// Zeroing a quadratic extension field element, a=0 -void fp2zero1271(f2elm_t a); - -// Copy quadratic extension field element, c = a -void fp2copy1271(f2elm_t a, f2elm_t c); - -// Quadratic extension field negation, a = -a in GF((2^127-1)^2) -void fp2neg1271(f2elm_t a); - -// Quadratic extension field addition, c = a+b in GF((2^127-1)^2) -void fp2add1271(f2elm_t a, f2elm_t b, f2elm_t c); - -// Quadratic extension field subtraction, c = a-b in GF((2^127-1)^2) -void fp2sub1271(f2elm_t a, f2elm_t b, f2elm_t c); - -// Quadratic extension field addition/subtraction, c = 2a-b in GF((2^127-1)^2) -void fp2addsub1271_a(f2elm_t a, f2elm_t b, f2elm_t c); - -// Quadratic extension field multiplication, c = a*b in GF((2^127-1)^2) -void fp2mul1271(f2elm_t a, f2elm_t b, f2elm_t c); -void fp2mul1271_a(f2elm_t a, f2elm_t b, f2elm_t c); - -// Quadratic extension field squaring, c = a^2 in GF((2^127-1)^2) -void fp2sqr1271(f2elm_t a, f2elm_t c); -void fp2sqr1271_a(f2elm_t a, f2elm_t c); - -// Quadratic extension field inversion, af = a^-1 = a^(p-2) in GF((2^127-1)^2) -void fp2inv1271(f2elm_t a); - -/************ Curve and recoding functions *************/ - -// Normalize projective twisted Edwards point Q = (X,Y,Z) -> P = (x,y) -void eccnorm(point_extproj_t P, point_t Q); - -// Conversion from representation (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT), where T = Ta*Tb -void R1_to_R2(point_extproj_t P, point_extproj_precomp_t Q); - -// Conversion from representation (X,Y,Z,Ta,Tb) to (X+Y,Y-X,Z,T), where T = Ta*Tb -void R1_to_R3(point_extproj_t P, point_extproj_precomp_t Q); - -// Conversion from representation (X+Y,Y-X,2Z,2dT) to (2X,2Y,2Z,2dT) -void R2_to_R4(point_extproj_precomp_t P, point_extproj_t Q); - -// Point doubling 2P -void eccdouble_ni(point_extproj_t P); -void eccdouble(point_extproj_t P); - -// Complete point addition P = P+Q or P = P+P -void eccadd_ni(point_extproj_precomp_t Q, point_extproj_t P); -void eccadd(point_extproj_precomp_t Q, point_extproj_t P); -void eccadd_core(point_extproj_precomp_t P, point_extproj_precomp_t Q, point_extproj_t R); - -// Psi mapping of a point, P = psi(P) -void ecc_psi(point_extproj_t P); - -// Phi mapping of a point, P = phi(P) -void ecc_phi(point_extproj_t P); - -// Scalar decomposition -void 
decompose(uint64_t* k, uint64_t* scalars); - -// Recoding sub-scalars for use in the variable-base scalar multiplication -void recode(uint64_t* scalars, unsigned int* digits, unsigned int* sign_masks); - -// Computes the fixed window representation of scalar -void fixed_window_recode(uint64_t* scalar, unsigned int* digits, unsigned int* sign_masks); - -// Convert scalar to odd if even using the prime subgroup order r -void conversion_to_odd(digit_t* k, digit_t* k_odd); - -// Co-factor clearing -void cofactor_clearing(point_extproj_t P); - -// Precomputation function -void ecc_precomp(point_extproj_t P, point_extproj_precomp_t *T); - -// Constant-time table lookup to extract an extended twisted Edwards point (X+Y:Y-X:2Z:2T) from the precomputed table -void table_lookup_1x8(point_extproj_precomp_t* table, point_extproj_precomp_t P, unsigned int digit, unsigned int sign_mask); -void table_lookup_1x8_a(point_extproj_precomp_t* table, point_extproj_precomp_t P, unsigned int* digit, unsigned int* sign_mask); - -// Modular correction of input coordinates and conversion to representation (X,Y,Z,Ta,Tb) -void point_setup(point_t P, point_extproj_t Q); -void point_setup_ni(point_t P, point_extproj_t Q); - -// Point validation: check if point lies on the curve -bool ecc_point_validate(point_extproj_t P); - -// Output error/success message for a given ECCRYPTO_STATUS -const char* FourQ_get_error_message(ECCRYPTO_STATUS Status); - -// Mixed point addition P = P+Q or P = P+P -void eccmadd_ni(point_precomp_t Q, point_extproj_t P); - -// Constant-time table lookup to extract a point represented as (x+y,y-x,2t) -void table_lookup_fixed_base(point_precomp_t* table, point_precomp_t P, unsigned int digit, unsigned int sign); - -// Computes the modified LSB-set representation of scalar -void mLSB_set_recode(uint64_t* scalar, unsigned int *digits); - -// Generation of the precomputation table used internally by the double scalar multiplication function ecc_mul_double() -void ecc_precomp_double(point_extproj_t P, point_extproj_precomp_t* Table, unsigned int npoints); - -// Computes wNAF recoding of a scalar -void wNAF_recode(uint64_t scalar, unsigned int w, int* digits); - -// Encode point P -void encode(point_t P, unsigned char* Pencoded); - -// Decode point P -ECCRYPTO_STATUS decode(const unsigned char* Pencoded, point_t P); - - -/************ Functions based on macros *************/ - -// Copy extended projective point Q = (X:Y:Z:Ta:Tb) to P -#define ecccopy(Q, P); fp2copy1271((Q)->x, (P)->x); \ - fp2copy1271((Q)->y, (P)->y); \ - fp2copy1271((Q)->z, (P)->z); \ - fp2copy1271((Q)->ta, (P)->ta); \ - fp2copy1271((Q)->tb, (P)->tb); - -// Copy extended projective point Q = (X+Y,Y-X,2Z,2dT) to P -#define ecccopy_precomp(Q, P); fp2copy1271((Q)->xy, (P)->xy); \ - fp2copy1271((Q)->yx, (P)->yx); \ - fp2copy1271((Q)->z2, (P)->z2); \ - fp2copy1271((Q)->t2, (P)->t2); - -// Copy extended affine point Q = (x+y,y-x,2dt) to P -#define ecccopy_precomp_fixed_base(Q, P); fp2copy1271((Q)->xy, (P)->xy); \ - fp2copy1271((Q)->yx, (P)->yx); \ - fp2copy1271((Q)->t2, (P)->t2); - - -#ifdef __cplusplus -} -#endif - - -#endif diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ_params.h b/ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ_params.h deleted file mode 100644 index 8a064d6..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ_params.h +++ /dev/null @@ -1,52 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the 
elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: FourQ's curve parameters -* -* This code is based on the paper "FourQ: four-dimensional decompositions on a -* Q-curve over the Mersenne prime" by Craig Costello and Patrick Longa, in Advances -* in Cryptology - ASIACRYPT, 2015. -* Preprint available at http://eprint.iacr.org/2015/565. -************************************************************************************/ - -#pragma once -#ifndef __FOURQ_PARAMS_H__ -#define __FOURQ_PARAMS_H__ - -#include "FourQ_internal.h" - - -// Encoding of field elements, elements over Z_r and elements over GF(p^2): -// ----------------------------------------------------------------------- -// Elements over GF(p) and Z_r are encoded with the least significant digit located in the leftmost position (i.e., little endian format). -// Elements (a+b*i) over GF(p^2), where a and b are defined over GF(p), are encoded as a||b, with a in the least significant position. - -static const uint64_t PARAMETER_d[4] = { 0x0000000000000142, 0x00000000000000E4, 0xB3821488F1FC0C8D, 0x5E472F846657E0FC }; -static const uint64_t GENERATOR_x[4] = { 0x286592AD7B3833AA, 0x1A3472237C2FB305, 0x96869FB360AC77F6, 0x1E1F553F2878AA9C }; -static const uint64_t GENERATOR_y[4] = { 0xB924A2462BCBB287, 0x0E3FEE9BA120785A, 0x49A7C344844C8B5C, 0x6E1C4AF8630E0242 }; -static const uint64_t curve_order[4] = { 0x2FB2540EC7768CE7, 0xDFBD004DFE0F7999, 0xF05397829CBC14E5, 0x0029CBC14E5E0A72 }; -static const uint64_t Montgomery_Rprime[4] = { 0xC81DB8795FF3D621, 0x173EA5AAEA6B387D, 0x3D01B7C72136F61C, 0x0006A5F16AC8F9D3 }; -static const uint64_t Montgomery_rprime[4] = { 0xE12FE5F079BC3929, 0xD75E78B8D1FCDCF3, 0xBCE409ED76B5DB21, 0xF32702FDAFC1C074 }; - - -// Constants for hash to FourQ function - -#if (RADIX == 32) - static felm_t con1 = { 6651107, 0, 4290264256, 2147483647 }; - static felm_t con2 = { 1725590130, 1719979744, 2225079900, 707200452 }; - static felm_t b0 = { 3738038324, 2664081113, 587564626, 1252475115 }; - static felm_t b1 = { 17, 0, 4294967284, 2147483647 }; - static felm_t A0 = { 1289, 0, 4294966384, 2147483647 }; - static felm_t A1 = { 1007904792, 2866591091, 4136083791, 1668973403 }; -#elif (RADIX == 64) - static felm_t con1 = { 6651107ULL, 9223372036850072768ULL }; - static felm_t con2 = { 7387256751988042354ULL, 3037402815281497692ULL }; - static felm_t b0 = { 11442141257964318772ULL, 5379339658566403666ULL }; - static felm_t b1 = { 17ULL, 9223372036854775796ULL }; - static felm_t A0 = { 1289ULL, 9223372036854774896ULL }; - static felm_t A1 = { 12311914987857864728ULL, 7168186187914912079ULL }; -#endif - -#endif \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ_tables.h b/ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ_tables.h deleted file mode 100644 index 0de7264..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/FourQ_tables.h +++ /dev/null @@ -1,365 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: precomputation tables -************************************************************************************/ - -#ifndef __TABLES_H__ -#define __TABLES_H__ - -#include - - -// The table below was generated using window width W = 5 and table parameter V = 5 (see http://eprint.iacr.org/2013/158). 
-// Number of point entries = 5 * 2^4 = 80 points, where each point (x,y) is represented using coordinates (x+y,y-x,2*d*t). -// Table size = 80 * 3 * 256 = 7.5KB - -static const uint64_t FIXED_BASE_TABLE[960] = { - 0xe18a34f3a703e631, 0x287460bf1d502b5f, 0xe02e62f7e4f90353, 0x0c3ba0378b86acde, 0x90bf0f98b0937edc, 0x740b7c7824f0c555, 0xb321239123a01366, 0x4ffcf5b93a9557a5, 0x297afccbabda42bb, 0x5948d137556c97c6, 0xa8189a393330684c, 0x0caf2b720a341f27 -, 0x3a8ba018fd188787, 0x5546128188dd12a8, 0xb0b3cc33c09f9b77, 0x1baeeaf8b84d2049, 0x006425a611faf900, 0x18f7cd12e1a6f789, 0x6dccf09a12556066, 0x448e05eeace7b6eb, 0xbf2f33689d2829b0, 0x6d911dcb2957bdb4, 0x9f2353dbdc3c03ee, 0x06c54305babee501 -, 0x2eaf45713dafa125, 0x72963058648a364d, 0x61b7771f9d313ef2, 0x4f41c7f8bfe2b069, 0x408623ae599790ac, 0x4d33858644330a42, 0xfc5696649cdd7487, 0x74df72e0e598e114, 0xc9a06325913c110b, 0x076bd4115fe4b0d8, 0x76619e65d6bff3d9, 0x249240147cee3a08 -, 0xd695b96148965a73, 0x28aac8a28829f706, 0x41f1c05329f7a57b, 0x441ca9e89f03e00e, 0xe1aa38ab8bf7241e, 0x58f28cafc832b7f4, 0xcadaf8b8fa5400c6, 0x34b6d106284e863e, 0xf5498cab3af15097, 0x6dbe7790017d9c49, 0x63bf76a81448e8bc, 0x6371925bf23ae006 -, 0xc5e2c721bded81fa, 0x4ede70eed68056ab, 0x8f3cd9b5b4975810, 0x4752fd192f0a9aa8, 0x318794eb1f734414, 0x11ddf7d2c8468662, 0x2613b06f72b1a34e, 0x465575b37ab06770, 0x40b9845f82638d2b, 0x48894050790298ce, 0xbedb93a501b4f131, 0x04f3560d2889b2fb -, 0x457dd875115b278b, 0x56f25ee54d92858a, 0x92d4c1cdce0c977e, 0x078fca4187d74996, 0x3bbb2ded76cc22a1, 0x117b28853ddc2bf6, 0x43f3767cb9c2baa2, 0x73079e25e0ea8a8f, 0x0177992b5a15796d, 0x2e77721480d9ef92, 0xbe09883567372916, 0x258f176b7af7576d -, 0x308338fd6168391b, 0x7285925f9a7353a4, 0x862c0fd04fe85114, 0x53259ee7423aeb51, 0xfe0031a84b3b1a68, 0x1a4f1d661fa071fc, 0x2ddd54168dc928a7, 0x60185c1adf196a6a, 0x49809717dc6da9b4, 0x6062094b4dcffc03, 0xa41ea6fa05fa7e8d, 0x4a4fe06f277148a0 -, 0x7bb253a9ee9e80f0, 0x419a928bccb11733, 0x84323be66a9a039e, 0x01b2d1ae972814bb, 0xa7588584d3051231, 0x54df1e20cc979dd7, 0x91d906fe3e2f22dd, 0x4e36e9975fdf1a0f, 0xd81871746b747634, 0x3e5e31baeee13433, 0xe4da80979573baa3, 0x4b852ad97cfe77c6 -, 0xe08b346714418b9e, 0x283d719b2fe6ef88, 0xb7339d2de45c180b, 0x75acfcef11d2d5c8, 0x8f40777a8c561876, 0x0c54ac40a7134c4b, 0xb92e287d66baee08, 0x6f357e5006a188bf, 0xc5903319ed1e6971, 0x747c45ef91dafd40, 0xde4086a91d2f816e, 0x5dcb27edb3b3ef7d -, 0x43fdc46cfa1dd2ee, 0x51551f9f70966498, 0xb54534f761ed9bdc, 0x453455b3073fb07f, 0xf24773e383cab70b, 0x679be25e758cf4df, 0xda17edf2943eee29, 0x3dc9e5b8d6dc0f66, 0x56a50cba413fb75b, 0x1e65315bc5a8537f, 0x5ff90242802c7213, 0x73c9d8c8f425252e -, 0x3c637b8633198c8f, 0x534f84b3ed414f33, 0xad313e72dedd6902, 0x5ed57e941cdf33af, 0x5a6fe01d2a57306e, 0x73b63dea344713f9, 0x39cb70570f1c2bf3, 0x2df8c6e49f1a18db, 0x661bc349677797e4, 0x501ae7cbbebe9062, 0x5b52a88de8959643, 0x0372752811c01d51 -, 0x010c57a2301bb928, 0x378b317155554fc6, 0xf883fa4229a02cf1, 0x5f0047b850d7db29, 0x4d247ae328402daa, 0x0d030627a850a2bc, 0xb4e65d9a88a443f5, 0x6ec9686b2d6db089, 0xde202e08fea1d987, 0x5c64e1d3f28d7600, 0x157d17bef661bfb7, 0x56392d36dd75334c -, 0xe25478d8bd19155c, 0x146d4f2d3d336afd, 0x9bfbe00bf94e15e8, 0x2b185a9a6adf10c0, 0x926527b3ed52ab7b, 0x67997e1473101e80, 0xb58f4ff4947cc541, 0x36f800c7fac99a7a, 0xd0302e32400456d9, 0x4372e43640bc697b, 0x9144cabb4750d898, 0x75d25afac9a23cbf -, 0x794591767655cbfe, 0x74db216617fc4b07, 0x7057b2242566d0c9, 0x1d543b5908417b23, 0x19c280b444428783, 0x352309fd8b6cc3ef, 0x37833d6ac068ae72, 0x4ec0671a23c019f4, 0x9d9836e1a3d05bb5, 
0x44fe1adff224efe3, 0xa296bc3ce57efb4a, 0x2efec86835a14150 -, 0x2fe19c09fb194bca, 0x18cc07d3953cd206, 0x5bdff217c9c0b9e0, 0x671aa756581abcee, 0xe1cc33ae28f7d1a2, 0x1b6f254937a0a3fe, 0x51503d1665babb83, 0x74b95636d5889211, 0xbdb97ae4ea96f869, 0x1507ce189e2510bd, 0x796e4d54fab93b13, 0x6a81765f05960929 -, 0x2e940521e5a833ed, 0x3bdea532b245f644, 0xbea76975ffd52693, 0x64b94848ba6d4ed6, 0x9db52d0194e33ec7, 0x71cf65da55639f25, 0xede73b1fdb5a8138, 0x12e4d13b6c62dc22, 0x9d19b0c265185517, 0x77a011d257b5fdd0, 0x1fedc5caaecd84e4, 0x46844e151e3492d1 -, 0x7a423a31904220df, 0x5b3165c747e8f099, 0x1c665eeadf35e22e, 0x7802b556fc45595b, 0x85a2def4015bd2de, 0x17f2ab87957166ad, 0x19cf6d352060c1e5, 0x122a7ad1be408e6a, 0x5b79bbc8645bf766, 0x20fb009d4d0adacf, 0x97526a272ba28538, 0x7041b4e90d420bde -, 0x3b30113358dab057, 0x3d398b66f0d24243, 0x91a5999a03cd4708, 0x1eae2409cd938096, 0x66dd6b604c36108c, 0x1713083789081968, 0x57cad6917125dcfd, 0x34b06cb89704f1ca, 0xdcafe8e71f35abf2, 0x698331198d544db9, 0x6287676643af075b, 0x200950e5559d2b6d -, 0xd4f63fc3ecdd9074, 0x7473317142ac13a2, 0x96b0030805319356, 0x2c20ffe0244378ba, 0x4889511ad26ac01a, 0x4ee327219997fcf6, 0x15ffe6e70f0bf8ea, 0x6b617fb4a6d0a6d7, 0x4916dca1c52f7324, 0x3c8269f086468277, 0xc24210c4c837e04b, 0x4e480b4f915a542c -, 0xc5fef3b09a7fe35e, 0x31a501de44fd84b2, 0x79f29e4940a407b9, 0x0ba7e03ca5cce5ab, 0xa7a8b2058a74d8ea, 0x46f4c7810e26dadc, 0x46171ace94a1128a, 0x44db55025495a811, 0x7f889e1a4bf18d5c, 0x4d4f172a43f306b2, 0x33a99766bb1cffad, 0x6254775924d39aca -, 0xd855230ec225136e, 0x1c544dd078d9211d, 0x12fe9969f63f63ba, 0x069af1dc949dd382, 0x305bcf40cfe5c256, 0x63ae90924bbbb595, 0xe451097793b7de06, 0x09780cf39fc0043e, 0x827af8e7eb798871, 0x3ace8a6c77577a37, 0x79df061332e055ba, 0x561dc07aaacea92b -, 0x7e4422d9820d2673, 0x6b85df83e0af5348, 0x1f151ac1ded8526b, 0x35ead8e5157142bd, 0x6da6ef6c33c79dd4, 0x5f2ea04d2594fde4, 0x91037d0cc027d5fa, 0x53b5401007b0331b, 0x810f198a3d4ba5a3, 0x4463bd259ba94195, 0x32b894acec2acf9e, 0x78711761d64349ce -, 0x253ae1b3f51fe211, 0x409e4b3f535b6463, 0x3a236d10da5e49de, 0x19d2b1029c21336a, 0x2835f40436aadd90, 0x0942a31505190b19, 0xc189131876828279, 0x3afe96c3ca8e1f9c, 0x9f1801b491230693, 0x39e28db8625fd091, 0x9fab50355dd44c8e, 0x145155da729b280d -, 0xd3ccf8101d4d76d5, 0x5a0faa1a8c2b6c68, 0x3cc66c84cb54ea8a, 0x51052ce3f566c773, 0x3bee14de65ae9ff5, 0x7586118a01ccf024, 0x089e791c896bf15e, 0x35ff022d261d93d6, 0xcd3ce13d8f7d1cf9, 0x4f1de98f95b7b8f6, 0x51e68a2462dc41b4, 0x61ad9e3c23f6dd29 -, 0x584fea6480ebdb51, 0x5d52fe073f9decf3, 0x9afe483eadf336d5, 0x1dfa03c980b1696a, 0x55f73d47ff819a19, 0x697bf55d361100ed, 0xded4804446399419, 0x618c94467fce259f, 0xf2597ff1f08ef50c, 0x07c935b98dd933c0, 0xbb758cbc78ded5f6, 0x1e9a0d06af13148f -, 0x879ce1457f4cd4db, 0x28396ca1962d4994, 0xf5095a3dc57605c3, 0x1e570f3da4c527b1, 0x2af69a3904935787, 0x591ee376fdd01cce, 0xf77b58df88bc8633, 0x5464d651b2f395d1, 0xafbc096b1e9a86ae, 0x6ce2df4bf65b6b28, 0x3b3a828d2e9d3e08, 0x6382011d8d2d66d0 -, 0x94987ca64d3d193d, 0x50ddf70d3b6d56af, 0x8d5df67cc8ad15a9, 0x39208098bc5b1f92, 0xce99f520dfd5a4fb, 0x323bbc87b86a7ba9, 0xe13f88a8d803c789, 0x56ffdcbdf2200055, 0x3aff0da31b24c72d, 0x70011566460c0c16, 0x76f7b7f53ac46a13, 0x1c069bfeb7077bc2 -, 0x8f47193ca14a3c36, 0x6d73e34af088de3d, 0x634b2bd9317d6634, 0x5b404738b77f1ec8, 0xf34fabb71ca1cb1d, 0x054abbcaca546a46, 0xe8cdcadd08eda660, 0x6971abbf958bdef1, 0x41338557dddb4eaf, 0x1e158585b079b67c, 0xd2270474cfa26068, 0x53b36d32b3cea469 -, 0x011523c16c543d08, 0x4668e92c5f73314e, 0xbaef3ebe4117acd1, 0x04037d1aa713931a, 0x68e118e4e390c68d, 
0x6b80cd55a44c1575, 0x7307ea8a5729c032, 0x5cc5475feee99ab2, 0x34450e424c14ac75, 0x3f09157e5db3dcd8, 0x62ce2b1b50588052, 0x27a899c54e652f8f -, 0x0acd039f2fc2a5ed, 0x4b4044ddd5813eec, 0xc04d189e90a75958, 0x242551bce71d33a1, 0xd95af96b51f87f05, 0x02988820f809d815, 0xb27f65f73b9483c5, 0x2ef60745f4364b43, 0xcb66bdc93f4fb8b9, 0x2b86c9b48756bb8a, 0xf8ebdae09b9867a1, 0x441e70184e6fe9aa -, 0xfdc2530330cc1289, 0x47d8d65a8b4d6992, 0x8c03b6fa30ae74be, 0x1ca8693cc3bd99d5, 0x699eb1511018f2a6, 0x3da04764d9f4fff5, 0x361720433d3aab59, 0x2fa911612cb857ff, 0xa4057da10c2f1cac, 0x48a219b933a5c619, 0x42341020d15f0bc5, 0x73f8895046a09dad -, 0x1bad5312c67421b8, 0x4194771b368e622e, 0x8cc71a79e44e0dff, 0x4b4564e45467f1c2, 0x7759f16aafe52093, 0x391b71dcd75fbea9, 0x2a1c0694ab4ef798, 0x023087545444130d, 0x4b7ae1ffcfaa1aa1, 0x64e26f32d73361e7, 0x8da47038bd0b54b9, 0x148cfa6feaecee15 -, 0x3756d4d479c2cc3d, 0x25d44ea8d31543de, 0xd82c8bef26bb2c43, 0x2c2047033d27f37f, 0x5bd33d9837dad260, 0x77943117a3383b7d, 0x12071d697ea583f2, 0x3c7c41272a225bf2, 0x92ebbdfaf1f03ad3, 0x5d61030c68b63704, 0xca6e2853baee75d1, 0x12404b34771a3636 -, 0xbe13c46326667e4f, 0x2bd261916f9be3b0, 0x86e3f8cbadc80f89, 0x74520d8a1794cb48, 0x1e15c745024cf97e, 0x5cee741e1e53eb02, 0x8d088de0af99cda1, 0x625812961cc0862c, 0x4313437321c0e934, 0x60bbc768c424f7a4, 0xaba71fbf3c10e143, 0x37b8ea9f14a915b8 -, 0x8d96ec65c40213ff, 0x74a08828ff77845c, 0xbedb7194daf607a3, 0x17e86671161c8706, 0xaceb98e0524059cf, 0x68552ac494916f09, 0x4cd2971baf1b3c47, 0x68442ebcdde21b70, 0x19629b8c0e867595, 0x6a6955d3635fa47a, 0x6fab45e0f2e393ad, 0x66dd3ef4fcf050c4 -, 0xbb0b7abcfddc7df1, 0x14eb5b751b0bcf9c, 0x1cf79f9ca2fd411d, 0x5c496f73fff0600a, 0x49648d8555426d70, 0x46c1016a2322d8a9, 0xb57fdb870d9b6d4f, 0x609eb65209ddb633, 0xe70f9166bedc82c5, 0x772fb5b5c8afaf27, 0x79a294d9b0227a20, 0x7f75b141112dbc8d -, 0x98d1c7f88e070020, 0x5953d0aac48217b1, 0xe28253ebe15f33ff, 0x267d1dc11e614c45, 0xbe64f50ab99e2246, 0x4eaaab5c82fe5495, 0x927d5ac07e60bed0, 0x67d3786de6aa1b4d, 0xa71962bf0f6e2945, 0x63d93844a35eea9b, 0xb34228c7d26640ac, 0x169c38d2eb28f5a1 -, 0x4b7972b33439dc22, 0x71478457cdaa1e14, 0x5226e125ec1d58c7, 0x669d8796e78fd4f1, 0x750dd1aaaa44a07f, 0x327c62b55aebbecf, 0x006b8e95b54fbd25, 0x2ab3f95d01eb364e, 0xfcbe5080c0d5e196, 0x2a1b9bd75a57e725, 0x1d2b2b6758139b5d, 0x751cf4af849b7a73 -, 0x164a7d2e337d00a5, 0x00cee3a4cb83a4bc, 0x3498e0366dbe28f9, 0x053d899148d28502, 0x01665d64cab0fb69, 0x4a99132208d68e74, 0xba44bbd4bd3f915d, 0x1d34b0f9172122bb, 0x5d114dc729e8a9f3, 0x08e7a43dd5334b60, 0x28db8e9232f0f3e8, 0x5cb7be1b80264f62 -, 0x9af2c78782508f23, 0x336ae7ccf7e3a1b2, 0x7fe2d4ee2dd194be, 0x573d2e1b2b8a6872, 0x3332ea3363b2ea36, 0x200bc1375b1f4243, 0x65c47c8c06b3260d, 0x42021fca53995c5e, 0x2f7e6cf49bb19946, 0x311fba6a23196d2c, 0xc30c13b62be0d70d, 0x61eeac142711b0dc -, 0x88526996597d35d4, 0x70169bcbe6bd21d7, 0xa0f1b2d0ad29a510, 0x2ade531472c1b94d, 0x11e320dc189873e7, 0x2d2a1794e85cdb38, 0xa0a8c453a6f621e3, 0x4b06d5b54525f6f7, 0xf42916691848ec1c, 0x1d4216555d578730, 0xf8c60da7290a5b4e, 0x66dd9f39a1f3565f -, 0x55ac29d937b474a0, 0x4291967a4a369ee4, 0x918dacaa12e6bc89, 0x3d46e8900651c310, 0xaf055430a00e90b1, 0x16f62bf56da5ca39, 0x1a021c33488c51e6, 0x0d64dadf63fbbcd5, 0x0918ece59dbfea7c, 0x3b3319d7dd74203a, 0x1d88545b8b9fa90c, 0x13b792dc908c59e6 -, 0x0a2d939a9c3d0979, 0x321a5dbeb74bf127, 0x5e5947fff66d8470, 0x22ec9ecafd26bc99, 0xde17ca8293b10536, 0x593f56c0559dd846, 0x1148373375485023, 0x23c6b0fdf7448b1c, 0x377904458a27804f, 0x573e91962726ea70, 0x35e1b24f3235ac70, 0x51ba082049f4f85e -, 0x4bc4918160d47194, 
0x5d29a21e3308e1dd, 0x7e15894b3e6e4e33, 0x50dbbd2f4f31d0fb, 0xef248bd235a9c9de, 0x3418add21b634710, 0x96c7233a52363bd2, 0x7c8414ad9a08c99f, 0xbc6acb4a54e6c05c, 0x5729021a1193579a, 0x0627c3e00b08fa1c, 0x3d0b4ff9e17c2a73 -, 0xd507e8755990317f, 0x75b27bb3bc7bfe48, 0x44a80f2c6ce651f5, 0x7b9795fc1b706e46, 0x9de75bdefdf9a640, 0x75ade50ababffaa8, 0xce0ab116870889a0, 0x6f3ddcfcdd59ec6c, 0x6e36833588de0674, 0x291d1129ea28a073, 0xf8b8e53864884d61, 0x706ef8f1ae854d76 -, 0x137a8c6583753069, 0x01e45f1cc620f966, 0xe28e1ff82f76c7ba, 0x36d29eace3e89c54, 0x83379f157f0b49cb, 0x65e9c39e2bacb937, 0x9b323c45070cda3e, 0x16e02f31ab7e2de5, 0x53bcf346635122b7, 0x1fd7e207d6c2de09, 0x3a5f5f94ea1e57ac, 0x0cba06e8d0f0b4df -, 0x70b440c387a9c392, 0x1e7dc143dee1d800, 0x5498ba6d7239912b, 0x332870a017182d14, 0x6be306fc672d794c, 0x2c2ce211245b2b4e, 0x109b722c8d2ba79f, 0x268520fa9c5f727a, 0x515b300524fe78ee, 0x736201eccbaea698, 0x4608ac113210bf78, 0x32d8fd919c441843 -, 0xc9557e1b04b8f2d8, 0x775437f798dc7459, 0x1200f5585ba417f5, 0x2e00ec5f3e7ad304, 0xfc873d5f2b446288, 0x32270a93624876e4, 0xc646a47c08789b22, 0x2370d9fe925616be, 0x430afa3619e671c4, 0x156468ceac1f5fb2, 0x3b84dec2f2417635, 0x31140e9017c0e58f -, 0x5c85f88ccb7443fa, 0x0da75f5d64d864ac, 0x295ff44871b0fb84, 0x1b79e10bad3336c3, 0xffdf9942dd2977b3, 0x4c1b198d0f9a1a23, 0xba778a24c112864e, 0x74f66897f26d48d0, 0x3fd5c06e867ab611, 0x4b98ce33ff7878b9, 0xf7db4dce75cb9165, 0x11665aa099ec5163 -, 0x2a498f16ae7118b9, 0x265ec3dbb4eb509a, 0x3da4230668ce2c86, 0x36e62baab2e33385, 0x99507d4a79ab4478, 0x25bfb2fc411e8875, 0xd7ac1ec933022ce1, 0x23d341ae033d0466, 0xd295b465e962bc00, 0x23d0211ba2d73180, 0xa03ccd7aff922d4d, 0x1e767148de301514 -, 0xc241ab36a894efab, 0x1c9fc2f343fc1e58, 0xca3b96562bd27a87, 0x53623e2285dd7015, 0x557411f01c219420, 0x19265577096b42f9, 0xd3312d941b23592f, 0x30a9a9a1c3c51c06, 0x3d89b0b3ea6e8f79, 0x7eab751dc5c77cb2, 0xc0a9b186e6df6e36, 0x4f844d583f155694 -, 0x419018232793dffa, 0x2add440b6bd3854d, 0xd55480f131df6e32, 0x318ce3846ae3e417, 0x0565062d1a0984f4, 0x6ebaec63d2bff9f6, 0x77075fe729e79790, 0x0dd9434624c8a4e7, 0xbf8f11e2dfa9b062, 0x1b17d8255ee8b364, 0x62c2150cf72c6344, 0x28106880d081e8dc -, 0xf4a4af0ddfec91c1, 0x1a8f0e6c977e1f2e, 0x72a7a3a738b9316f, 0x323716728c4e22ec, 0xc14069065ba4af3b, 0x081514248911d367, 0x51bd4afaa8b6c337, 0x50e77a9b513400e7, 0x46c0051b2a822548, 0x024886e41a5edcfc, 0xa06b0efa41cac17f, 0x336a30b01b9c5675 -, 0x74fb2c10ca097626, 0x2b204caa48e90981, 0x6902c952b9a17b74, 0x39c2e9b6b922303b, 0xb9216b9b3c597419, 0x6d92930264f15f76, 0x7b1297d5eeae1427, 0x0f0744adfe1bd307, 0x33b57e265be6a89d, 0x282fa2e533356c10, 0x3a03995c61dc772c, 0x4f5d8f5e893dcff5 -, 0x4bfc927efc48023f, 0x596f2241d6a685ae, 0x3cb3e0afec29b8a2, 0x31018e0d10653842, 0x2fd00fe944575626, 0x1241d8704982e011, 0x970d56664e6781a7, 0x1b05f49d0f3de2ce, 0xa994ffdf63717e66, 0x416374a76ba88e98, 0x8b082ced53f1579a, 0x56781dfab5d2aa4b -, 0x8151defd1865b318, 0x64669b840d6081f7, 0xe436f4bb5f38e14e, 0x43d438410a974b40, 0x5832ceb3d666be02, 0x06347d9e1ae1828e, 0x6979471b39e3ea86, 0x2cf2cf61cb4b5ae4, 0xb7ab29eada5a6ee4, 0x12e75cb29aca5768, 0xe65b1109d30d1ffc, 0x71f9becd6b320e5a -, 0xdc8289026647eed9, 0x31d62d050ca5458f, 0xea2bbf523a54c1e5, 0x602bf0b9e3ee5491, 0x25aa73622380ad4b, 0x2b6b1e3271df5f58, 0xdbc5efd86aa0470d, 0x05353c24b8c4354b, 0xa3c7db3cf5e06bca, 0x288a1c8f2b4ea5f7, 0xd6152f5e12ce7ca1, 0x59d4c1b436673c7d -, 0x1e02554e521fcb95, 0x66d3980f240ad440, 0xabf16f6b39a4d9d1, 0x7fea351ca94c2f62, 0x3d62b6f3389163ba, 0x0fc6b44f2e7895ea, 0xd5c64403cda7c669, 0x2e4099090e603193, 0x9b5c0faf15fa4c2f, 
0x46295c9d8e12b639, 0x5ce4add63a5b331b, 0x5fa7bd736c4c5879 -, 0x47b3471447d1aef2, 0x28004c1c22325739, 0xd588437d9a3c5299, 0x2ab19c1812cd27e8, 0x3ae700f680037802, 0x1ad163800b422b36, 0x45b7ef36fabc2139, 0x44bcdeff21dcbd1d, 0x41c6da2171e11c7b, 0x2c35ee79f7c4cc14, 0x4852942759c13849, 0x6492d26f10be050a -, 0xa6f54e988c50f0d9, 0x6a2db2b6dd62181b, 0xf7d9806b2a5e57a3, 0x57526bdb3ba53d20, 0x17ce6cb1f500e650, 0x05d841b042f8f345, 0xaa800a6c698de970, 0x04f4b559abe2cb8e, 0xc050dfd7259ce49d, 0x213839bdf94db935, 0xb371258655306204, 0x7d323b8b19f9705a -, 0x26d4502b16b6c618, 0x79717069aa89595b, 0xf867c0e36db41872, 0x13d601d86c76e1d0, 0x2dfc8b0d331b7383, 0x185472f3e42e8075, 0x05bd13e72b10eba0, 0x519a387490f79b95, 0x8d09c1b2d3ad2500, 0x045da45d2cf0f733, 0x640181956862426c, 0x728d57f59bfe1b09 -, 0xf9a99f878da2c585, 0x4fc4831e61dc4e10, 0x6dc602cc54394fe0, 0x0484566b67e9e8ae, 0xc5fcf0474a93809b, 0x71c0c23a58f3e2bb, 0xb400fabe36fe6c43, 0x614c2f3eaee4c0a7, 0x7610a980d0e1c6c1, 0x1ce8197c88885dcc, 0xeade1c9f3ac2cb2b, 0x471ad07baf2f341e -, 0xd67a837c6b01121b, 0x2a8e64281f59cb59, 0x52e701e42f3262ca, 0x19e0a27dece50580, 0xb5691c17a7bda6ac, 0x43484c311b9df1f2, 0xa68155549bae49ea, 0x43a2c5dda225fae5, 0xfa5e992aed700eef, 0x58911f5623918856, 0x648b81a1e48c4da9, 0x66e6e30cbdd0c3bd -, 0xf3ba209c169d266b, 0x20f7a86230447685, 0xd1bb5aaa1a0c3d2e, 0x366c29843d1111f1, 0x06c78b642dcc9013, 0x27484a64e109e3fb, 0x8f8eacbca4677464, 0x0b6cb31b1dc24cc1, 0xdf69c84f898f0fa0, 0x2dd426744920f2a2, 0xc0912a197d4c5c69, 0x489ade7f6a98d8d6 -, 0x458769f47f203e28, 0x124f4123fc05ac97, 0x3bb936f4ad6d7d67, 0x330954fed4f00ff8, 0xc2ce650046f90eaf, 0x7bf94762d4f9debd, 0x2e93172a586dfb83, 0x3c7a6062b4113d96, 0x5ddb0397147f0d93, 0x08e3596fc6839034, 0x374e67ff67639bfa, 0x19021c2119888232 -, 0x002f5d04fdd55efa, 0x05b4c6e079e1baa3, 0xe5678ea3ad74c84c, 0x1c42f7826a58a77d, 0xe054668bd2cafacd, 0x237668d3ede4261c, 0xedf46a6374aebb32, 0x31ec8c5931cf0ef4, 0x955c2e95c35b5825, 0x27d8b0ea68259603, 0xb7a8976e427d1ec0, 0x6b6cc5c07152bd13 -, 0x03d88f0ca0b244cd, 0x001cae9a8cfed897, 0xa844b3a1f693a7fd, 0x676c9acb7abdec96, 0x631b6bd5e0cdbd33, 0x29f289dc0cddd9b8, 0x0947d57536fb2eff, 0x1eb2ce650e3eb059, 0x2139b3a40e8bf405, 0x4165edfb39f4ae8d, 0xe061eda67a70d6a6, 0x2e3cc0328c9084f6 -, 0x1ef8329ed056063f, 0x6d4d01ce49e8b3d5, 0x0110c92f1656d34b, 0x6dad1c4e170829e0, 0x584c56c590b477be, 0x597e5f0ad525e935, 0x6008264d8eb7d36d, 0x3f586754999c829e, 0x3d7ea89df5546a1d, 0x41754f7d9a3f4364, 0x3b0796822ef879a7, 0x1ab2779598262872 -, 0xdc37c9f0bbef7923, 0x256ec818ec35a097, 0x4a72da5c09dd5846, 0x51df6c61edcad45c, 0xaef24fcdcf5ce819, 0x0ba6bb959ae689f1, 0xe667bd65a57b3a9e, 0x71ffd591a28a8e4a, 0x06c325fa53a7fadf, 0x6667f2986b2dcf13, 0x3ef751a6d52a09e4, 0x517a104240b8c74a -, 0xd08cddfd8c8183f5, 0x59237cc71b8147f1, 0xfff94fd188395933, 0x538acc592d10ef67, 0xac51ce386ff0eb1d, 0x69d42b8114c5fe65, 0xa17eda3995bfe8b9, 0x5dc6d98fdf05a341, 0xf2304d375ce8be78, 0x31b58521ecc483ca, 0x04d2d8140780222a, 0x3dc18b2be3ed95c9 -, 0xa48e1639f2d70d2b, 0x4ffd54a6bc0f38d0, 0x8ae3c65ba6b7143b, 0x482eb41f9178fa9d, 0x240b8b4e87ad4f1d, 0x6d8532420059eb40, 0xc135f77e44275132, 0x6261076a0daae349, 0x35316bdb3842765c, 0x246165ba3a8bfd92, 0x1c2d774bd5177a75, 0x045a2f991647e3b6 -, 0xed3b5923594671a8, 0x0514fada5acd4db5, 0xe8297fc358a0f50f, 0x7cd2badcf2952a91, 0x0da45130ea9ac266, 0x26a0d43c1e14c979, 0xbb62b729fe93a390, 0x360357aff7f67ccb, 0x3ad4835d1c7c59e8, 0x570daffd86fa470b, 0xd7c4be698fa3bd96, 0x17e4bdec2ad76ffc -, 0x43ce4ea9ead7dc51, 0x58ba7ae0d64a518e, 0xe014cc7e64680555, 0x03abc953ce2630b8, 0xa318620c7799be57, 
0x2b258fa2e84da952, 0xdd88fdc5063b2ffd, 0x17371dd79a3aa556, 0x927b837578981299, 0x554552101d90ab2d, 0xb45306218ce54bd0, 0x59109b65ffdb6235 -, 0x8663e0c4a180a515, 0x41467fe41c6604f4, 0xae2c1aa4dcb73878, 0x19d3cb02c6c07517, 0xaa147c97ea6745f1, 0x70dac71a31cac43c, 0xb9213ec26af87dfa, 0x67f228e9f60e7b25, 0xbfb59b8cf78df3df, 0x36687792a4256fa3, 0xe1be5c1f23177544, 0x786a9e1b644b1c90 -, 0x4172f47393ca7f5b, 0x62ae5bb4b8aaeb59, 0xbcd9c431fa631b6f, 0x1fbe20b2edc9cc6d, 0x5fdd829fbc0ee085, 0x241dd315adc5dd59, 0xb4b688d625f7dbb6, 0x595a82fee5bed2d4, 0x69653ae0cc11880d, 0x2b9e85fefc402f76, 0xbb2495b507770a81, 0x05d20c575fb34731 -, 0x9d9e623436485ab2, 0x27012a9665f3febb, 0x586cfef484c04ff7, 0x44a5860cc0eabfbe, 0x6fbfe6e2f3532e80, 0x05abeabaaf3220fe, 0x1bed21f2cb809678, 0x2aa62112b7eafed2, 0xe298837cf610190b, 0x1ec8fbbcef9158f8, 0x1efe9b3aa4f96f6b, 0x6a3b842a068b0ef3 -, 0x92dd4b7cd7f827f7, 0x605175bbf3fd1c97, 0x139bb6419c1f6d98, 0x3a3ab2e9978db310, 0xc5c95941c9d5dd0b, 0x34c6c76025b2bce0, 0x0d44115a49bb8126, 0x7622cbeb11daf619, 0x785bff93164ef5ad, 0x7191647d355cb45d, 0x117f255c4cce6e5c, 0x581b448b0e9aae3e -, 0x54a4f3cb36225414, 0x790180c539bc4685, 0x47064043b7c6b96f, 0x43cccf5b3a2c010b, 0x1dfbf3afc14c3731, 0x1c368f3195572574, 0x00bc2ed3b5070b5a, 0x0332d8dd63b37f60, 0x0744b1908c9bd8f0, 0x2d258e628dacb9ce, 0xbba5b4bdb9c61e14, 0x0bca12295a34e996 -, 0x059c84c66f2175d4, 0x1a3bed438790be78, 0xdf394f577dabb5b0, 0x304777e63b3c33e4, 0x59a29d4fe82c5a6a, 0x72e421d1e88e77a4, 0x69e6230313312959, 0x2da03aad8cf2bbb8, 0x2858d8608fecb0b6, 0x343099e7a40243a6, 0xba29b675d29a8f63, 0x3d2028a4f6f15886 -, 0xf068e2d286047d0a, 0x14999b5d6c770e20, 0xd1874a592385da79, 0x78aeb552c15a1cd9, 0x482dcccc23e9c06e, 0x7b18a19fb54b5745, 0x036c896efe9a7a06, 0x2f2c2ce0d1871c13, 0x3b2d9b9ed65492c7, 0x0649c7e50819d077, 0xcdab66ea7b65e3cb, 0x49b15b40c4aaf03f }; - - -// The table below consists of four mini-tables each generated using window width W = 8. -// Number of point entries = 4 * 2^6 = 256 points, where each point (x,y) is represented using coordinates (x+y,y-x,2*d*t). 
-// Table size = 256 * 3 * 256 = 24KB - -static const uint64_t DOUBLE_SCALAR_TABLE[3072] = { - 0xe18a34f3a703e631, 0x287460bf1d502b5f, 0xe02e62f7e4f90353, 0x0c3ba0378b86acde, 0x90bf0f98b0937edc, 0x740b7c7824f0c555, 0xb321239123a01366, 0x4ffcf5b93a9557a5, 0x297afccbabda42bb, 0x5948d137556c97c6, 0xa8189a393330684c, 0x0caf2b720a341f27 -, 0x892756b15bcf68c4, 0x5742f77c98a526ba, 0x340a5a1de9f89f9b, 0x14ef680aee75d0f7, 0x84e770e14043a41f, 0x0212c41116c33c95, 0x35b791e6de4dc0e2, 0x5949df08518d5d28, 0x6a0e120744ed10db, 0x5a5183ce844391d3, 0x6f618b158afdba50, 0x2ce2037e470e2088 -, 0x1f49fa149a64ba3c, 0x5f9876d519670451, 0x030105056f55586b, 0x020f1a557d8fd726, 0xdf4cb175b06d86c8, 0x694fbcbe7fe58390, 0x7933294a756a1b67, 0x09dbe9924b58f8ec, 0x590f4403cdf197b6, 0x1c07969fc87a0ba7, 0xc496477712252367, 0x5508976022f1b096 -, 0xefda361e452e1775, 0x7a0a0cccacc838fb, 0xb07e791c0be5dc5f, 0x24d9b6b418cbcb93, 0x497970f3c6117e03, 0x3986a158cb96d595, 0x8f80586ce692612b, 0x305cafda7e4df9d6, 0xc1a1c2e06452914a, 0x7ef989c0eb583079, 0x3a765b1f7364b099, 0x4fee236d58299c6b -, 0x6f81095f770e8419, 0x53bbd86b7396bc09, 0x2b72ba726b2b4210, 0x625dda1d2901c78b, 0x0ff5bc7b18cd2b3e, 0x0556598c7358d332, 0x0991245f20ff50d7, 0x0e7f58e5e919a97e, 0x5a0561373b758756, 0x6447bc93f87c198a, 0xf9230604c34c7520, 0x6b214425475c1bfa -, 0xe93de62d6a7f9497, 0x2129459d86f4493c, 0x456394c7c464cfe4, 0x612434fec3f4a1b3, 0x1ed91eddf44261f3, 0x0c6d3854f9e0a3ff, 0xd3fd153188a7e4e3, 0x24691fbdca16910c, 0xbe97465cd7625c9d, 0x2aa61cd373f759f4, 0x824d5763a326d62b, 0x1a0ae39e50da20ba -, 0x32d0c8481ee4c3b9, 0x6c3687109cdd18c6, 0xe52717142fbf95da, 0x67bfa41fb52ce9c6, 0x4e24d6a088a01474, 0x49a6ca0ae3fb6626, 0xd67f8faa9103191e, 0x674888f5aa6d3062, 0x4ba73824c2e85a99, 0x406b2fd18d35b314, 0xa7087b1bea728ac1, 0x11d2f222317b160e -, 0xf8946e007e23a469, 0x22a196fabbce31a2, 0x5309ee1bdc1216ba, 0x240fe9953827a324, 0xf9fcb89b63aeb5c7, 0x603b8149ed16b1b0, 0xb1f1876c02cf61fb, 0x4a5e32af612f948b, 0xfc491aede69a8813, 0x1ad9379136e53aa5, 0x5da50db1d5e6c123, 0x2f4014f7fe2c12ca -, 0xe4f6791d7685c3f5, 0x4c218521c3745a9b, 0x0c0521af98555f97, 0x1462a12953cada7b, 0x0bb2ab63d6452c1b, 0x5783c531ec98bb87, 0x737def53605dbc9c, 0x49f982b930e86719, 0x75b16790cb5211e3, 0x45ad6574cdbae99e, 0x1062b72dfeec9851, 0x45029a09cc468c88 -, 0x532240de77f3a1f2, 0x17bd291eaa9ad0ea, 0xe0a2d7efc2f8a0a0, 0x3a7412052021778e, 0xb0dfb0976acc90df, 0x7fd603b689a7b1f3, 0x1152579ccb00d6c6, 0x6340743b631849a3, 0xebaa47290e0cda01, 0x143265a6d53fef0b, 0x45325d6fd981e75a, 0x0e9780cc39586f2a -, 0xa4f68d207a8628dd, 0x50d230b51893e841, 0xf3bd769a4bb504b6, 0x55975c063969292e, 0x07727ba25fb8756f, 0x07ff86cf8ed731fd, 0xef57fa40cc35a1f0, 0x70753a70874218fc, 0x615954e2342b973c, 0x5aa9d68f1a59df86, 0x3b8e9e9ff5e44468, 0x2e749114d60a3d23 -, 0x14a1b91ec176db4b, 0x55f91a63d69aae6d, 0xf42382327b1b6d27, 0x2acf1f475facaafd, 0xfd9069b479b58968, 0x3baaf4e5c4a45f77, 0xa2ac9ab98a7aaab6, 0x5466cb5018f50981, 0x3e6ba27771ba3205, 0x31ea90cdea1bbbe4, 0x0000416b5c557393, 0x464cb0415a510d7d -, 0xd02087d206ff2bbf, 0x2b9c8ecd7fabe736, 0xb2b56d3842caab0d, 0x046ea0b7767700a7, 0x113a7a889e317310, 0x5992a354bef7d0ca, 0x3edda94ed50388bd, 0x052661f767839154, 0x4c28edf6e19e28e0, 0x1d19c2f2d2f644e5, 0x5d732148db35ab3d, 0x680c4714b83580f5 -, 0xa374f282bb80ccec, 0x789e609bc77ae11c, 0x10d2577d599b45f2, 0x1c548b5b857721b1, 0x7baea726b4543fdf, 0x3c1562912d1b4ed2, 0xd6362203b7e82082, 0x1414e523d3c7a900, 0x7ca349951c1d23a9, 0x4da4265e3ce80fb4, 0x7981ebbcaca9ef36, 0x4ebac9e5b5bf980b -, 0xabd2c1dcf49cb5a4, 0x3f54acfc25c6340f, 0x202eeffabbd11cbd, 
0x67216b7cb3695e8c, 0xff7cbcf9b23fc9f1, 0x2eebebdff7fa7afb, 0x71156befa111f85e, 0x1b8fd98df522902c, 0x6b28ebad62519791, 0x6cf0ea960e01d8ed, 0xb4617bc2006967d5, 0x323da065cb3df0ad -, 0x31687d0741e24d9c, 0x02db8f2b509a7cc2, 0x9243f85924320527, 0x68c360f01d6e6d2b, 0x2351c5e877d5306a, 0x6f56ccfc85c5f3a9, 0x1b09652837c4928f, 0x0b3337554c83f971, 0xe2931be2ccc783ec, 0x46829694ba08c64f, 0x9f35e36358e2c6ac, 0x1474b333b000d170 -, 0x24d792756fc96640, 0x618fda9fef868c5e, 0xb7ff5b125afd9375, 0x778dd97e0440c258, 0xfbff314886219627, 0x3417e1e1e2a7e811, 0x21e959a88f7b7bdc, 0x3508c2eb8c3c8672, 0x827ecdde111c430f, 0x21bcb19fb07aa134, 0xe0c1fa50ab2f5746, 0x401e680b4e6658fa -, 0x2cc24bab313693cc, 0x20541c12b964447a, 0x374975b6fb81c3cc, 0x52905efb344e17f7, 0x79c5c9b56d8b5f9e, 0x3390bf75d2b9a3ec, 0x7ef3807d895bf4e4, 0x2814165a42046b51, 0x7f8cfd09326fe158, 0x3232fb4f4c9762ec, 0x5678d6dacc194d25, 0x6f7caffb0a7545e8 -, 0xbd981637b23e7963, 0x691d7b7cb88a0ef5, 0x10ba319ae2062914, 0x06fb144f8295a85b, 0x80e620976bf62f8f, 0x2a425971ec73d6b4, 0x800aa9e741d10b1c, 0x230d7d8bd1a0469b, 0x65aace37428dfe8c, 0x0fcab5297f58b667, 0xcf0e9526943af7b8, 0x7d90915b75d4dae7 -, 0x7455a46156259d6b, 0x29bcc06374cce1b5, 0xf2fb0ed3aa87aefd, 0x211a06af0e54dd58, 0x6c0c95c5723de9bc, 0x6299b6ed25008ca7, 0x7fd63e784d4dfb18, 0x2cc93b4d9bc1db30, 0xebc7e2d44c5d13ea, 0x3278e18d4d3d11a0, 0x349e3dd25a215f79, 0x7eb2a7150b30416d -, 0x05f3d7d5f6a094cb, 0x2a3771d48e331405, 0x08ef39e9dc96f009, 0x012248373a364992, 0xf758f92fc9fd4d33, 0x2339d8c6dfd3ca6c, 0x8b000965962673b4, 0x746ff43eb99d9054, 0x47ecdc054a422eff, 0x33d8f7c8267b7f0c, 0x22fe00ac921a42ae, 0x31e57f3d31fcd8e6 -, 0xbb912315a1c50869, 0x4ac8cdb0fa7ebbaf, 0x0541d74a60973edf, 0x7234900334b2c5d7, 0xf2e545f730adfa33, 0x224e44e63db5ac96, 0xfcba3d005c6fdeb9, 0x2c93a4e6559936b5, 0x7727a0d7ad88d758, 0x2e33100216719cdd, 0x7b2ef89aeb2c0254, 0x1f6de5b74758afb4 -, 0x6ae89047114fb321, 0x3d605e9a6ec6d80d, 0x18e915c727a874d8, 0x699088b5e9d0912f, 0xaf9344618e056f10, 0x1b9169df8245e0b3, 0x5eb8c33d70f4c891, 0x1609ddfb222b13c3, 0x8131c885d1b366ed, 0x7bc3cf9d9cb1a7b0, 0xd297478d2fc93968, 0x13cbb4573a4ea7f5 -, 0xdd37b5cc64d5986b, 0x7ed3d1d7d81ab5dc, 0xac53485f23973c9e, 0x0705675d333b91d7, 0xade5d213c43186c1, 0x6a8bdf57b4bfdf14, 0xa87f88a1de717963, 0x17f29220b519bce2, 0x7af2d7fb0f95c610, 0x28d1d3923b144a7c, 0x8e73c3d8972813e1, 0x00100b40c62e72c1 -, 0x84de7a81fa1f50da, 0x4fa391d6589d8244, 0xbcc3596f0834b285, 0x4d4acbd60a24e9ce, 0x97fa98b8c1835a0d, 0x33abcf8e29901d0b, 0x60a73d1975b3d082, 0x60666aa4325b948d, 0xad54adb769284a39, 0x227a98d113609b28, 0x4a1e1ffcae6a3872, 0x1e4ee44bd67f818c -, 0x5a74c6bb4387d315, 0x019428c0b1b18795, 0x5cc153e270bbb055, 0x2b3cabdf00dc4a61, 0x834110c026924b57, 0x2d30e985f2d9f217, 0x47116979333389f5, 0x53e3fd6a18202417, 0xb1393cd79c2e5864, 0x58d92935e4112e82, 0x86989a7ec8305b6d, 0x42a8fe4eee28f37a -, 0x74e212ef01591901, 0x3277917a0397b1b9, 0x7bbcbe6e3d687544, 0x0b8957701d09afb6, 0x6cfbc8ee74503668, 0x48a9925ada9f8348, 0x57045753ba2d0f4e, 0x7d69ca3866223d66, 0xc7054ce22917271f, 0x41bce1e1133b51de, 0x3a3ae42df81ec35e, 0x7eaada0f42d47cc3 -, 0x13b138f1048a57cc, 0x64f98abd7e915a8f, 0x7af195eb16a0c732, 0x11be81a791d634d2, 0x97d8df47430f61b8, 0x0767c7b381271004, 0x3e949136fb940aa6, 0x3bdee340cd956dba, 0xb250ec4ff91d2602, 0x4cde2454d47f59db, 0xaf5e749530d978cb, 0x5a8e2f2119d4d835 -, 0xdf1cb5425a0744df, 0x3d3b08a7bf35d055, 0xc6335e832de4719c, 0x6eb8d97e09154d42, 0x2f6a3f8de3d20dd9, 0x13f23cfd276233da, 0xb4a6b80dfc0fa41c, 0x58d876403acfd7d7, 0x2ad422078b8e139b, 0x73dbee2abbaf494d, 0x09a2758891eca3c8, 
0x6ef9a9f1178b0938 -, 0xfc7e9ecb90c637da, 0x3a04345fc10b1a7c, 0xc024e9cb62f9ff1f, 0x6c4f9c3aa4aa33d8, 0x049d6995b95ac1f0, 0x2243845195763a1b, 0xa1466a31700ac276, 0x600fb7123a325905, 0x9d391a64a0d35a24, 0x3b093b550641f108, 0x2275de5bfd2e221f, 0x25f5e7465963db1e -, 0x3e220107f7e7fb84, 0x6f06a23bc1b85a8e, 0xb4198d19f6eb0e48, 0x5dc11761dad45fda, 0xba303e492ab52a0d, 0x127c69c73da9f528, 0xd3a5b70cf6c790be, 0x0d72b0c50819da5c, 0x193f90d62ec2cdf7, 0x67f7d0cfc4f46daf, 0x7aec083d52f380ea, 0x7c0a1dda4a28bf4d -, 0x46fd20fe6008cba7, 0x7a588c914115d595, 0x8fb1d3daecf45f78, 0x0851dac094e7b036, 0xcae0a76e2a32a892, 0x104f861322dddb2f, 0xb79d81e46e1f9006, 0x1e4d28d7a2498912, 0xaf3175d3974b89bf, 0x613d00f9a69c55c2, 0x23f6883e8e65226f, 0x072f7ed65c6def05 -, 0x6690e643bb38e243, 0x1a81c4a7c9189b15, 0x1056d1669e4749ae, 0x0137f2a7418f190c, 0xed3192796e699d16, 0x3ed76db45c38a37c, 0x78e86d1475a88243, 0x45985aacc495b16e, 0x47d5c8208e8f1030, 0x6dbe5f68b4d0e782, 0x08d3d0182cf7f26b, 0x64c375ce172fadbd -, 0xba0f6db3a20c2875, 0x57e1d90a53241250, 0x0315433fddf8e63e, 0x33344750e37dad9b, 0x62cc0d28ae69b016, 0x435fe80f6100d547, 0x5874aea8669d3df5, 0x3b96913f8264d4a9, 0x738067d6bb1314b0, 0x48cccf24cc6f4ccf, 0x6f5e2bbd68b777af, 0x34c2c37ba9635d66 -, 0xd731534900fdbe5b, 0x4e4f9d97afe11d43, 0x81b41214351b73d7, 0x1d48d100ad11a5ae, 0x2a4ee76628e2b151, 0x34902e901877efb8, 0xb5a8561a0fd45394, 0x44317af6d5cd5ac0, 0x354c2469e9068bad, 0x0771fe2761cad022, 0xfda76ee8212d0f2b, 0x76cdeec6d4435495 -, 0x55c98575b3e825fd, 0x2983325ed5d73a1b, 0x563c4c4fb3f466e7, 0x731b0fa413338bb0, 0xdeb519ca57a05240, 0x7a7e909b5c4f7351, 0xefb7c153dd2ab28e, 0x11ca1c865dee30b3, 0x013ca8348d9d7de1, 0x575e0bdaeee8cf9a, 0x464c98a21083af7f, 0x683ddcd85c212ee3 -, 0x1171f0ab4cd02019, 0x22c7e01c7f4d64c8, 0x972ec0ef3f2e2ed3, 0x623f83c2611a476c, 0x99b3f16be9aa25a1, 0x2d3ebc5468990e0b, 0x5d5fba8546a4d5f2, 0x4716e6919d2986e3, 0x3ab2f2bc183f5d6c, 0x5f6257d3910cd4be, 0x341c6f2a78f94f2b, 0x6ee8390b8a5064f5 -, 0x9d8640b9b83ca8e7, 0x033c5ad24466be3d, 0x6f6cd68db30dfd59, 0x52aa6b1c0f90f3f6, 0xfe7bcd4c97403646, 0x11ab3fc960b05fb0, 0x24584b77575896da, 0x427f8deb932da137, 0x928a28cb505306f0, 0x04ae916fe863820e, 0xaabaa98911b9cd3f, 0x59e588ba994d9145 -, 0x9b8f1afabeee9e9f, 0x04ffc7ef3476ff8e, 0xe9cf53ce9937b146, 0x73fe42a801524448, 0x224bda3cf3bbaaad, 0x5fa85056d59884a4, 0x8e6eead48345726b, 0x09230936d41736d2, 0xe679eb58d1ad6be7, 0x08bb759b530b1eaf, 0x9688eb527860e24b, 0x13704d2daf9af278 -, 0xd9273ac71b906f14, 0x57ee05fbbd40deb5, 0xb7788e19ba9e61eb, 0x7967b6dc1c5d9699, 0x36e043fc230127c0, 0x2a716598bb2d519c, 0xc017b2840d4d1b07, 0x1d3bfa489f756a3f, 0x4ad73abf24318d36, 0x1915e6f53e12625d, 0xb219a7c941f89084, 0x2280087a8f4762fc -, 0x8eb280345fd1b4e7, 0x55b8d4ee5772fd79, 0xc9e63a787e2ce2e1, 0x685741adbda93885, 0xffb830ab11a3b491, 0x7e891121f9356428, 0xc03aea271a629078, 0x71c45932930a2639, 0xe7df192a6bf81795, 0x704aee8f183aadf1, 0x06ddb55a8a7a63d7, 0x52556d8763f3033c -, 0xb76b458c6f0c33a7, 0x28666b87c362b95a, 0x365ae575a4c27b9b, 0x36ef35110562adfd, 0x89955dd8d927f9c7, 0x526e787d6a586c9e, 0x762e0bc4eff988c1, 0x6c9523b4b5ae4946, 0xe90a909688cfe95f, 0x658a7dc8b3ffada3, 0xbee148ba7a58520f, 0x6819007d8573d1cf -, 0x75d3b5ec141be9c5, 0x4bc236ae634f3c27, 0x1192fa9b8b30e894, 0x4129d43e1d092cbf, 0xfcac068558bbea45, 0x513e8d87b8116534, 0x5377a179a155ecd4, 0x6c93531e5545572f, 0x727df81ba09aad91, 0x07527139dbc96250, 0x150320b1d8ba172a, 0x2281e85f60a1809b -, 0x7164b7d524eba6af, 0x50d387163fea4ca8, 0xe90de17d62aebe78, 0x6ab369ba28c0410d, 0x17d07e315a95d138, 0x58b496352453fefd, 0xb87a04dbbc101b92, 
0x40a8f0fb757e9b0e, 0x2148b48a696e64d1, 0x4e004a3a350c17d7, 0x17927e9f386b563e, 0x29da9cd441e3e3c5 -, 0x883d2dc357417213, 0x2e94653ff7862644, 0x53a37af548453df1, 0x04475db3c300b93b, 0x2d65fa4d815e7204, 0x231a2db74c2c3ccd, 0x1fd734c0cf4d97cd, 0x32d255c105f6d122, 0xbb74fd9201eb07b0, 0x12e33f1c81ac6f60, 0xfb9a6439bea97072, 0x52e14b7db9cdcbc1 -, 0x637ac1a91ae374cb, 0x1c8622c35adc8224, 0xeb786c50a64b7d33, 0x362823a7232a5893, 0xf22dafca688d472a, 0x18598f0e0237f7c4, 0x97b8497bfff4bcf1, 0x7abf4cb27a9c5b7f, 0xea47c44e3b3d95d3, 0x58728fe3e1827a43, 0x7fd3681a6df902c8, 0x6db1dbbdc413de79 -, 0xbc4effed1ac3007f, 0x7f31a54744887cab, 0xe6559b4f8bd2519a, 0x18a78ec5b0c241db, 0xf6e10285b15d2030, 0x5c1323ea219a8ff4, 0x134b6f20dd116b47, 0x5d0abddbc8998733, 0xa3c993938702e151, 0x0ab6aeb494f6ad5d, 0x8cf3b4beda1815e6, 0x546ce323008c2fdc -, 0xa10eb5a6a78dbe39, 0x26d2e8a8b8457da4, 0x026ccbe31517d806, 0x2a35174b812f562c, 0x57d70499dd7a374d, 0x3368f951acd3c5e5, 0x490b2515f901062c, 0x316109e7c315c377, 0x32e20eba569535cf, 0x496a8c39d667d709, 0x5578096dc44d5e0f, 0x608a162ce73903b0 -, 0x6b2e65852cb37cab, 0x75b09a2e6ed609a9, 0x7ac84b3082602455, 0x7690cbb594e84b94, 0xfc85dad9511973fb, 0x738a74b08c9006d0, 0x83233fc939d5883e, 0x7fbfc08b5db3c9f4, 0x81a0e493fb5f7749, 0x2c255ef7e69a77c1, 0x234f02e609cc656f, 0x5960cf0b961f3cec -, 0xac72940237b1f17a, 0x434e038a29d446ac, 0xca6a090e00d8b0c6, 0x1f1aad24001e473e, 0x6d64b6dc133399fe, 0x0899ba41e9dd4607, 0xca590b3f25bbf5df, 0x57217978b0d8ce11, 0xd6b4cb13da6de9ac, 0x3c88520cf564f75d, 0x649fbd5075a7757f, 0x3f2593b90fe72161 -, 0xe1bee53e91dcc9a8, 0x010069dce4c74a92, 0xef83968978aa855c, 0x6cd8848183b53d73, 0x0b3df59610e403eb, 0x713225d446180a7f, 0xcc23112cc59850e2, 0x105796b670a3730c, 0xa147f4ec7a2fa4cf, 0x32da1f072d75b253, 0x4e7007455e85f560, 0x76a5376a771fdd60 -, 0x47eb4fabdcc699f7, 0x4e45db6334c6ed96, 0x36066f2bab72546f, 0x04f48065593ecdec, 0x3fec02793fbb5601, 0x122f74626b64a526, 0x21d0f66ff83b4dbd, 0x1370610ede647f1c, 0x57b82242b88172c9, 0x527dcbadfdc65ade, 0x5e9c9a04385c93f5, 0x64d1cf9e52548a6c -, 0xba0073337865c994, 0x633ee14e50bcd615, 0xf840228ec4251095, 0x49bb96812a98f08d, 0x82f57d0422f96678, 0x06d7e43bffe7e0e1, 0x33910cca752ae863, 0x04d46e7c66087e38, 0xf14935c4167017c3, 0x3f22e2f44d03c9ac, 0xa6196244f2cd6164, 0x15a2b4ce514fa4db -, 0x5191a04c4abbd0c4, 0x0e763360ecc8a19d, 0xfef583c184a673c0, 0x75c2f30a7c7433e7, 0xe947a55547c7c099, 0x245c7ae44f6e7a83, 0x67a666f9e6bec2d4, 0x5de0b922fa645ac8, 0xdd9b3e4a5cb72e22, 0x0139c2c857adba8e, 0xa7feb68e863ac231, 0x501381ef88ec2da0 -, 0xb2b8c6a470f40b01, 0x051d65bdb8363062, 0x4ce90414a6d65714, 0x1e510b525d19df0c, 0x569e723f5d374cf6, 0x4bfe02fd38fde1f0, 0xae7459ebc50f9aa2, 0x0f7e2cb170dfde32, 0x3c3da2326a7407cb, 0x0cfc50a85ffd1842, 0x62ab34c85e85c3c8, 0x22b4d9644bb37333 -, 0x57d313b3d87c2d98, 0x4f432c1cba49133f, 0x6163d11fa4befc0c, 0x1ab94e122fddf12e, 0xfb7c9358aefc85a8, 0x5b20068f81d949b1, 0xcf8ed6ff2145c810, 0x5794afc021932d00, 0x5c8987ad9b6e35d5, 0x6bb1f4b836fda03e, 0x794f1fed4a3ea1d7, 0x0cf6d128deb0e7bf -, 0x54ec3e1c65878cf5, 0x002811763ba2200e, 0x382d917051e77b71, 0x49e00cbd013a9e7f, 0xccf576e9a4cf019c, 0x4b4a66287970333a, 0xf772168915edfc1f, 0x278eb5eca6479685, 0x8a95c8b9cf41cf06, 0x6e58c9c7826d39db, 0x478e119889f2fe75, 0x73ecd21991bd98d4 -, 0x26e751fe9fbb9502, 0x29825b71b0632e95, 0x21668f96ef8bb5c5, 0x2f2a899e53c9a004, 0x2803292ed4345ce8, 0x72731055c7c65dec, 0x3aaaca9c4b6fe9a5, 0x6228d3ceda8bd671, 0x773e2c5effc48eaf, 0x017ab19e0fea9ac9, 0x9609e10496c8d766, 0x121e89f9b302c30f -, 0x4e87d00a0be96480, 0x09bd8d170ba9dbab, 0xc6756f947ecd4e52, 
0x2c9e40bbbccd0f5b, 0x42a5b77669fd812e, 0x66aba9583b080d9e, 0xee55df99d16e77c1, 0x4cc00c5c5eff2509, 0x8c84d5e20ab7c16b, 0x00ae5c96184ffefb, 0xb295e90346dcef54, 0x5d1bda0a39dc3b72 -, 0x75f92d72a89b5ef2, 0x259d998c9ff9ac0e, 0x8a1cfb72a6c433c1, 0x23f5b71d49d67604, 0x478d8f30914f62ef, 0x08fe61135218eca9, 0x4da2ce9bc6488c4a, 0x15f1eafd35283e2e, 0xc2d2be3ebc42ea0f, 0x2a5216539d6ee902, 0xa1e99052e7bdeeb2, 0x3a8f2631ec78290c -, 0xb71518a82ebfbfe4, 0x24700671c46ebddc, 0x6ef52d591a221f75, 0x4794614db6a67d92, 0x761f5c8ee4bab607, 0x31d9dd8f2361b5d5, 0x1a45593be8db3b29, 0x7f06c365eb116260, 0x9d305a66e52eb65b, 0x5edcfcb5613eac18, 0xef34fd28154adb75, 0x790f805753b9d742 -, 0x6ecd5ac255dfb797, 0x0cbe14db5d9a88db, 0xc1c86c5efa815528, 0x2c636133ba59d887, 0xc75d42c2d9f52297, 0x4bd3540c21e2ebd3, 0x32e7cdf790de6903, 0x1aae3c9837d3e30a, 0xeed028e49d436f09, 0x779ae12351efed1c, 0x6e0145587d9797a5, 0x25156e4cee9a407b -, 0xac2fd82f2ac57119, 0x7f8c026f1d182ed2, 0xeacc0d8fb3241611, 0x5968db65d2d7545a, 0x7d525846b1121dbe, 0x57949fd7b80339cf, 0x471fe9bec9b66c01, 0x5c270057f1268efa, 0xce092463083f656e, 0x16e8241cdc862cf9, 0xb7cb2bbcaa06b312, 0x3c25936bd8863416 -, 0x19b8ca966c4a3827, 0x1ae43badfd21e63e, 0x1dfd002b95a6ac6a, 0x4708e27f6d98e997, 0xb5fd6322dc31ac7d, 0x53baf4d9a16dd550, 0x025aa2ea5463960c, 0x5b5b33c7a3cfa54f, 0xdba287866ee96b90, 0x4748c1f3f3a6dc4f, 0x2333ec05a80c154b, 0x4a47745d5b99fb96 -, 0x44955b062a6ecded, 0x7791feea9015f170, 0x736bf603d12fc35a, 0x2632adbca5388026, 0x956e4c48e1697c4f, 0x4ee9adfe8600e32d, 0xa584042a0da56406, 0x34a3d7f4bf457353, 0x8d4fd4fe00176fab, 0x15321ee855941f4e, 0x670701ef81f340a4, 0x0c7d7c618aed0ba8 -, 0x73283131d9bfd9d6, 0x34935a39e31bac65, 0x466cfbbcaae8b991, 0x250dd54e18478ac6, 0x659e46c51e40de4f, 0x618ea014fec50e04, 0xfe64d883080b877c, 0x572cabbb6688c4f7, 0xa2c817493a834146, 0x06cd734876378120, 0xe3de0b717336a849, 0x36942f5191db53c4 -, 0xa3f9adf66abf4d88, 0x2a9a144b8087fa96, 0xfe49fefcb78a5b4f, 0x1be40a8616928bab, 0x07a901975521f7aa, 0x1fc66ea683693510, 0x4dbf0084ba42380e, 0x1f374495b918c737, 0xb8346956a380a00a, 0x1346f4766fcdaa07, 0xb4db5689d46312c1, 0x775e7f3274dc1316 -, 0x07898828f32341c0, 0x144390a33b3e86df, 0x70bc604ce1e9c5e4, 0x127652de00220873, 0x2874bc669df50d45, 0x236f4585150161f4, 0x3bfa4ffd318214e2, 0x7cc92a6165059745, 0x2fae0e92090ef72a, 0x26676bd59c4fcc3b, 0x220c030974d1d447, 0x66455887e98686e7 -, 0x4164b8e4d8760ddc, 0x5517a86f840feb63, 0xd9b42c6c9371cade, 0x3a7f03ceecc160b9, 0xdd4086d64cae366c, 0x1b6290c327842533, 0x144efcd2a7a0e82b, 0x16621925ca10d31e, 0xa9dcd13118e208f1, 0x5a90f97edcb1c54e, 0x80c47331c8749d99, 0x6f061a3569a80b55 -, 0x0f6abf619e2a15c5, 0x29106c98122245f4, 0x5860b10985c9b47f, 0x4f379a379e15f410, 0x2dd6f45df68e1678, 0x2c475167ad9b283c, 0x23b7aa00952a6a3a, 0x5532bc26a40c5365, 0xa5c0a8be3596ce22, 0x4fa3127a9aefa56f, 0x944e843aa973e67f, 0x3c7727d45ae87854 -, 0x48fa2ce675117ea4, 0x7bca8e04ad3bbb9c, 0xd57439e4726f88e5, 0x3337d3a6a03b2286, 0xb0b6172902005953, 0x514bd76734e6c0a1, 0xf97f8934eed7c6b4, 0x0abe13cee7f1b75e, 0x6c88107a120e54a7, 0x634f966d7a6e11df, 0x5044c53109b94097, 0x68d49fc65522b73a -, 0x69e295cd8c444666, 0x542c4c5fd999a224, 0x13ff89418b5da76f, 0x7133fa786a87ecb4, 0x2f180926456402b4, 0x52ddada7931c4dcc, 0x6eaf0d2130c71590, 0x014ec2a2ec231826, 0xac05b61443b34dd6, 0x157acbfab118b219, 0xe4e2f4b84ad01099, 0x0abf4a4da29a0eb8 -, 0x5f852b85b59eab1f, 0x1bd259c4726869ed, 0xce565d9287790a15, 0x17a48442bcf58a00, 0x01e519522381363b, 0x2336d07a710da07a, 0xcfebf2fbdc714cb2, 0x2f7a51474c23b8a9, 0x77db2a07d4e3716c, 0x40e8d8d2d0a09806, 0x644363ce6d401ae4, 
0x53f9cae0470172fd -, 0x58d96ecd8ddadc53, 0x15028204f3d6d696, 0x6f40a09214439ce2, 0x738c5371236c3e56, 0x64f87ee7a28bf9fc, 0x4f1899449a810fee, 0xd0aa95f4bf21e376, 0x6170cc24283856bc, 0x9dfc4927d764ff75, 0x227ea1563fa2e012, 0xaddd3665622ce087, 0x473d3bea07a5285e -, 0xc0b986ee0d2b0eb2, 0x78e584c740dd18ed, 0xd5adbf30a04fd508, 0x1c6aed5ab59bedbb, 0x25d05fccbddb5ba1, 0x4a58fb6b3f896319, 0xdb2f6343fd8144fa, 0x46a445de6d5b07e5, 0xf67a06684fe9e1da, 0x57b2515923b15c9f, 0x50439940820a2a0c, 0x62f4b9b26f04dab5 -, 0xe79ea601d01b033d, 0x009bc6176f10fffb, 0x333bff2f907ed39a, 0x253d0a9e626dd400, 0x7a9bbedcfcbef06a, 0x2d1b6a7a5b39342d, 0xbadfb462a124cc9a, 0x2e8cde9d82c15cb0, 0x7c3f81bcd6f1b2a1, 0x04cb0b8fa4075294, 0xfa36d3db38cbd304, 0x59fef93442883553 -, 0x91982a741cb9342e, 0x7b9d63ac17b01982, 0x530b4ec25a293ece, 0x611069ad9fa0f0a4, 0x7a262a59b656a79d, 0x6fe6f8f4d6d015b0, 0x2c2fd7641a5d4e50, 0x24b0c507058c911c, 0x834882e492fe45ae, 0x68d0b01b13432761, 0x0eacaaaf94178b8c, 0x123e3a93006d7d01 -, 0xecf2fe69377ff33c, 0x4fc960ab4408584b, 0x2adc445b1ee45654, 0x4989681cd1d09a93, 0x79509599afe9e3b6, 0x7f6ffbbeee861c15, 0x2ed2859fd6391b25, 0x5e8bd52289b6ad27, 0xc949280adbce7c79, 0x510999e865f0cd54, 0x7f957314ce7d373b, 0x4b2c0ea4bab08ef2 -, 0x2d7cc08b5c05a8db, 0x4609a0ea23507697, 0xe204ba35182c55b8, 0x5e4d5903fdef61e6, 0xfe63842f2826598b, 0x782a3fd3ab62a179, 0xd2f01a1979e5a0f3, 0x0fb4c6bdd637fba2, 0xfbff4c192020c350, 0x14859008c3d223c0, 0x65ed7a889c1a2e55, 0x1d78daf483fa12cb -, 0x5b54d11b01bc09ca, 0x54fde75737306515, 0x89725231105b63a7, 0x712d1f394adcda99, 0xb554006ee9abefab, 0x04dd8f7bbd4c5381, 0x98d22b3a31995549, 0x637a53de6b57122f, 0x8367d69b4c92da63, 0x236f2a9514250df6, 0xb265509af63d7b7c, 0x08522e36bc4b65f8 -, 0xabae725012ce8301, 0x493b257197a98ce9, 0x33185838570e5f0a, 0x65f5477ac414eb6c, 0xd002a36854699753, 0x2be693b4d96efdb3, 0x3b32484119bdc53d, 0x55691ac09a8fae1e, 0x0249e394514c047f, 0x765674c90b78171f, 0x1166f64638d6ab37, 0x746adba4cb52d18f -, 0x93e293653dda6cda, 0x5d004ed52ebf0b68, 0x65c7c42d0ad96cc2, 0x3350dbe11cafca74, 0xc638cfa8942fef67, 0x0ff2dfffc5ac1164, 0x9e1b625e649aa471, 0x13a219d03d2eb86d, 0xdb92859ebaf9f7f9, 0x645c50918f7d5abc, 0x25c10cfe99f7e5c6, 0x13d858b53f90170d -, 0xddb258b13ab1e7a6, 0x4849ff49f4e13fc4, 0x9ef87fa85511cda8, 0x48c50d4d3b4d2f7a, 0x6c98422c8007c9ac, 0x3fdd72e65a3d3491, 0x56b18cb165b4ec3b, 0x6e2c6df9e3fc3daa, 0xf6db5aa98ddc97a4, 0x423fd4082f3fb795, 0x42f8f5edf424d0a0, 0x1a091c2696139936 -, 0x3161c2bbb3b2d58a, 0x2e8d339eb0fb9099, 0x45ef7d11f6fab685, 0x7f222a068db3da4b, 0x9af96f9742549a7c, 0x55370df31dcec81c, 0xde98e81b131af02e, 0x58bd0622a474acee, 0x8ab40fa7ca882e0d, 0x5b4db195655f2410, 0x4754eb479ada77fd, 0x67a8a437d6fc8a7d -, 0x9888254a4f0c9d58, 0x3232ba83bed0c618, 0x587b0de0207b57d9, 0x020df6becb096aa7, 0xef9e41052a29a8ab, 0x4ae671ee70a15a69, 0x167ce954923ee086, 0x6878c3996c1de887, 0xb29c711490ac097e, 0x1cf41a9c2577d144, 0x0590796ba46d8d29, 0x1c2e6dc8d4aebb65 -, 0xbfb904f8ac9b4cb9, 0x4ea1742c786469e7, 0x5a422f48401be57d, 0x0be0afdc77d6d32f, 0x5e8765cba2c738d3, 0x7dad0475059a089d, 0x9288ae0c40df7df6, 0x51c65f97715a16d5, 0xa9615d4c786ff9d4, 0x507ffe03ec0189ef, 0x1c1f46684604e41f, 0x282fe9d567db0efc -, 0xebee7f8381fb8178, 0x5bd4b6045c208d57, 0xf35694743439ed71, 0x7cddd5a373ebc5ec, 0xa58df33cc68e3b5f, 0x40e6714f5c5c8df3, 0xea881d4bfd489131, 0x6b36400b491c28c1, 0xd4475cf594b6303b, 0x5b630cddc72e654a, 0xa0b587ad34394ce3, 0x3ea3ba6014f86275 -, 0xc3deac125d20eeee, 0x2ef3568410a2b3bb, 0xee6ba3fac5d7ec00, 0x5fabcb3337aaa23c, 0x6b1212e7b817889a, 0x0b37d285a9be51d1, 0x617ca543d762bf51, 
0x0896b4ca694b01d0, 0xe3add9718277a1fb, 0x553dee7dd4784865, 0x904b8f7e936cf430, 0x5b6a78f20b244b90 -, 0xa2b876c2914b9bfa, 0x704de952e9d969f4, 0xb04ea1b54b7e7654, 0x5d307bb3949cf660, 0xcee4c23ebd049d17, 0x7a88293bb1031063, 0x00b8432b8286f656, 0x260a9c86a16216e5, 0xd140e6e6629d8686, 0x296011ff5601a000, 0x536f0f76cd9b2928, 0x267409c23a823dd4 -, 0x0f041043797f8423, 0x3da6102605962ca9, 0x2e69dfeea02098ea, 0x427e7eeeecd3a0c5, 0x75efa5e8a590793d, 0x1f5841df6dfdfc91, 0x1aa1e1b8b9f3c326, 0x07bd5b0983fcee91, 0xd169420be9c48939, 0x7940334f0bb9023d, 0x9bb330fff113764f, 0x674ff1b0cfe246c7 -, 0xe2083f8d7129cbab, 0x7e6223e3d9c04904, 0x9be411a7d5e883a3, 0x72642664e7c25590, 0xbb1f783b5c412322, 0x46716e8fd737280b, 0xfa363eeaeffde271, 0x6c256c131fc2c3b9, 0x13259abfcb2ce1d8, 0x53b96556e96aa708, 0xfaa7c8d25119da19, 0x05019f438e9f8995 -, 0x05e1d55a9424f1ee, 0x63e8e14e6c2f3f09, 0xe9d844e997a10158, 0x51904ed1e94a0ca5, 0xb09462d4df6bc6cc, 0x2ee5308e62172691, 0x3f8438484547187a, 0x62b92b8d9739ddd4, 0x3ca54ab5d39f083c, 0x25b3336048a288d4, 0x7cab0fd67e296979, 0x58ba2e783962cbb7 -, 0x77808f1a1b8f3515, 0x290c219ee7153bdd, 0x7584441f79128f01, 0x0442db406f5135e3, 0xe741de52ec030a9d, 0x37469756586776b2, 0xbd64c2a7173adde0, 0x2280b66d20888d0c, 0xdd1b53cb4adb0fb2, 0x3974964394c445be, 0x53b6a95e7c7fdd97, 0x6eacdc6f50496d95 -, 0x178d04c0578a5bb3, 0x0d171a5f5215c9c8, 0xfe0d0171c504962e, 0x04eece54b220495e, 0xac4d145001db67aa, 0x6577c466962160af, 0xcddae62d99686ad7, 0x7a053a048d230d89, 0x1ff09aa0e605a880, 0x5d260426f355232f, 0xfbdaf7b0b53aab89, 0x5eef31b9eb0df78c -, 0xfb787e56b7276288, 0x4dcccba87d630d06, 0x415e4a4bc0a44b01, 0x0f0a981f71d8ae33, 0xe0ebb786f98a1502, 0x0ea4aa3ce70dc628, 0x8d36240617ebe037, 0x2d20c0e1d2002b5b, 0x336f8aa411a30282, 0x1d87c67d8178ec4c, 0xe468dff8ac26b63b, 0x266086bd7f11c9bc -, 0x05cfeedc80d829f8, 0x146902a029dd3355, 0x413db9327c068394, 0x55fa413791f64c38, 0xe06395c10021bf9d, 0x18d66268cf79ce45, 0x9e7ae6858dcc21bf, 0x3ad51dbe97b558f7, 0x06792c747aeef43c, 0x27ec9b782170abb7, 0x6aafca394a23e935, 0x18f7cbd98db64112 -, 0x34146ce6b36edbfa, 0x1dcfb4eab7ccea23, 0x68498e1f45b35467, 0x1b20d71a3b71d412, 0x7a875fc94e602e3e, 0x78c15fa449576c2b, 0xb52326d01ccafe8a, 0x3f53f57324d70666, 0x3830836e39bcebaa, 0x27a30c73dd02c884, 0x5dfed73dedf2306f, 0x75ee4a8b6cf54f74 -, 0x97ecc9c5851a8e3e, 0x496b581690c3df2d, 0xf7bba1fe2d169e7d, 0x4b06184810a77bd3, 0x40e6d643b903c7bd, 0x3c90f63b5176906d, 0x92f47e1ac51f1ec6, 0x70c2454c53cc0dcf, 0xb5a75d246c653b4e, 0x7e5173a420a8b0df, 0xcafb44c471d0f4a3, 0x69a3a4e92bbe5977 -, 0x26e93183cdfeb424, 0x1e0489b56fa7e130, 0x669befa672fe9979, 0x0f8aea6a7ef65bf9, 0xff0b883ea96b51ff, 0x31a668763c3c8867, 0x6887a0029701c9be, 0x545644cd70c87d63, 0x537b6fb7db9410e0, 0x6ca227f10229b3b9, 0xc7d1b4d71ff22468, 0x522058d3b20569f9 -, 0x5f4bfd813a51fb62, 0x105b94a3a42424a1, 0x96dfdb685825857b, 0x14d98588154500bf, 0xb4db83514c7a9404, 0x67aaf998856faf37, 0x1229d7e95dbc821c, 0x7e617a17a2f72bd3, 0xe964cdba7222695a, 0x677619cc40a07eaf, 0x7f82c099a8df7538, 0x2a219175ec95a1ad -, 0x755ac147b51ff3dc, 0x4a87f652f86823ec, 0x6d8d4a923f50278d, 0x4bb952ac98c0120a, 0x968c57a6a31e482c, 0x0855a11481fd5653, 0x3f05db6ac608d16d, 0x33f9e5746e1079c6, 0x1f3458e3ec51f53a, 0x4ae3fc836ceccf81, 0x3c0b2e2db5875ddf, 0x42336a1262cbb5e0 -, 0xe3651453cadc3868, 0x25081cfd6e80a2de, 0xd4cb31092872e53a, 0x16ca9349a11a9c37, 0xb1d3ae440d1cb675, 0x41b2d6ecbccbd6a4, 0x475e6a844c3d0ca1, 0x2cd0e0dedbf07023, 0x85ad446ddb002a6e, 0x72a06e5419a64609, 0x9e779387e9a3276c, 0x414a8163a9408b10 -, 0x25c7b53c1791333e, 0x3ea57190b42cd838, 0xbf20b346b094f121, 
0x47570cba99b06c9d, 0xe6bd01c8746cb5f2, 0x3c0b0b8c4c0968ef, 0xb22009690e243975, 0x251737e4a5643da2, 0x3cdd49123ab89dea, 0x68748cd1e3cc45a6, 0x563746685effea7b, 0x4e4c5b1c86eb3a29 -, 0xe1ba017516d32070, 0x5cdd35a0c4ba93a3, 0xdbc66a0c7de30288, 0x22107156a0f700f1, 0x0fb69045aac0f647, 0x111dcb9763d08bc0, 0x266db39f6d78cced, 0x02a32587c7033892, 0x76fc94ce6a2a4b19, 0x474db0f12fcfa96f, 0x0c44584c08377ac7, 0x5f435bf43140f4c0 -, 0xb9741c3014eef7a3, 0x54596c23b536ff04, 0xeadf56bb6ea39450, 0x32f24f6e1a656b10, 0x21422e4dd5f54e3f, 0x0d6ad57853660607, 0xf6f62ffdd0bf9928, 0x72569c930015caa7, 0xf4293579931b9216, 0x049d6a4057e6827e, 0x6223e20060be0e05, 0x20d91ae969dfa9a4 -, 0x02611b345456d47a, 0x601dd413d1bdea0f, 0xe6b017b26bbc9bf8, 0x63399ff3d6542359, 0xdbdfe225045a9764, 0x10acd93346649beb, 0xc652d5a50e0535ce, 0x49efbd5639c4caf1, 0x65a5dbd8a304de65, 0x08ddebed0e865be8, 0x5db8337d5e715261, 0x34cf4c75496807e2 -, 0xd840c7416e44b56a, 0x10fd30d282d8b151, 0x36ffe6df2c1c9568, 0x66d8a38b6d31a2b1, 0x01fad3aa61984774, 0x412a9fd87b303d90, 0x2720945ee0f0ec9e, 0x0c91b4c7ea84cf37, 0x98462f25fd5832f0, 0x6f4cd578c490d842, 0xecc7d24c31ed3342, 0x580ab96994515fd8 -, 0x6d8a97ed98465b3e, 0x16995dc010908ae3, 0x50626a4e555b774a, 0x082636e5a8a9b568, 0xa99435cc4823b413, 0x41fc423d10eff4e7, 0x114236dce6f9f9dd, 0x6c3995c4bbe0aadc, 0xf3f22c975935753d, 0x6b1b3f27edec2a78, 0xdbadaac32ccc292e, 0x3856036f8a3795aa -, 0x947154caaec01d73, 0x0a22e573e3f0f49b, 0xc50c949f39c184a3, 0x2aadd0868535d0c8, 0x22bc5bbe5f992446, 0x15d36adfca3ace90, 0x038010e37a6308f9, 0x161b06d8d7180307, 0xcfbf4e3abef8d056, 0x2a1765fe9c7696ba, 0x6a15d44ce18ef392, 0x5405239c0369de64 -, 0x5fabda1210f58e29, 0x40cbb03974b37035, 0xa29fdf2875322520, 0x3b32ace85edac547, 0x0f0c92b41d679df8, 0x7f07ecd47a7d2f0c, 0xb5fc65c05accc95a, 0x0e8b1da70636f221, 0xb2ebd131f4e8a846, 0x7df51e4aba57f391, 0xaa2f3d40fef689ed, 0x0ee1e115fde5d582 -, 0xf7d025b42e240ae6, 0x29fc1befeb526af2, 0x7c5ffcaff205e565, 0x4cf4d0d8840e2e1e, 0xb8b00d1810ad0ff6, 0x44d3af686ba915ff, 0x86a8fd1eeea8d08c, 0x3eb300adcf6edc4f, 0x8db03c266b588186, 0x289d0fd301e96881, 0xba83ba260cccc170, 0x26ee69546ceb0c77 -, 0x1109d8bf92c4ea05, 0x033aa036671937d1, 0x4bd9902e5a664a0b, 0x42bd48ed44fdbb71, 0x7359e19357a9622d, 0x0d6ee92855dae22f, 0xc24debb323643859, 0x4c60fee1e191766e, 0x3beaec0e99faa328, 0x056c2ae1709c5b0a, 0x7fe89e0c62710909, 0x7e3b5cd3ac4e6ce1 -, 0xe9d06486ac7370a4, 0x4b1a8c62e99f9429, 0xb11a50e20bc3197f, 0x75ec513c25dac300, 0xfb9fd064b1466dca, 0x290379cfce59308c, 0xca3ee3fb7db99943, 0x2af7a3e930faea44, 0x0d294e6d1505e35b, 0x7d534585181e001f, 0x90285700831d4cfe, 0x419f25105d06c90e -, 0x5f71e79f5f828172, 0x02921e2a43326798, 0xa0981553e84d4a6a, 0x220c82041938573d, 0xfd2b5b78ef20c927, 0x3c99a2dc611caddb, 0xfb1247fd99ed2828, 0x4b3a3739f724890c, 0x7775ea2d7d2d1017, 0x3ab07cb5ba8ac987, 0x82e5123a20a6b5c3, 0x44965098aa82161f -, 0x20948c77e9ac4c0c, 0x521e934ab214157d, 0xc8f4f4052dffedab, 0x1da963c2ef46f27f, 0x3be7631e212fa2e0, 0x0d188e88d1a4184e, 0xb4483ed385de4bae, 0x4ffadfde83d2b0d9, 0xacebd9a51a938608, 0x40968c0c9302b0e8, 0x85704404d06f3a5d, 0x3e9f477a61a26d37 -, 0x1da1efc7cbd18d12, 0x4fb87a47b9f2cb04, 0x7556a45e8b5c8caf, 0x7f6991b7723b35cc, 0x3fa10a169532635f, 0x15e61b1cd72bd52f, 0xe6b45dc3b4667c21, 0x45cf3bd4bbf39baf, 0x7343b0636a9d63f9, 0x457551c49ac49567, 0x331e611a3fcec018, 0x7d19e2584756b92d -, 0x78951df174059655, 0x0573cd896a793337, 0xb3e37121fd458870, 0x3cc032b1a1bebc3c, 0x2571dd06d24d5a41, 0x017382ec4aa29ffa, 0x6cda850c15a224ed, 0x6af59bee2d7586d4, 0x287d3c4027f80ee9, 0x6aa570b9e51d4f25, 0xf29f327c5e0490d5, 
0x00fb62f93f43edfb -, 0x7b06e602dc313277, 0x5d8dc98e723b039e, 0x5bb61813041a589a, 0x2a4c9f13eef7f1ec, 0x9439edcb4bbaba6f, 0x027f4d494e7784ad, 0x087ae2a2fd6bbc8d, 0x230f37ba41aec2ff, 0x63876e43daaac09c, 0x28abd7ae6e17dbe3, 0xd354d50cf000982a, 0x1dd774a1273aea75 -, 0x243658930d4b0902, 0x0df50723a2da63d7, 0x22bc07b9ac9628c5, 0x134123d68aa939cc, 0x4e84ee2cf0d450e2, 0x53a8c6dbd4aa9ed1, 0xd06e741c45610565, 0x608da7f96f2f7e19, 0x59b7fc9fe6a0243c, 0x0da36bb46fd1eb3d, 0x09a11de836914182, 0x3becc1cc0b96f1e4 -, 0x820b8a4cad71c17f, 0x2a425dd0204a843c, 0xf6f7fdaae1523c28, 0x5fb74c0c961e6fb1, 0x0c76e0f72b7845a2, 0x273db117946ce778, 0x7a22d35cdea5934f, 0x73aeeb1b24265d5d, 0x938a618552e4392d, 0x6050215beb6c1923, 0xf32f6ab781efbf2f, 0x2e4ece5c476e1354 -, 0xf2a4a59613812356, 0x555185da018933fd, 0x2fffbf95863bce54, 0x72644f9c3181e7a6, 0x98c6b1d509e3d624, 0x5bddd5730939d7d0, 0xdd197613d550fbad, 0x7671fafa1facb923, 0x13dbb61148c5b802, 0x616bc5c73ccdc3bd, 0x0b175b4c46fd8871, 0x498a1eeb000ab870 -, 0xa49f1ca2d7802521, 0x6906346cce00be5a, 0xf1bc33c727dd52b0, 0x5d005ff3122fd749, 0x51318ad5d7c622e7, 0x50f93d6d15e46e82, 0x88dfa2123ffff3b9, 0x3848e6fce3cac6e5, 0x6cefc31a33ea4f5e, 0x0cc5e7dc4e5e144f, 0xee2009402e59a7e2, 0x257679fdb86f4712 -, 0x4cf68953d8b17e83, 0x710f970c16ce2070, 0x4000b8e9e51e6aad, 0x5af48dacd01f24f6, 0x209679d5d3fcc916, 0x0a3538dd7cbe8232, 0x2d6d7aba44d990d2, 0x46c718f2d4b2c1a6, 0x9953d799a378233c, 0x4f4e80f4a682e7a0, 0x9912f04acbb77eee, 0x317432079a195b2d -, 0xaccccda6a1c11e3b, 0x3fd895817d0f3be2, 0x016db17673f750ea, 0x635fc619a24009b6, 0xb8447ab3370da1e7, 0x6c893aa19abf4221, 0x5f35ac703d8508d0, 0x13533d324d4adcb5, 0x84610370dece8512, 0x2223f126f9a70f4b, 0x18f00d60f3bf6a04, 0x174bd78b20ef8543 -, 0xeb179bc6a1698189, 0x732bf44a62015302, 0x98352342bc0e4bc6, 0x053f6640c1549e85, 0x65eee8b0397c7ce8, 0x790451f39f2fa27b, 0x36ffa0cb286cdb97, 0x46d07cec4c967bf2, 0x7c849ace30868412, 0x6dee239d339ef499, 0x8ab78548f273e57f, 0x01c5bebd8b7f5ef0 -, 0xe440e5f042eae93b, 0x65583f57fe057db6, 0xe6d5d26c24a565c9, 0x6b3b87a0a6ad702f, 0xd3f5d533117b8e64, 0x4addb9d0da92df89, 0xf1bd51990e0f9bfa, 0x30c624ec1dbcd0a4, 0xafaf2f00da7023a0, 0x3086e132b54574e4, 0x93bdbd4bfd3dd8c7, 0x690976ee132c892e -, 0x86fc11c79524d198, 0x0f6b95662e02c734, 0x5b78bb385564f568, 0x55c9b3f55d7cd16b, 0xdf1316434ad1c07f, 0x093d67d3fdf312de, 0xa1fd2257ea57b3d6, 0x4b5b18abe4b54439, 0x66c28f5b59d796b2, 0x7baffe6e642fdea4, 0xb9d3753265e68ae4, 0x40903bd6dfb02d6f -, 0x357958d4d72d6bc8, 0x179330dea4659dd3, 0x5a9ca85bc8721aef, 0x0209f09e03c9b225, 0xc0bf2e9738933495, 0x5e0dde4d715e50c5, 0x2743c96b66a6b951, 0x6af96188a0d6d358, 0xb2f3c72820f2a709, 0x5e9b8fd43327d9a0, 0xf0b13f5324012177, 0x7abdeaf4f741bace -, 0x6f006249351471f7, 0x3204eb91cfe9ed6c, 0xe09af1c83c13afa2, 0x6d70ed88d5de535b, 0x2078873d1a2faa1f, 0x5c73bedb8d96f3da, 0x41bbb407a3a1ce1d, 0x7a40ec2fb54eea85, 0xd6d569cb9dd722e3, 0x10acf67805927b6a, 0x27c61d818cc0ea05, 0x57b175c9f59904e2 -, 0x4f7b40bc92b5a60d, 0x51431f647b46b89a, 0xcd84dd55cc2a720e, 0x6b36059700809a1c, 0x78e3e5dd060e9a0f, 0x630c0c1a146c77d4, 0xc9925b0dea8fee2b, 0x4728f0604b16a06d, 0xb4601050635b2318, 0x2484f7281864709b, 0xbe2ed2a2523211db, 0x6425d4ff23dd3a5b -, 0xf0868c09017aef5e, 0x2733d1e1adc6d5ee, 0xa631db49f17f87e9, 0x36d753ced54d5727, 0x451d17fb6c4af537, 0x1dcc4d611dd55b04, 0x0bb8de0c8d3e549b, 0x2fb2ca1271592c3d, 0xd877914ffbc31ced, 0x190809a196504d10, 0x44bdd65a970277e3, 0x13195c678b4b01fa -, 0xe69a41a54f84d41f, 0x61c7c870565e4508, 0xeca2d2fc6f0e1c9b, 0x7f065480e257152a, 0xfaaa9f7c3a8873b0, 0x43fcdb8db58a324a, 0x969a79026e9da7a2, 
0x4eab135af328b9d9, 0xb38aaafe87f85f7c, 0x69eba4fe1a6b6f32, 0x5607f6c6b4d27cbc, 0x273072bea774f9e7 -, 0x3c1149e3c8d51db0, 0x161f8cd433c28bfa, 0x765a61f218fe70da, 0x442b5d405f2036bb, 0x96f790271c564cc1, 0x3d5dbb33505cc956, 0x621a38b446af395c, 0x2da978b45bb70ce6, 0x755aca711da49388, 0x46f2e33e55e86df8, 0xfc5b454d5cb7be24, 0x67df47d68d8f6d12 -, 0x7a1e224893898aad, 0x0400219c89c2d13e, 0x6c969e4d63d460d9, 0x4df64d5df8b60ad2, 0x1feed05a45ff89ed, 0x290c4b59e684b4ef, 0x97ffbc3df096adb6, 0x4ac6037e76561c96, 0x1bc40299115e51b1, 0x7169e0a1d96aa1be, 0x43f55f8b6bac596c, 0x1cc6a0603081a178 -, 0x8e1d2db69bc925d0, 0x6ffb86eed51d2931, 0x3ad1eb242e0af1b5, 0x338198152fcd6d7c, 0xc1f381496df13943, 0x05d9242fe1c60b02, 0x39617510de7eec81, 0x24d8ba5ac76b12b8, 0x280eb2db9e548483, 0x6c51317b3a8a93f0, 0xb2a9f90939bd1235, 0x2da9de86c39f9aa6 -, 0x7f54917103127b97, 0x7be2be5ad3276169, 0xc969d703d31e9da7, 0x0500df3bbb1f8a4e, 0xea05c77685795917, 0x049575a992d09345, 0xd567f8de2daabe35, 0x383fad35a8e035cb, 0xb9353eb2bbd43d56, 0x52b3953221860c5a, 0xf9e4bcd46dbec03e, 0x4b0db0b4a7b3279c -, 0x8cc5f6b6e1ff80c0, 0x1bd2ce464b552215, 0xd008eb25b39c4236, 0x3b4ce5bb2f42a9fc, 0xe1f249681d153d9d, 0x3e022cb14bc4c5b9, 0x8a11d021c8ed5a53, 0x560d3fb258bec495, 0xf4405852705a6012, 0x5c8bccd2b1b3efd3, 0xd93c0f63ba7ce0c3, 0x337798cb3e93dbba -, 0x7a9f68cf800c8e88, 0x579afe689f3ebcce, 0x7dd41d6cdfbdb4a9, 0x3802410c4e1b274e, 0x64241d770cf0db02, 0x2f7c8133c74bde23, 0xf3c3fd835ed1952e, 0x741b1d88a3cee37b, 0x74e1ae644683c68f, 0x0c80dd9e0f7a91e1, 0x3984d741f3e47c24, 0x4b3eb97b6a39d252 -, 0x32e9b9410da9a195, 0x11d09fdc04ec3b41, 0xf92fd5e53cddea30, 0x296e095589e0ce05, 0x4e3200c3a283b696, 0x7e33fbba44ecb32c, 0xed3c039790ad0033, 0x5c8ebb260b5ec084, 0xa667455bb79d2e9d, 0x12fbec9d4f5bb155, 0x3aa5f6bb4d0d8d49, 0x0ca652ed7065d80b -, 0xb7938753d51c6f83, 0x41644ac1a602f9f2, 0x84223d4d63c38f7d, 0x71057b4b8b931282, 0xd39fa015165f47b5, 0x7536c8a19c33c201, 0xbe713ca4166c2dad, 0x456c98c2b4198511, 0x4793f25e1cb44658, 0x1d002f1cfe1a1ba7, 0x9f9ed6e1e1a27957, 0x095dece028426bdb -, 0xe57d3412fc1001d6, 0x481c63a0d9b25e99, 0xc756b6ba0dc02aa5, 0x24af047d79ed4683, 0xe37ac10133b68275, 0x418b45e570802012, 0x87578def0c3900ce, 0x7c5661923b8c9740, 0x5f4ab0a6fdda7366, 0x0ac6100825e4eb3c, 0x308528e42c9e4d32, 0x436e5979933ddde8 -, 0x0cd6ebe123352222, 0x63d1768a46f33dc7, 0x96cc55dff38c9273, 0x474438da7140411d, 0xa184b89b81cf6402, 0x6bf820a3aa675050, 0x3bd4720417391f0e, 0x3f2b8f859a8e0cba, 0xed952561b125da29, 0x07eb1ac74165097d, 0xc3f70d0c7db0a9fd, 0x5ab896a489294a6c -, 0xd4b608975c20018d, 0x6243b039f25d0456, 0xf766e98fc24c7464, 0x20035c09d2291e42, 0xcc0e5b5eeb462524, 0x24bcba5505f90657, 0x43a98d98e4fa9bf6, 0x3b621ec4188264d4, 0x633472fe235c812c, 0x31a20844a3316d23, 0x47b80db7d7f5d0bd, 0x22d482f5663780f9 -, 0x4df227dc52142020, 0x25076d0624bf137e, 0xcb4a6ee30a657645, 0x0ce469dbb5ada433, 0xfdb06251f65b9c5b, 0x44f82274a8e8f538, 0x98fa4c81cdec4b97, 0x0ccd61d1abb61d0d, 0xb9dc371344c5ab54, 0x35dcd9ccf8e5f919, 0x67fc81f369ba5722, 0x121b5aa1af6024da -, 0xe0b1b16b0fb1f1fa, 0x4dc688d6d3b1805b, 0x05c187cf10e40104, 0x71af39c743daacd9, 0xe691e97f82acf4b3, 0x0c46305b9243bf5b, 0xb063af137fde616b, 0x4e26e72a1de067f6, 0x61fe66d01a221004, 0x172fe9240cea50b1, 0x4ff50d37b2effefc, 0x06be02ab0b89aa5d -, 0xdd4aab96717af213, 0x32322555b58a7ffc, 0x7812aa965889326d, 0x1bd608f60d6457a4, 0x2c7b6b44e999e141, 0x113a86a87856a8a8, 0xd95469fc33814855, 0x4a18dc36f6bfd586, 0x0706b60bdb854fd3, 0x4dc356685650fa90, 0x24ef7cfce41f8dcc, 0x19049c3e632deae8 -, 0x5c9a4e28b7138a89, 0x0f0b7dbc1e5087e2, 0xebf49cdc66a362d2, 
0x19e4b815e6576c85, 0x1896051ee3b6063d, 0x09ecc741852a68e4, 0x4009034def986795, 0x36b440ff39b4b5e8, 0x9bc2647ee28af1cb, 0x62613c9dd152b3a8, 0xc2018ae5dfae5f2d, 0x29ce5ef30009c855 -, 0x0b653558b21d2b1c, 0x45e2c505d1f74936, 0x48304373240553d3, 0x0528569885a82310, 0xa90d402e33924181, 0x5e610edc23cb9555, 0x28890ae7e007d28a, 0x7e5132b6b1ebae37, 0x0d5252eb7c94cb1b, 0x308ddaea1fdbb672, 0x99fac0b431730534, 0x77d54ed63b9325b9 -, 0x4d647bcb76c6ec3f, 0x0e968b22ec2cad86, 0x4b22b5ec30b08a35, 0x3b31df3b52326b5c, 0xbe84f638dac3105d, 0x7db085f133ecbed3, 0x7a8b694596f2cf2a, 0x67b2e6c15d16e0aa, 0x4808b20bf173011d, 0x25d5fbbfbe66f864, 0xf67f3f3cd9743987, 0x654250e89617ddf3 -, 0xf5a1a7e0ba0a88c0, 0x3616c781799ab50a, 0x2669c27a2d256902, 0x3a8ec380e12fd7dd, 0xa25361f44a418e30, 0x2942f3001d233645, 0x60f1d3b7535a4133, 0x14deaaa12e5c7bdf, 0x0089fbece10c8d6f, 0x4bf7c313757c803d, 0x65aa30bfbb70567d, 0x4fed47af409a3fb3 -, 0x07557dd875d3daf5, 0x36c49c2380e3c9bb, 0xa21f643d329ae02f, 0x6cf6f7474338bcb0, 0xb5df78136a0f3012, 0x031fb2df2e00e9d4, 0x4d86fccbe75e79cd, 0x23f890e082d03b7d, 0x5716a1ffb50a8262, 0x0199b50aa6cf3302, 0x6a1be351f86090d5, 0x36095efc13349364 -, 0xffe752be8ce46920, 0x65047a340b652f65, 0x320ee55fd03156a6, 0x5af6aa45278409f6, 0xa6caf283b1cf3850, 0x4e3a988f61072f96, 0x750f67926b18f680, 0x09fc3f2927d21a4a, 0x914893c2f2ce1169, 0x4d15b367121b3e75, 0x6cb12559723774f2, 0x3ee5b8c2a70e054a -, 0x7dd9b3518d84d2d7, 0x147d5a5a53f57a58, 0xe1bd0904ad842a05, 0x3a0f3b029c9a5845, 0x7153c03261410074, 0x4e203d6737058c17, 0xebecf5cb79f28af9, 0x574b889870c279f4, 0x326317b005f444a4, 0x7480da44b34f4b1e, 0x7c5f21cdc46275b2, 0x210494b9ee24e4e0 -, 0x3cbf6ca1f4aa4ead, 0x6bf3872ccbfed940, 0x19e8a84673a566ca, 0x61a80e16990401a2, 0xea2e029e7f9b3824, 0x5762298465f0ebd3, 0xf60e36d4969f9af0, 0x00b826180531c799, 0x17120ec95cf3c61d, 0x47196cd6de85c7d0, 0xb0d47cff46a5cba3, 0x29271400d7ede26b -, 0x835908353516b894, 0x4bc57f8c1eedec8e, 0x2ec5deede5c0db5f, 0x7b9fc48ac4a689fb, 0xf82ce6de88fc10e5, 0x6c5d84a70e03a3d6, 0x88a211fc4ea531f9, 0x7d5583e5918aa03e, 0xbdf2d70766fb8f39, 0x5926497e734ab18a, 0xd6a9872b800cacb4, 0x757c1cd521fd22d6 -, 0x22d50b0c13ec4bc0, 0x288a77d34a15e99a, 0x95c8e78fced3d4eb, 0x45ece109c15be169, 0x878ef262d0132128, 0x48110e9fd98939d6, 0xe3fc5425d2e7741e, 0x050ca6e71f599c65, 0xe02f97605d9fe375, 0x2af48b9bfee410e4, 0xfd34a1c107229a54, 0x43dc6f0cdcbd41fe -, 0x15b4eb7d65cc562b, 0x369a7b0dd3e91248, 0x2b087611edd32810, 0x116b234ddce09d7f, 0xcdb03cae8e90d2b0, 0x4017d51587566038, 0x081793739242b600, 0x5086e8e633cd52a1, 0xf5ddaee155cb8087, 0x773311b60d59a7e9, 0x36e5aa0acadf2068, 0x7126a4281b192882 -, 0x54a10df54f7ecef8, 0x3cd7d2fbb6e33f67, 0xac31eb6c3e740c25, 0x517db54840feb2de, 0xf17cb269b3ce27a2, 0x04a8fecd1dcc99e7, 0xfc887c1f2f85a2da, 0x280da7425bb55b01, 0xa1af72f5256a5a53, 0x71da839fc459f465, 0xc203fe7ba6587f71, 0x08a4201f77a4f335 -, 0x6cb9ea5683014d96, 0x5da17076b6b51ae2, 0xb55ac168c3e3997f, 0x41b9a32373d78f7a, 0x96f58033b8600a50, 0x6ebfba3ec9d956cc, 0x0ff8883707d66d0c, 0x2f562b035445226f, 0x2388fc015bd368c7, 0x2b7d802ce27f627e, 0x301f0369c24083a6, 0x77e139f6da8d5aaa -, 0x9f78574697fce43c, 0x02726c94565421b6, 0x1ad6007338e26585, 0x6134cc5eb35c02ff, 0x77ae739c9cdcd1e1, 0x04e96543233c7a13, 0x97d3926dcded2e10, 0x6bcdff7e14cebb73, 0x9c46ae2b32489774, 0x04a97b9a0473af8d, 0xb0350bd910d9784e, 0x448212d3e2164ad7 -, 0xf3464e0351f5e995, 0x68ab4d24b3ade8d6, 0x86854d534002af20, 0x613f7ffe5de92aeb, 0xb385b4f4608a370a, 0x220dccecbc6f2688, 0xc31ec5384abd3680, 0x25a82841a2000fd8, 0xd19e422504694236, 0x0bc1124d541781f5, 0x0808651edcd99176, 
0x41b81f223d429c76 -, 0x1a6dcb2662cc80c6, 0x0b101fb0ef0d1f74, 0x6f02aed8f8327119, 0x5b4c5176ccc4a340, 0x8fcefd200d6ee8ed, 0x0548127287f44749, 0xe1efeca1fadd1341, 0x0e74bc189dc9016c, 0xe90470353f46cb12, 0x69513d3455bc890c, 0x9503686f1f2497d1, 0x280a0bb7733f1086 -, 0x14e5f99930a91dea, 0x7840ad84b03c3878, 0x46e32c654fdbceb1, 0x7e88d2822bb2cecf, 0x4d78a8aed7f8661d, 0x70eb17416ef40180, 0x97b6f1733c474a10, 0x3d0d27fc4c7084ef, 0x730f60f6a1ee0d71, 0x7bf6e3885d3d9302, 0xa1e8af33742f1611, 0x73b798ec129822ed -, 0x0f669bb094642a70, 0x142927de789fc4a4, 0x0db18e01fa98cbd7, 0x6ae4d37674be1451, 0x7175e98f178b4b74, 0x40534e319bc52c6c, 0xb7211d252c4db879, 0x1a7651f8f3ed1aae, 0x9c9a43932d50cc97, 0x630b232b7201c359, 0x327d77575f5b3839, 0x5f0e19e78431864a -, 0xbfbb00b6530a3bb6, 0x19ba9d60d97f7857, 0x759779de744bd764, 0x5facbe63177791e1, 0xc74ea511c56a3b61, 0x1d8909e84083c31d, 0xcd20094b507af492, 0x2ef1b9c07c92ab37, 0x8430ed9ef8494fc9, 0x3f9170e6df5b1fa1, 0x1fb8dbc837175d73, 0x65b961b58008d022 -, 0x7e1afb6816864b6f, 0x54c4b92c534871e9, 0xc0a1dcd60d61ef84, 0x4390f0e992c41298, 0x1e54e2c8b7c27348, 0x7a987e01a2ec308c, 0xee42fbd90c4a89fc, 0x1ed8c77f8d7c609d, 0x569dedaca99a3346, 0x0eb471e609fef4ed, 0xc915522a3b9fd03c, 0x726453b246746bfb -, 0x4ed3cae53dc5fa4b, 0x1bf1e4b34b9feef6, 0x0850df9f0401fac3, 0x0a58d33cb2422e2f, 0x3d197f9603ecfc29, 0x45e46edba1cc432e, 0x96c0c93310d9bcaf, 0x18de3a458be2c33f, 0xc9e65e5bcc12a49a, 0x71a5345f0239b187, 0x53b3b2f01c5710b3, 0x438350f57ce2ec4a -, 0xdbbd368a760391db, 0x4033638dfec29fe2, 0x297ad75ed73117fd, 0x269c08d54b106e8c, 0xa4e3e4fd238b4218, 0x1f48a1cb09208aaa, 0x9575153115cf5fa7, 0x59feeff0876fb74a, 0xfdedb4af6f368710, 0x79be1fe79fa674d4, 0x689d6bbb4c707c39, 0x394a451499057bb1 -, 0x5887d4fb21fc43b3, 0x37628dfc4b5c23bf, 0xc66b76944b34bd13, 0x6e97f0a8a45bcb36, 0x3ac6b10139edbbdd, 0x313f4846b6745833, 0xf8758d9777cd9037, 0x02fdc98f02692537, 0x9e79f381fff833a5, 0x25ac5d68c49b105c, 0x1e9f48a076d8c9ee, 0x788c85c9fe9543b3 -, 0x776ea51db3b3b778, 0x0007c44055b64db2, 0x3c392c2a82fddd25, 0x65000203be8ee976, 0xea119666ab7c50ab, 0x528b2700e8f82d39, 0xc4aaf797118b8282, 0x55e5a7d5382e0d3a, 0x15a80b22e89f1039, 0x199f68594b1247a0, 0x8d5630750d622435, 0x2687f48cc6def5b2 -, 0xa16b0c0259eafaee, 0x7aeb10834e93595a, 0xe31bcf34ce679d9f, 0x4e2c19829eee3c87, 0xa46869cb8ca35c9d, 0x3cd35313c08504eb, 0xa088eca66e98389c, 0x44c562f0f7262740, 0xd3eb8a28f447523a, 0x43a0e059bfe37576, 0x0312c5d6d0f2e0ad, 0x5f30aaf0d1614c61 -, 0x6f09a7a6e182b0aa, 0x575db3d21a82296b, 0x6599bb5eee7925e6, 0x093f89458dcc2fe3, 0x70c4af785151fc84, 0x1230c0c519de5480, 0x0e66f8f93075a4f6, 0x5de4a122633a5c6d, 0xdb99cf83f9ece1b6, 0x1c3acd4a13ded617, 0x4dfe69e68f59c447, 0x482ba1f7715a3c16 -, 0xefeed2a7c81ea8fd, 0x4e089eeb713a572f, 0x78bc74acfbdf322b, 0x4b4951ce8eb86fbf, 0x0eafb6b46ac6714d, 0x72913ed109f7d404, 0xb498bf6fcde9e3a2, 0x3c08a283ef5ded62, 0x9af09f593a48b346, 0x7ed52441d00d4980, 0xa78e843ee5df44ac, 0x25db12d420a86151 -, 0xec840e7e89d049e0, 0x5a34cbe928bf96cc, 0xd875dc5525da882c, 0x2af4442fc256827d, 0x089fb428c2ef5a5d, 0x0b573ace080a3d9c, 0x6f57282554c240da, 0x425ceda6707b6bc9, 0x94b5a8c3dde824fb, 0x264f6f6a445b5da9, 0xadf292191c5c1eb7, 0x5e302e82fa4e5533 -, 0xf51712fc44237f35, 0x2b0af62c42e56e66, 0x10392cb4d9c71b75, 0x4d7e08fe8457a95b, 0x210b9eceb04534bf, 0x73329d1c7d88e1e5, 0x667a43fdb4ba79e9, 0x3435ec04276ede87, 0x38b8540a1a78b098, 0x4f6c266e6793bb78, 0x447ea35172754041, 0x109d7b742d8c3dac -, 0xe3ccab45d2a4f6f7, 0x59040bb73f3bbd2a, 0x730b39d65645bab5, 0x5c61aed2f83382aa, 0xa992143de3cf83e1, 0x13455cb889b700f9, 0x54648228b310e2f7, 
0x5b837752ee0f733a, 0x3923a6c0e5ea0dd9, 0x5ebebd01fc9ca9a2, 0xa34c205b8fd94258, 0x7d1a10029a0b6cd5 -, 0x6c83c02241a46527, 0x4127c85d6be1fc62, 0x26f86ff5ca7240b6, 0x2167391e7dd95cd9, 0x79227506ac78caef, 0x1a2cf919b8832a0f, 0x07745266405cf574, 0x38095a07f5713ae1, 0xe5eeab985ca3e7e7, 0x6a5dd9eeb734d639, 0x991027ebe44a4822, 0x311085fb4de9c1f0 -, 0x33f361e21066c3b5, 0x550091d2dfc8688f, 0x376345c5532bac13, 0x0aa0898f990931b5, 0xea2f3346e5d3226e, 0x208790ab78776afc, 0xac7c2ae63433850c, 0x3c5c373ada10ef52, 0x96c1b4003f4cde6a, 0x4546a9c475c09781, 0x6c961fd3e8536294, 0x43f36e63fc0d5066 -, 0x296601d8c42167f4, 0x241c1fc38565471b, 0xdb00a27e11ce9617, 0x60381181b7e7e4ee, 0xc1076b7635ac4d52, 0x0166010ffb8dda38, 0x5238f69becc43e0b, 0x63303a2015708b17, 0xe8badb2e5bb22591, 0x3a10a4e218b6131d, 0x236ab01aabf1a7b3, 0x1ce8a51a68a4126f -, 0x59e775e2a2a87928, 0x770b48eb4b738301, 0x0b43c2be176bf79b, 0x1957850fb6424660, 0x44455ee1ecb0ab2a, 0x620ceaa116eef4f0, 0x0198f62cb6183f6b, 0x3274f78eaf2d55db, 0xd2ba4e460cf7ed5f, 0x19cfc17bc0b66f43, 0xcbae6f45b1942722, 0x5d93e44739147b58 -, 0xd07180b9d28fc597, 0x35372b21b2ea5a46, 0xed2673477f083464, 0x7a9ebeeecc57e6c2, 0xb51d991a81a6b314, 0x35e7d90f4ed6de58, 0x45f21e209510dd05, 0x446ffd2715c8d380, 0xe69b5c7a9b6d3e76, 0x1379e79fb96912e6, 0xc161c848bd508738, 0x22264a049d8cfff6 -, 0x32321a68ff7ef7b3, 0x57b0e50cc585b333, 0x1c08c65ba9d764e7, 0x5534c793f92f00f5, 0x7a1ced97eafe6fe4, 0x6b8933739202599c, 0x618c5f8fcadd3ff2, 0x2a8719b3e6548653, 0x346a9ec5c4200f0c, 0x7a36b8d00d0eda58, 0x844b22b75021accd, 0x769737059fc5e465 -, 0xdb1ba69b5019f266, 0x1777242305db9ac1, 0x491d11ad264b6ff3, 0x136198dfc57a3053, 0x4a6cc64741eb7176, 0x14e811c97fc97650, 0x6b64667f71be386d, 0x3286fcadf019eb5e, 0x3f2591f4498e10a0, 0x674fa7c32df7867b, 0xbae8ec7ee100dcf2, 0x03b2c0a20a6372a4 -, 0x4c8d76b471e24474, 0x421fb6a7b8a3216b, 0xc672bdb2fe8f514d, 0x202af653d9aff3f5, 0x05e5f80f9626953e, 0x7b721fa3ccd42ffc, 0x99d8e481c0f70479, 0x054c31746d23362b, 0xfbef2e20430e8025, 0x60e1e3f02e7720c2, 0x161701874eb347e3, 0x363924e90cbb77a6 -, 0x180f5ee1863a1a6a, 0x2f79c0046ff79fe2, 0x44679866e35447f0, 0x1c64c6dd73e0d636, 0x1d8175566341469d, 0x5ba634965b8b9e87, 0x8f48744f976952a5, 0x744f28d23db94c8a, 0xd15e84b1f232da34, 0x556f3d7aa38bee8c, 0x14693c56e866ef89, 0x1564fb9a0f81eb03 -, 0xe97eed56fa2b483f, 0x6d3f7e01aebd1957, 0xae8f128aca3b3e45, 0x3d41e85ba2afd3a9, 0xe4fe485e4b6d8328, 0x65c49b4c3e98098e, 0xe96a00e054d6e91a, 0x394a2122738cd006, 0x715cca3dffd90785, 0x7bc3dcde15890965, 0x6dcdc47a33a148ac, 0x435db9d6dbe1bd55 -, 0xd74d4d6e0fd89c27, 0x25e727f6a5380553, 0xbe54127ba6c5189a, 0x65c87d3c3e61939c, 0xc34a6d122a809e2e, 0x7de6b787f097eafa, 0xb8f8b6e701758661, 0x10705fbf97042046, 0x1591614e6da2d44f, 0x7c74f26ec6eb070f, 0x9ad98c1a50249c60, 0x6e1bbd44d64b2302 -, 0x937cee76047790f9, 0x5b4ccbc70beaf690, 0x332e79ae75ae0dae, 0x2e6394161d093556, 0x4b378bf68f6849f0, 0x6c419fa0cebba72d, 0x8bb431e1e273f2a4, 0x357cec80bbe024fd, 0x83a6e913962f11a9, 0x7808df02e2523718, 0xb6690b5dabc49e13, 0x6cef23259375972a -, 0xd18ac767b5e551fc, 0x5a0ba1dddb15bd36, 0x6f7923de219e3e1f, 0x3ec23588db9b5cfe, 0xa4fc23d42c83bbe0, 0x21581a00768658cd, 0xa295b6e57110218e, 0x3e7bbab1d15f477f, 0x2266c03d3f0d0635, 0x4174f08a95be03b5, 0xaa1a674abb8cbeb8, 0x6bdf6ba553ae3390 -, 0x8a31f824638545e2, 0x2a9e37a0f0eede53, 0x148a53d8cba69f65, 0x64c587e816d96316, 0x777a028a47e97e93, 0x13728e46befb2e0e, 0x13138b44862fa665, 0x0fca8c38a87775f6, 0xcc44bd580dd067fa, 0x40f2f7642e22d02e, 0xab3ba6db80c2f728, 0x5068aa2e2d25b7f9 -, 0x5a8a842c0a2923ff, 0x67c39e8a1006c196, 0x8f5cb9ff55460a84, 
0x2e735c20a419a518, 0x0c6ee3fcbfdc2da4, 0x5bf6ed60a87b92bd, 0x5e4ce130e8e1608f, 0x0932ceb3e50028e8, 0x793cf8a0538cbfb8, 0x4e89e2c018beb7bd, 0xcaaa79642f5060de, 0x542a38a4d13f0016 -, 0xa1b0fd9aac663e55, 0x5158bf1f7b33c0e4, 0x060e82f65a4119fe, 0x32347069a1529fc4, 0x5c96ef69127480d5, 0x409a902134df6ffe, 0xdbe8c392eb6c7013, 0x73f2c48b0e3b4a79, 0xddf5060b937e2dff, 0x1534f901278611d9, 0xf47fe29ae4fd49a7, 0x7a2c0bfe75539f29 -, 0x19e04d1b2b0fe7fb, 0x56381ebd8181b50e, 0x5c8970c249df4ac3, 0x08acaece8ede7685, 0xc44f1a71aca0d20b, 0x623edc8d92e4ac3a, 0x5496a7e5885a0c95, 0x20a9ba37315b116e, 0x3765873809f5b55d, 0x23c44c42ebef2ff5, 0x56a96d921f724573, 0x3217815b72b8a9eb -, 0x2cc1b42f5350a489, 0x31f0b36e85b8c70b, 0x504a5c8c4d2ce34d, 0x1af8ea26b3786eac, 0x69bc5e26d7afd62f, 0x21e399d04247bf9a, 0x6e6d6676a88efb27, 0x476212b9fe9a6fd4, 0x0740fb65284168de, 0x5f7570be65e69408, 0x0166c3279dd81c29, 0x6565489007c4ed6d -, 0xbafb5bd37b5219c9, 0x00251709f2e210f7, 0x0d22639b51c1198b, 0x0f3c0df3be3de811, 0x3552612be3374eef, 0x0834744318ffa0aa, 0xcb9f1c1e3557a00c, 0x20c359f5de8b6614, 0xd319482a34d05268, 0x42165771b46b75d7, 0xca336c22e8d911a6, 0x4d072f70067a47e1 -, 0x9022c6f101555e9e, 0x4c8c7eaf7cc2d697, 0x629810b2d8044817, 0x25110bc01b06c9c1, 0x1bf9c06bf39eaff7, 0x6cc36f151f52b4e8, 0x76b73a6a14b62068, 0x47dcb0dc89db3821, 0xfe9dfeac2f670f41, 0x625b5c93b973c417, 0x5f8c917930133c1a, 0x6bd35f3e0992bb2b -, 0x03b5391a85409e5e, 0x7981d8fd16362767, 0xdb45c80a32a23cb6, 0x67356a7ef48b2dc3, 0x6189236e9f01adaf, 0x07a1e954e5032bd6, 0x53d627199c69727e, 0x25d67e4163cec014, 0x18e7bb6a63a80738, 0x3112be4cb5dcbc74, 0xad9ad6d381643f04, 0x116112cbeabb734d -, 0x32623abe2d66ff07, 0x4d780300822436de, 0x9bed066c04497808, 0x40db29b39ce86700, 0x6e5e5eb3805602a5, 0x52f227f2b1b9b40d, 0x51c2c4c197a18394, 0x6d8bca423ee270bc, 0xd6e60cfe8fb07f72, 0x7dd66c3970f940c6, 0x66aea7b59a0b17cc, 0x75fcf8b00160d729 -, 0xbedc5ea39b2402b5, 0x0dc3600425feedd5, 0xadc1ddf2cb1b6631, 0x205ee93e3aae976a, 0x7a2cb4e333c98498, 0x7d12eb776d56872c, 0x8e339bc1b41599fe, 0x4600f0a53fac9427, 0x1049d3a372f14304, 0x7b54e020b22db742, 0xd567962272a35739, 0x27a1178b1115f0c4 -, 0x6cfb39d619c35e1b, 0x5cb96fd1a9d9d486, 0xaf45cef7fb4fffea, 0x4a73d7b2ba9321d1, 0x44b46b4a80be86ac, 0x2769b50579e8f734, 0xab5d109e7472f372, 0x2bccfba1cbe995b6, 0xc00026115332f6a3, 0x7acb287da1561c53, 0x21555c608cd90dd9, 0x7731d1b2878dae13 -, 0x32122bf5ec1a0649, 0x592b5fa180ec8467, 0x876be1b5ad9ce66f, 0x484c1cc5bb34819d, 0x08e4cc425b30b06c, 0x2766065f0e4d22ce, 0xd90825644987aeff, 0x3a835fcc7fc456a6, 0xf4d801d2cc806d69, 0x41d767ecca55f839, 0xf2dea9fd01f1e74f, 0x74d01b97462211cb -, 0xe43e280ad29f80cc, 0x5cdf66a69029b231, 0xe8d655a03c862cd9, 0x388e38b58d0e8c79, 0x5d9aaa4848ff83a2, 0x14d6fbee4d6cbe74, 0x0426dcda912109ea, 0x1bb7b9cd75d4b541, 0x3a3c0504b39b8505, 0x35a3c5882b31367a, 0x678793d635a6473a, 0x66abca7e20202034 -, 0x4a90ff1dad300021, 0x18f29036544d2684, 0x2036d39b8f69095d, 0x36490f5645d18cc8, 0x9414d7368ad3562e, 0x7f8108a04558487e, 0x93db0e56d653e40b, 0x03f413ea960537bb, 0x984717b77f7267ef, 0x6c5d9da4a5ee7305, 0x725318dc36060a49, 0x274397f8e79a239e -, 0xbda7965b4095bab0, 0x6292b2505c7866e3, 0x451fb6a0672d6733, 0x37c560f40242a859, 0x151e56eb818f1423, 0x63451986f0c22ee1, 0x9275ff873a5c75e1, 0x178cdc734a32b96a, 0xff7adbb24244aacc, 0x76518aa0dfd96ddc, 0x161c1c8c81071219, 0x0584d44c10a3e6dc -, 0x2727282a09e9acab, 0x1298e49c34514ebd, 0x0323d059ca1c0e6d, 0x6072c8b87dd26bc6, 0x36eca2ab28d36f26, 0x2a977cb5aae4ea2a, 0xf157d43a0b9546a7, 0x04d60af0ed661d29, 0x34bc1080126e4402, 0x7677ef9a21589171, 0xbd13797278f07a40, 
0x32c0daf0b57f20ac -, 0xbc83fd1b8366dc2e, 0x6cd07286c4e670ec, 0xf35485a3f339dc8a, 0x6e7e9285f2247e8b, 0xa9d19d3a09943bae, 0x43fa5197eed852a6, 0xf911398a043242fe, 0x4a100dcb1312cbe9, 0xbe2fd86be910a692, 0x614fd829368d7937, 0xdb5a98b1a92d578f, 0x46f1d23e1b0dca7e -, 0x8bf4c6725e813f36, 0x68bc89078129ce91, 0xff56503ae28f5c7f, 0x2b6e0f4e42178ce5, 0xa97cd947ec65895b, 0x7aa90b66280ff6c9, 0xebbaf32df158a0a0, 0x6a748d0ac02bb713, 0xdf79b5d619e83397, 0x16934947f6485b69, 0xe75185521ab32881, 0x20791e276a7460c9 -, 0xd25c403e22c70bc9, 0x0bf079518e66e1d3, 0x45dd5c971d3711de, 0x66bd2c6a30be232c, 0x607829e5b29e53ca, 0x30ed414e71dc08a2, 0x3fd38589ea0f1d39, 0x5a881a121f37fc5c, 0x27b9394368987a4f, 0x321fe45e13afae2d, 0xc6feb75080f33ea0, 0x02166d52f45eebbd -, 0x15026a1b0ccd2fc9, 0x1141be93d5bc3d6d, 0xfd20df606fc676c9, 0x4059e26b00ad78c4, 0x0709b409cec6b505, 0x68f020e8acf478e5, 0x875d77d1f5df0cfc, 0x66eb377735162ff1, 0x860482ab417a32ae, 0x21175f47da213935, 0xa07ff0cda099ecdb, 0x26ae5f177ae2b8e7 -, 0xa9a070ea5120eaf7, 0x2581feeba7383f81, 0x49e0f137f1fa2a7a, 0x7fe93c51cfd1ec62, 0x2d74dbdca7777f7e, 0x562da2ba74e823ff, 0x543b4f8609d77a2e, 0x3a0f65212f234ec8, 0xf842e3fea270ebc6, 0x4524322c6a289e11, 0x80815887aa6a8576, 0x46f49d53c3fe29a3 -, 0xbcc93cedfdb0d388, 0x4db312076ef0ad2b, 0x1f2cd56373654ad9, 0x4c6446970034d15f, 0x34d2cdbfd5d7130c, 0x6198950d03db2ae5, 0x736094b72faf1b1a, 0x1f6ca46a9f2588f7, 0xcba0b03d6259772a, 0x24e5a23d8d6be3a8, 0x7090e340c94f6d6f, 0x287ba27ee54e8466 -, 0x87320c8822d607f0, 0x44fd5802509df171, 0xf35c09860bf6ba4a, 0x6cf53130ef77cc0a, 0xaa81167a00b48ce4, 0x649f4c775b0d8b48, 0x59a25683ee98d33d, 0x651479007d1061a6, 0x155487411f6e16da, 0x411d036475404bf2, 0xc231f1344162458a, 0x4f36b7633f7dd368 -, 0xa98ddc0a4e7a89a4, 0x55d8a5da6eacd542, 0x5c3fb48b1001ed45, 0x5c7785ccafa702b9, 0xa64369fd216afb79, 0x1f405ef10e940669, 0x755f4831bc327b6f, 0x2bc1b67d71f1882d, 0x8eab15cfed7777d0, 0x517370d580d99326, 0x0811b75701c9db39, 0x234d84cb52f7b621 -, 0x970c4fbddddae49c, 0x3ba8d842475e41e1, 0xb0720f6ad75e7008, 0x275cd5c5184bf345, 0x5eb9833888d3796a, 0x1b3a42dfde11c2f3, 0x946548fe092b5f4d, 0x119917b50f263cc9, 0x622de955a20a3f82, 0x6a552ea3a60c7ff4, 0xc79230138150372a, 0x18083b9518de76a7 -, 0x55fb74dd7d3b5455, 0x523eea9a70ff8334, 0x5994a7335e356271, 0x3bb011f60430f1d2, 0x1ec434cba1d6ea7c, 0x69b632960feb5780, 0x46c50417541ebf07, 0x01470bfbf9d23830, 0xe9551f4c049bc5cc, 0x1c124638f35ee8ed, 0x09ca3a9141e83a38, 0x44daaf3e7411127b -, 0x0e54717b6c2fcd10, 0x518ab46b26d5914b, 0x528ac6c82341e833, 0x2247fa99d41f4672, 0xabe30c65c0f327a2, 0x3ac74e012b77e1b4, 0x35defd694c0e86b3, 0x7c382e10bfe60e4e, 0xf37e382996b8461c, 0x4d47481c53631e1a, 0xac8f167884f7b7b1, 0x5ae1bb6ab1a4c643 -, 0x63eb02590829df80, 0x623126862a793fa1, 0x6e1e242f1ce09807, 0x7bf96130aaecfd2b, 0xedc5e9ea10bff70a, 0x66b548233b94d26e, 0x70c70ee4594d30ab, 0x79b0006c8811353e, 0x4352792c91710c1f, 0x0c7bf15181a9f539, 0xfc995ee769e3779c, 0x44871c6cb9dcedcd -, 0x0d180bbf2c9a046b, 0x5445c598c45d0cd9, 0xdefb32386875fb94, 0x5b0d235355660f35, 0xbe1dea825b3a7973, 0x10658ec4e1bbe147, 0x48af5e87fad77504, 0x55f5d3c94a7dd694, 0xa9a3e7062cad6ba2, 0x36c0a7e3f9e0ea31, 0xc4bd65217010aebc, 0x1d031dfc8b9fb598 -, 0xe3621c104113889e, 0x774b77ee1e6a6477, 0x124c5b8a07785fd7, 0x5a6c0df18188cada, 0xf4adcd545e72d7be, 0x38100fffb66ba966, 0x2100cbe35fe4a4d0, 0x4489be2df052c175, 0xa03a22403b26899f, 0x5ae4a0a0fec13928, 0x89dfbfb802795eaa, 0x34917e9c4ecf2532 -, 0x64b93674c60cbbb3, 0x25c098506334c71d, 0x8a723f66f1ee34e1, 0x3a960adf48f141e4, 0x659f386695e440bb, 0x577a0fbf6e8095e6, 0x8ef419b0f4b25496, 
0x044176a30b9e465b, 0x7a98705df2013e6f, 0x77d0b2483aa95ce7, 0x309e917b978effd7, 0x08f1e55bfe942c7f -, 0xfc241629b8d613c8, 0x140f2e35cd68949f, 0x38899f6a3ee4f9fa, 0x7abc8ecdd300f3b5, 0xd3dad23505d23eaf, 0x75e73f09376b2c7c, 0x5644a663b60ec5c4, 0x511ade8afe1eaec9, 0xbb005fe4e1abca89, 0x2838de73b0ca1f6c, 0x800a6658b80d28c8, 0x48aaba61c91641ec -, 0x222759cab704d4e2, 0x106dd3c0ce85beca, 0xa1ce1ce341f69d03, 0x1651b210e8e4ee10, 0x47329a5e7133e136, 0x58c02f47dc9367b9, 0x09dcba56947b02af, 0x435c251178125b48, 0xd56979a3f0cd9315, 0x2f02b0a6422afddb, 0x23920f500731f32d, 0x0ab833238232cb5d -, 0xa7b3d1bfb0bb60db, 0x2342c2a03c6eaec2, 0xac5e6e5a14d5282e, 0x5b9a421ddc42a24b, 0x018506414543e056, 0x6d7c377c084954e6, 0x4f8bf71ed3db1ced, 0x5150dbc15ab10979, 0x00b50a1b373a7fbf, 0x140be5c3d3244705, 0x5005bfe96e5b7911, 0x77cea555bb133f3e -, 0x2ab1e1a9d7a973c6, 0x3897ac98314968d3, 0x9e0f74764b23c9c3, 0x2e5ecbbae41997cd, 0x43e2ea5648f12433, 0x3a515a0e4808e69c, 0x17d36c03c36bb343, 0x44cebd053481ce43, 0x89008656c21b0d76, 0x2f8513fcb9009be6, 0x2e223f90208a0e83, 0x3828c2d4efd36a73 -, 0xbf17d64f89a8527d, 0x59ebb42b9656151d, 0x7d7bc7245c7dc5ef, 0x191b682a0cb695ec, 0x8931172fad9f9add, 0x239b6cbbab2ebdcf, 0x76932f9ca7002dd1, 0x0c140548f858d8b5, 0x6c7adfddcf741ea5, 0x3b39c4b9e2e1a567, 0xc5135a25f87436fe, 0x690d8fecb7dd0ae0 -, 0xd782a618ecda10c2, 0x4f2a84b3134cf832, 0x35a81f71bbc955a4, 0x457f88ed64ae6398, 0xc27eb71c31479985, 0x4ae91808569aab32, 0xa5f2e9785a75eb11, 0x619cb199b837ed36, 0x0e7e5912b9484e40, 0x3b5831e87fdbcaf0, 0x49a2779c2d2b039d, 0x3d4b81e07f49061a -, 0xaa119b0fa222b55c, 0x265c1b11b42fd4e2, 0x6b4d28e519dd7637, 0x3d2da7900de5a4b2, 0x99b06586b5f21d63, 0x4ce62bd9e6a1ee18, 0xb671e753932f8c92, 0x390b7821d0987834, 0x1adf7c73c3f1fc2f, 0x78c636a8514a7af9, 0xaee3b35fe11e7533, 0x7fbd199278f6ffd7 -, 0x41aabbf4363d77de, 0x1b27fdf18b96bf6a, 0xda264a1dff9a981c, 0x36efc08530c0bf9a, 0x5bd8862a5d830854, 0x23d7c905e656e6cb, 0x4523324c5b64fdcf, 0x36627f376238665f, 0x564f53925c6d5ea4, 0x17c7cc86a1913022, 0xf90db52a543b009b, 0x15192dc91f8b994b -, 0x80bfa3c1a79ec6e2, 0x48fca8ea99772ecc, 0xfee6a3b98c0f1824, 0x46a8c75601b81e22, 0x2cb3c402a8895fcc, 0x1d1dff9c04305ce2, 0xc1aefe78e85971d7, 0x79c6a083ab5a80b2, 0x379c7bca5dbf2518, 0x2419358989b3ca02, 0xc9c42c9cfa5f470e, 0x4481c2ce91b14459 -, 0x6b04dea1ea26deca, 0x26ee3ceee0d0a101, 0xe36cc6bcd8fa4f26, 0x4d14709719764fbd, 0xe0572a706f1fef52, 0x0f75fb69a23f2ec1, 0x32ae4b04a864cf3b, 0x0b6373a91b944773, 0x1a8f2bc20bd088af, 0x586b0d5ace401747, 0xa0e6b094a3c51433, 0x1752a123c268c1c7 -, 0x643c2a93b5770ea1, 0x536cb9d1b71eeb43, 0x6bfb0525d0cc6b3f, 0x1f4dcfeec3adefc3, 0x28a0169dd0bf57f0, 0x1336c9aa20a35449, 0xbbcda068703ad7a1, 0x5e33478283c1e03d, 0xf1997733d18fdaf2, 0x789af507a17bb867, 0x79970c14d5695613, 0x79452342e845256f -, 0x6c12f9367a26a018, 0x11beda1c8f9cdfbe, 0x720e6ddf24b30929, 0x7706e91e3e544755, 0x4460381d3a6c9059, 0x7e01916c3678c424, 0x6024355a61d2bb07, 0x68bae01d79c869e2, 0xf21cbcff285df659, 0x02f7ee6aeb57c933, 0xce0f078c17266467, 0x039b4fdb5170a103 -, 0xd5de0fec61a4ae1b, 0x33d37a152a778695, 0xea64e40e6a10ded9, 0x1f1d394373bdb213, 0xf63598b6ef59fd14, 0x57922adc3ae52283, 0xe39a90e18b76f4a1, 0x27f3dbebd98a9dae, 0x18179dd9c03804b3, 0x511d72c1912e2d73, 0x88e1f6d24b2f3225, 0x56009999cdc2997f -, 0xda6df977b7d82fe4, 0x76f746bba63da226, 0x0b5facfc3bf13bd7, 0x4a31eb04f66f0e18, 0x8ace73d5e7cfe28f, 0x19aa731bc30c20b1, 0xa91979fe73400317, 0x6795ce71a09c7c9f, 0x93d55501933700ba, 0x3850eaf08b1fd14d, 0x450c5abc89edca71, 0x1be5db848bdfa5ef -, 0x77667d3f4fcf082b, 0x673b6e6c4824bc45, 0x6f22c12a5fe0ed6d, 
0x006ee6722b5dfed1, 0xb47a13c1468d0c62, 0x40564879a378e6e4, 0x0bc6b553a9d3ab58, 0x21761c79e44dfcfd, 0x66f36ed3eb1050fb, 0x2e67df1312dd01d3, 0x48744c4a68dbf2ad, 0x7844962b6d6e039c -, 0xe07b5675d378b65f, 0x336262aa3d2c1df0, 0x320a5667d78c2e2b, 0x4f668dd96dda5e2a, 0xe21556795c7b8470, 0x3061905b2ef82bb1, 0xaee53211472206b6, 0x1f87377fee0d7a39, 0xdac58c52a3b1a0c7, 0x6e3c4ce04f0d7ffd, 0xfdffec45d4a3990f, 0x4b5340f79e2ae2c2 -, 0x0537c8b7b3d1f332, 0x55292744ae35ee1a, 0x42336d0e6d057f1e, 0x5ac40e9e645cb3d7, 0x848f7b7f845e46c7, 0x74bda86736eff150, 0x891acf622baf4f35, 0x14bcef9cf39667bb, 0x9aa1354d9731b9b3, 0x27e855a19295e59f, 0x1a829a8e10662ed0, 0x3bbc43f9ec4437a7 -, 0x8bfa8b1cb1de5341, 0x3432778068d35549, 0xe3d807da41f25a48, 0x1bb6ee1ce2efe552, 0x08d9bded0bd3affc, 0x290f1c5299a917a8, 0xda8dfd79562f8939, 0x1bf7aae68686211f, 0x2ab6daf9bc860765, 0x7bef6e2f0eb58a0b, 0x8746faab7c439b94, 0x017ea87750bb8bda -, 0xf8dfeb22239c9a7c, 0x35cec0d2887b3a13, 0x68aa94ac601f1606, 0x7470553f8ba61767, 0x37894f91c9eac410, 0x55b22aeb8337f732, 0x53f8d90f29a2fe94, 0x0aec068aec69023a, 0x40506162ad6182ee, 0x6a327ff1ac1e5475, 0x968d7095492df3c8, 0x3f93f46195f67521 -, 0x4983bca28970d546, 0x2716b931296b53c2, 0xf42b013266b6f8b3, 0x76f29b084b6a369f, 0x8e28749222216249, 0x4f2fa1d3a6c1acfd, 0x0ee66697eab8f954, 0x37c33e28fec0cce5, 0x7d0419e2bafd1dd1, 0x01f04d4299b94daa, 0x5ec06abbc1e5c7e6, 0x3a24c66060ed72a9 -, 0x0db764e15f960f26, 0x1d5973d5d59f9c3a, 0xf3dc2608dc6d9149, 0x1d80e0461b72f518, 0x2264dccd49c8b09c, 0x1f03e7a246334d5e, 0x2d6e38871b1fc2ad, 0x418588ae4f284bd3, 0x3efb071bafe1afa2, 0x0799ba0c80bdd8dc, 0xa6b273222dcc4a76, 0x13859f08ac8a4b23 -, 0x0194acc2663c5acb, 0x459fa55bd0bbedf6, 0x1b055550f06f8cc1, 0x09e5fad46599ea75, 0x6b3916ef772958a3, 0x4aaaa5c18093a431, 0x8e1503e36610f594, 0x620ef55048a263b9, 0x5a28963c8cb8ecbc, 0x6aee46b1b740c15a, 0x67e39606f59cfea9, 0x13a579e3777ca8b1 -, 0x45ad92f61cbb8de3, 0x53068a1a42460eab, 0x9b163546de379578, 0x07bf38a7cecd4860, 0xf84c77031d282de1, 0x402aed6399f78ffc, 0xfb83dd20295f6d45, 0x3702e257340d2ecd, 0xb8db2d8b979b97c8, 0x617526d2a50b0c51, 0xd86f6278313017db, 0x2f35eedec55f9d92 -, 0xeecb69493517973b, 0x7a111a74e0baf09a, 0xb82c6da8ec39f63d, 0x4217076312833746, 0x5d36d11f3dda88d9, 0x7baebcb360f2a887, 0x9829b62d093d6cbb, 0x10f17a2f6edf28fd, 0xfe3efa4353f40626, 0x731ca3065c118e34, 0x6185678827960895, 0x07f906a4f4c6355c -, 0x361d9cd10e657142, 0x2b5f5d452dd861ce, 0xa3e01df05d04b69a, 0x533723bc4cfcc0db, 0x820384afa1bbccb5, 0x4e67e941595d8dfd, 0x0f8da50839e13646, 0x6887a0573a596968, 0xe93dd1df5ace7343, 0x0d4076f28ecf96c8, 0x0ba2f854988074c1, 0x5eb2a314a41a40b6 -, 0x49ff6d27a676b27e, 0x15f7ca40acd5114e, 0xc171f9a750d7da95, 0x3bedbe891f79eb5c, 0x5b643bceb83f74ff, 0x088b1af3aa331a4c, 0xde294c7e0a60c4a9, 0x0a0770fc8120b151, 0xf09b757a0c7c1937, 0x34b797c03efd9c88, 0x051e3edb2c28cc49, 0x66db34ec5ac5122c -, 0x95fde0d3d3dc8cbf, 0x797897c8121818cf, 0x1fd46d197710f89d, 0x533a505803f809c5, 0xb60f1c090c9fd211, 0x4a7c3479af5c9d82, 0x4bfc3ffa4c8cf5a5, 0x6949f4a61306821f, 0xd814c949c67abcdc, 0x419a5e33166863c4, 0x9de646f6bd0895e0, 0x497cc1449a54545a -, 0x69eb31247fe126f2, 0x323c83233967f477, 0x52e0db4d3d78127d, 0x42a0e188e7b9380c, 0x3a6b011c46e34e7e, 0x79f4168aa9a0b4aa, 0x94270a25d708fa4d, 0x2bb28618cbc9cdc8, 0x741e46bb04606819, 0x02790c52fb2ce982, 0x6dbb92d0c6d0af10, 0x32aa96ae061e9412 -, 0x1376700c90d98eaa, 0x4d1dfe650c0a7136, 0xb397f8eef89aff20, 0x4836ac4a041bae37, 0xf37c1076a80a02b8, 0x0d063fa2467b3a37, 0x498f2617b56b7e7b, 0x65ef1194db859a5d, 0xd1fe25d5d28ffcb6, 0x228ee6f49459c083, 0x6b7e82b3b009b15b, 
0x713b185ef1fccbfc -, 0x552468f1ff60c298, 0x2b7ba65d02519614, 0x8a86ad90ff0816c2, 0x7bf9249284bd02e5, 0x3008c56e474c2d10, 0x171473b77f804540, 0x15fb79d07bdea766, 0x66ac67c7b9b0951f, 0x34bca15bb6d2f652, 0x13c63dd2687d617b, 0xc515ae237715c19c, 0x0e543c6765fbfef2 -, 0x668c80faf156fb5e, 0x1e2e9e3b3d9962b8, 0x89ebaa264394e113, 0x322add21cf1659cf, 0xf9e6e26733619f8e, 0x723bfc8b792147f0, 0x79aef2837d7e092f, 0x1aa61c59290b5011, 0x9955ae576a499cd3, 0x2c3d6e6a5a1ce0da, 0xb864cfa199a8676b, 0x4961a21f1080285f -, 0x828e184adf9d997b, 0x0c84bda97e7ce725, 0xe6974677094cfcc5, 0x4ec8cd773946105b, 0xa48681bcc95fb5c6, 0x6ade87f8f7a5f269, 0x9b97628fdd39c03d, 0x3bde0ee1f19f1842, 0x4ef8c8fb117c0ca1, 0x769bf8f8d07de9bf, 0xc8f5f435b78a57e5, 0x79987aa861bbcf9c -, 0x7f6c557204b02022, 0x119bd819111c69d1, 0xf0c61ef00b3eb70b, 0x4317f0511bfb7b39, 0x36a2b944e84d608e, 0x1c1a3862da3369cb, 0x37dbf471085f1775, 0x3835751e107419ad, 0x04ab0c84bb07a3fe, 0x63758bfbc7df13a0, 0x15ffd20cb554f23e, 0x1ff11c442b1515b7 -, 0x171377f1bf937186, 0x615efe82b83538f8, 0x321e7cfae352a761, 0x7af02427d7241502, 0x86546e47f2cc559f, 0x65a1d8a017659d75, 0xc95d8aa5b8bfdac9, 0x01e887cb68990623, 0xf1f8ee8c466bcc3d, 0x40ce5e4f2ba3908f, 0xd2b81a3480c16b35, 0x51625d3eabf708cd -, 0x44d770a210105739, 0x7f1de74a022958a0, 0xfbe4c91bd1e8f732, 0x204fbacb13586460, 0x97d79097d62e3cf8, 0x541ad5591934b114, 0xfdfb47919c141909, 0x354926e5244fdecf, 0x6291b0a0e2e994b0, 0x2b9a9a69d3a6c3d1, 0x8189be54302371e7, 0x3645c65df1a881cd -, 0xdf0460f445e3877b, 0x7ea384dc52d0d26e, 0x0c2e5f768d46b6b0, 0x1f6e62daa7c5d4e6, 0xf8b026b33b2343ee, 0x2b7183c8767d372c, 0xbd45d1b6b6731517, 0x4ddb3d287c470d60, 0x1031dba40263ece2, 0x4e737fa0d659045f, 0x8cbc98d07d09b455, 0x34a35128a2bcb7f5 }; - - -#endif \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/README.md b/ffi-deps/FourQlib/FourQ_64bit_and_portable/README.md deleted file mode 100644 index f54c03c..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/README.md +++ /dev/null @@ -1,125 +0,0 @@ -# FourQlib v3.0 (C Edition): portable and 64-bit optimized implementation - -## Contents - -The `FourQ_64bit_and_portable` folder contains: - -* [`FourQ_64bit_and_portable/Visual Studio/`](Visual%20Studio/): folder with Visual Studio 2015 solution and -project files for compilation and testing in Windows. -* [`FourQ_64bit_and_portable/makefile`](makefile): Makefile for compilation using GNU GCC or clang compilers -on Linux. -* Main .c and .h files: library and header files. Public API for ECC scalar multiplication, key exchange and signatures is in -[`FourQ_64bit_and_portable/FourQ_api.h`](FourQ_api.h). -* [`FourQ_64bit_and_portable/AMD64/`](AMD64/): folder with library files for optimized x64 implementation. -* [`FourQ_64bit_and_portable/ARM64/`](ARM64/): folder with library files for optimized 64-bit ARM -implementation. -* [`FourQ_64bit_and_portable/generic/`](generic/): folder with library files for portable implementation. -* [`FourQ_64bit_and_portable/tests/`](tests/): test files. -* [`FourQ_64bit_and_portable/README.md`](README.md): this readme file. - -## Supported platforms - -This implementation is supported in a wide range of platforms including x64, x86, 32-bit ARM and 64-bit ARM, -running Windows or Linux. We have tested the library with Microsoft Visual Studio 2015, GNU GCC v4.9 and -clang v3.8. - -See instructions below to choose an implementation option and compile on one of the supported platforms. 
- -## Complementary crypto functions - -Random values are generated with `/dev/urandom` in the case of Linux, and with the function `BCryptGenRandom()` in the case of Windows. - -The library includes an implementation of SHA-512 which is used by default by SchnorrQ signatures. - -Users can experiment with different options by replacing functions in the `random` and `sha512` folders and -applying the corresponding changes to the settings in [`FourQ.h`](FourQ.h). - -## Implementation options - -The following compilation options are available for the `FourQ_64bit_and_portable` implementation: - -* A portable implementation (enabled by the "GENERIC" option). -* Optimized implementations for x64 and 64-bit ARM (ARMv8). Note that the rest of platforms are only supported - by the generic implementation. -* Use of AVX or AVX2 instructions enabled by defining `_AVX_` or `_AVX2_` (Windows) or by the "AVX" and "AVX2" - options (Linux). -* Optimized x64 assembly implementations in Linux. -* Use of fast endomorphisms enabled by the "USE_ENDO" option. - -Follow the instructions below to configure these different options. - -## Instructions for Windows - -### Building the library with Visual Studio - -Open the solution file ([`FourQ.sln`](Visual%20Studio/FourQ/FourQ.sln)) in Visual Studio 2015, select -one of the available configurations from -the Solution Configurations menu ("Release" corresponding to the high-speed x64 implementation and "Generic" -corresponding to the portable implementation) and select one of the Solution Platforms (x64 or Win32). Note -that Win32 is only supported with the "Generic" solution configuration. - -By default, `USE_ENDO=true` and (for x64) `_AVX_` is defined. To modify this configuration, go to the property -window of the FourQ project, go to `Configuration Properties > C/C++ > Preprocessor`. Make any suitable changes, -e.g., delete `_AVX_` if AVX instructions are not supported, replace `_AVX_` by `_AVX2_` if AVX2 instructions -are supported, or set `USE_ENDO=true` or `false`. Repeat these steps for the `fp_tests`, `ecc_tests` and `crypto_tests` projects. - -Finally, select "Build Solution" from the "Build" menu. - -### Running the tests - -After building the solution, run `fp_tests.exe`, `ecc_tests.exe` and `crypto_tests.exe`. - -### Using the library - -After building the solution, add the `FourQ.lib` file to the set of References for a project, and add -[`FourQ.h`](FourQ.h) and [`FourQ_api.h`](FourQ_api.h) to the list of header files of a project. - -## Instructions for Linux - -### Building the library and executing the tests with GNU GCC or clang - -To compile on Linux using the GNU GCC compiler or the clang compiler, execute the following command from the -command prompt: - -```sh -$ make ARCH=[x64/x86/ARM/ARM64] CC=[gcc/clang] ASM=[TRUE/FALSE] AVX=[TRUE/FALSE] AVX2=[TRUE/FALSE] - EXTENDED_SET=[TRUE/FALSE] USE_ENDO=[TRUE/FALSE] GENERIC=[TRUE/FALSE] SERIAL_PUSH=[TRUE/FALSE] -``` - -After compilation, run `fp_tests`, `ecc_tests` or `crypto_tests`. - -By default GNU GCC is used, as well as the endomorphisms and the extended settings. - -In the case of x64, AVX2 instructions and the high-speed assembly implementation are enabled by default. -In the case of x86 and ARM, the portable ("GENERIC") implementation is used by default. 
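Assuming the option names listed above, spelling out the x64 defaults explicitly (rather than relying on them implicitly) should be equivalent to a plain `make ARCH=x64`; this is only a sketch of the flag syntax, not an additional supported configuration:

```sh
$ make ARCH=x64 CC=gcc ASM=TRUE AVX2=TRUE EXTENDED_SET=TRUE USE_ENDO=TRUE GENERIC=FALSE
```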
- -For example, to compile the optimized x64 implementation in assembly with GNU GCC using the efficient -endomorphisms on a machine with AVX2 support (e.g, Intel's Haswell or Broadwell), execute: - -```sh -$ make ARCH=x64 -``` - -For example, to compile the optimized ARM64 implementation with GNU GCC using the efficient endomorphisms, -execute: - -```sh -$ make ARCH=ARM64 -``` - -As another example, to compile the portable implementation with clang using the efficient endomorphisms -on an x86 machine, execute: - -```sh -$ make ARCH=x86 CC=clang -``` - -`SERIAL_PUSH` can be enabled in some platforms (e.g., AMD without AVX2 support) to boost performance. - -By default `EXTENDED_SET` is enabled, which sets the following compilation flags: `-fwrapv -fomit-frame-pointer --march=native`. To disable this, use `EXTENDED_SET=FALSE`. -Users are encouraged to experiment with the different flag options. - -Whenever an unsupported configuration is applied, the following message will be displayed: `#error -- "Unsupported configuration". -For example, the use of assembly or any of the AVX options is not supported when selecting the portable implementation -(i.e., if `GENERIC=TRUE` or if `ARCH=[x86/ARM]`). diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/FourQ/FourQ.sln b/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/FourQ/FourQ.sln deleted file mode 100644 index e2aa675..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/FourQ/FourQ.sln +++ /dev/null @@ -1,72 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 14 -VisualStudioVersion = 14.0.25420.1 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "FourQ", "FourQ.vcxproj", "{719F1A49-62B2-41E2-B500-40FAD83AB12A}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ecc_tests", "..\ecc_tests\ecc_tests.vcxproj", "{A6DB2ADB-C570-47D5-BAAA-06904D60C091}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "crypto_tests", "..\crypto_tests\crypto_tests.vcxproj", "{47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "fp_tests", "..\fp_tests\fp_tests.vcxproj", "{D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Win32 = Debug|Win32 - Debug|x64 = Debug|x64 - Generic|Win32 = Generic|Win32 - Generic|x64 = Generic|x64 - Release|Win32 = Release|Win32 - Release|x64 = Release|x64 - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {719F1A49-62B2-41E2-B500-40FAD83AB12A}.Debug|Win32.ActiveCfg = Debug|Win32 - {719F1A49-62B2-41E2-B500-40FAD83AB12A}.Debug|Win32.Build.0 = Debug|Win32 - {719F1A49-62B2-41E2-B500-40FAD83AB12A}.Debug|x64.ActiveCfg = Debug|x64 - {719F1A49-62B2-41E2-B500-40FAD83AB12A}.Debug|x64.Build.0 = Debug|x64 - {719F1A49-62B2-41E2-B500-40FAD83AB12A}.Generic|Win32.ActiveCfg = Generic|Win32 - {719F1A49-62B2-41E2-B500-40FAD83AB12A}.Generic|Win32.Build.0 = Generic|Win32 - {719F1A49-62B2-41E2-B500-40FAD83AB12A}.Generic|x64.ActiveCfg = Generic|x64 - {719F1A49-62B2-41E2-B500-40FAD83AB12A}.Generic|x64.Build.0 = Generic|x64 - {719F1A49-62B2-41E2-B500-40FAD83AB12A}.Release|Win32.ActiveCfg = Release|Win32 - {719F1A49-62B2-41E2-B500-40FAD83AB12A}.Release|x64.ActiveCfg = Release|x64 - {719F1A49-62B2-41E2-B500-40FAD83AB12A}.Release|x64.Build.0 = Release|x64 - {A6DB2ADB-C570-47D5-BAAA-06904D60C091}.Debug|Win32.ActiveCfg = Debug|Win32 - 
{A6DB2ADB-C570-47D5-BAAA-06904D60C091}.Debug|Win32.Build.0 = Debug|Win32 - {A6DB2ADB-C570-47D5-BAAA-06904D60C091}.Debug|x64.ActiveCfg = Debug|x64 - {A6DB2ADB-C570-47D5-BAAA-06904D60C091}.Debug|x64.Build.0 = Debug|x64 - {A6DB2ADB-C570-47D5-BAAA-06904D60C091}.Generic|Win32.ActiveCfg = Generic|Win32 - {A6DB2ADB-C570-47D5-BAAA-06904D60C091}.Generic|Win32.Build.0 = Generic|Win32 - {A6DB2ADB-C570-47D5-BAAA-06904D60C091}.Generic|x64.ActiveCfg = Generic|x64 - {A6DB2ADB-C570-47D5-BAAA-06904D60C091}.Generic|x64.Build.0 = Generic|x64 - {A6DB2ADB-C570-47D5-BAAA-06904D60C091}.Release|Win32.ActiveCfg = Release|Win32 - {A6DB2ADB-C570-47D5-BAAA-06904D60C091}.Release|x64.ActiveCfg = Release|x64 - {A6DB2ADB-C570-47D5-BAAA-06904D60C091}.Release|x64.Build.0 = Release|x64 - {47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04}.Debug|Win32.ActiveCfg = Debug|Win32 - {47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04}.Debug|Win32.Build.0 = Debug|Win32 - {47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04}.Debug|x64.ActiveCfg = Debug|x64 - {47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04}.Debug|x64.Build.0 = Debug|x64 - {47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04}.Generic|Win32.ActiveCfg = Generic|Win32 - {47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04}.Generic|Win32.Build.0 = Generic|Win32 - {47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04}.Generic|x64.ActiveCfg = Generic|x64 - {47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04}.Generic|x64.Build.0 = Generic|x64 - {47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04}.Release|Win32.ActiveCfg = Release|Win32 - {47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04}.Release|x64.ActiveCfg = Release|x64 - {47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04}.Release|x64.Build.0 = Release|x64 - {D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC}.Debug|Win32.ActiveCfg = Debug|Win32 - {D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC}.Debug|Win32.Build.0 = Debug|Win32 - {D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC}.Debug|x64.ActiveCfg = Debug|x64 - {D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC}.Debug|x64.Build.0 = Debug|x64 - {D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC}.Generic|Win32.ActiveCfg = Generic|Win32 - {D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC}.Generic|Win32.Build.0 = Generic|Win32 - {D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC}.Generic|x64.ActiveCfg = Generic|x64 - {D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC}.Generic|x64.Build.0 = Generic|x64 - {D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC}.Release|Win32.ActiveCfg = Release|Win32 - {D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC}.Release|x64.ActiveCfg = Release|x64 - {D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC}.Release|x64.Build.0 = Release|x64 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection -EndGlobal diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/FourQ/FourQ.vcxproj b/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/FourQ/FourQ.vcxproj deleted file mode 100644 index b002fc0..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/FourQ/FourQ.vcxproj +++ /dev/null @@ -1,245 +0,0 @@ - - - - - Debug - Win32 - - - Debug - x64 - - - Generic - Win32 - - - Generic - x64 - - - Release - Win32 - - - Release - x64 - - - - {719F1A49-62B2-41E2-B500-40FAD83AB12A} - Win32Proj - Core - FourQ - - - - StaticLibrary - true - Unicode - v140 - - - StaticLibrary - false - true - Unicode - v140 - - - StaticLibrary - false - true - Unicode - v140 - - - StaticLibrary - v140 - - - StaticLibrary - v140 - - - StaticLibrary - v140 - - - - - - - - - - - - - - - - - - - - - Level3 - Disabled - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - - - Windows - true - - - bcrypt.lib - - - - - Level3 - - - MaxSpeed - true - true - 
WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) - - - Windows - true - true - true - - - bcrypt.lib - - - - - Level4 - - - MaxSpeed - true - true - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - - - Windows - true - true - true - - - bcrypt.lib - - - - - - - MaxSpeed - AdvancedVectorExtensions - AVX - None - None - None - Neither - Default - false - false - NoListing - Level4 - true - true - __WINDOWS__; _AMD64_; _AVX_; USE_ENDO=true; - true - - - - true - bcrypt.lib - - - - - - - MaxSpeed - AdvancedVectorExtensions - AVX - None - None - None - Neither - Default - false - false - NoListing - Level4 - true - true - __WINDOWS__; _AMD64_; _GENERIC_; USE_ENDO=true; - true - - - - - true - bcrypt.lib - - - - - - - AdvancedVectorExtensions - Disabled - Level4 - true - false - __WINDOWS__; _AMD64_; _AVX_; USE_ENDO=true; - true - Default - MultiThreadedDebugDLL - - - - true - bcrypt.lib - - - - - true - true - - - - - - true - true - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/FourQ/FourQ.vcxproj.filters b/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/FourQ/FourQ.vcxproj.filters deleted file mode 100644 index f2aac4f..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/FourQ/FourQ.vcxproj.filters +++ /dev/null @@ -1,78 +0,0 @@ - - - - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx - - - {93995380-89BD-4b04-88EB-625FBE52EBFB} - h;hpp;hxx;hm;inl;inc;xsd - - - {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} - rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms - - - {7bb64693-4b46-4a78-b3d6-9a77d6f44c81} - - - {6739ac49-cc8b-46e9-8303-bb86f346d251} - - - - - Header Files\generic - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files\x64 - - - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Header Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/crypto_tests/crypto_tests.vcxproj b/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/crypto_tests/crypto_tests.vcxproj deleted file mode 100644 index 21abada..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/crypto_tests/crypto_tests.vcxproj +++ /dev/null @@ -1,237 +0,0 @@ - - - - - Debug - Win32 - - - Debug - x64 - - - Generic - Win32 - - - Generic - x64 - - - Release - Win32 - - - Release - x64 - - - - {47A9BC7F-1C7F-4BB3-B5D1-7AC7DDAC0E04} - Win32Proj - MyLibrary - crypto_tests - - - - Application - true - Unicode - v140 - - - Application - false - true - Unicode - v140 - - - Application - false - true - Unicode - v140 - - - Application - v140 - - - Application - v140 - - - Application - v140 - - - - - - - - - - - - - - - - - true - - - false - - - false - - - - - - Level3 - Disabled - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - - - Console - true - true - true - - - - - Level4 - - - MaxSpeed - true - true - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - - - Console - true - true - true - - - - - true - - - MaxSpeed - AdvancedVectorExtensions - AVX - None - None - None - Neither - Default - false - false - Level4 - - __WINDOWS__; _AMD64_; USE_ENDO=true; - - - 
UseLinkTimeCodeGeneration - true - - - - - - - - - - - true - - - MaxSpeed - AdvancedVectorExtensions - AVX - None - None - None - Neither - Default - false - false - Level4 - - - __WINDOWS__; _AMD64_; _GENERIC_; USE_ENDO=true; - - - UseLinkTimeCodeGeneration - true - - - - - - - - - - - true - - - - - MSVCRTD - - - true - AdvancedVectorExtensions - Disabled - __WINDOWS__; _AMD64_; USE_ENDO=true; - - - - - {719f1a49-62b2-41e2-b500-40fad83ab12a} - false - true - false - true - false - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/crypto_tests/crypto_tests.vcxproj.filters b/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/crypto_tests/crypto_tests.vcxproj.filters deleted file mode 100644 index 6305347..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/crypto_tests/crypto_tests.vcxproj.filters +++ /dev/null @@ -1,33 +0,0 @@ - - - - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx - - - {93995380-89BD-4b04-88EB-625FBE52EBFB} - h;hpp;hxx;hm;inl;inc;xsd - - - {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} - rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms - - - - - Header Files - - - Header Files - - - - - Source Files - - - Source Files - - - \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/ecc_tests/ecc_tests.vcxproj b/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/ecc_tests/ecc_tests.vcxproj deleted file mode 100644 index fdfc843..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/ecc_tests/ecc_tests.vcxproj +++ /dev/null @@ -1,237 +0,0 @@ - - - - - Debug - Win32 - - - Debug - x64 - - - Generic - Win32 - - - Generic - x64 - - - Release - Win32 - - - Release - x64 - - - - {A6DB2ADB-C570-47D5-BAAA-06904D60C091} - Win32Proj - MyLibrary - ecc_tests - - - - Application - true - Unicode - v140 - - - Application - false - true - Unicode - v140 - - - Application - false - true - Unicode - v140 - - - Application - v140 - - - Application - v140 - - - Application - v140 - - - - - - - - - - - - - - - - - true - - - false - - - false - - - - - - Level3 - Disabled - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - - - Console - true - true - true - - - - - Level4 - - - MaxSpeed - true - true - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - - - Console - true - true - true - - - - - true - - - MaxSpeed - AdvancedVectorExtensions - AVX - None - None - None - Neither - Default - false - false - Level4 - - __WINDOWS__; _AMD64_; USE_ENDO=true; - - - UseLinkTimeCodeGeneration - true - - - - - - - - - - - true - - - MaxSpeed - AdvancedVectorExtensions - AVX - None - None - None - Neither - Default - false - false - Level4 - - - __WINDOWS__; _AMD64_; _GENERIC_; USE_ENDO=true; - - - UseLinkTimeCodeGeneration - true - - - - - - - - - - - true - - - - - MSVCRTD - - - true - AdvancedVectorExtensions - Disabled - __WINDOWS__; _AMD64_; USE_ENDO=true; - - - - - {719f1a49-62b2-41e2-b500-40fad83ab12a} - false - true - false - true - false - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/ecc_tests/ecc_tests.vcxproj.filters b/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/ecc_tests/ecc_tests.vcxproj.filters deleted file mode 100644 index 7358a58..0000000 --- 
a/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/ecc_tests/ecc_tests.vcxproj.filters +++ /dev/null @@ -1,33 +0,0 @@ - - - - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx - - - {93995380-89BD-4b04-88EB-625FBE52EBFB} - h;hpp;hxx;hm;inl;inc;xsd - - - {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} - rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms - - - - - Header Files - - - Header Files - - - - - Source Files - - - Source Files - - - \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/fp_tests/fp_tests.vcxproj b/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/fp_tests/fp_tests.vcxproj deleted file mode 100644 index de497d4..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/fp_tests/fp_tests.vcxproj +++ /dev/null @@ -1,219 +0,0 @@ - - - - - Debug - Win32 - - - Debug - x64 - - - Generic - Win32 - - - Generic - x64 - - - Release - Win32 - - - Release - x64 - - - - {D36D493E-EFD2-4FF1-8CAE-2D16EEA76CAC} - Win32Proj - MyLibrary - fp_tests - 8.1 - - - - Application - true - Unicode - v140 - - - Application - false - true - Unicode - v140 - - - Application - false - true - Unicode - v140 - - - Application - v140 - - - Application - v140 - - - Application - v140 - - - - - - - - - - - - - - - - - true - - - false - - - false - - - - - - Level3 - Disabled - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - - - Console - true - - - - - Level3 - - - MaxSpeed - true - true - WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions) - - - Console - true - true - true - - - - - Level3 - - - MaxSpeed - true - true - __WINDOWS__; _X86_; _GENERIC_; USE_ENDO=true; - - - Console - true - true - true - - - - - true - - - MaxSpeed - AdvancedVectorExtensions - AVX - None - None - None - Neither - Default - false - false - __WINDOWS__; _AMD64_; USE_ENDO=true; - Level4 - - - - UseLinkTimeCodeGeneration - - - - - true - - - MaxSpeed - AdvancedVectorExtensions - AVX - None - None - None - Neither - Default - false - false - __WINDOWS__; _AMD64_; _GENERIC_; USE_ENDO=true; - Level4 - - - - - UseLinkTimeCodeGeneration - - - - - true - - - - - MSVCRTD - - - true - AdvancedVectorExtensions - Disabled - __WINDOWS__; _AMD64_; USE_ENDO=true; - - - - - - - - - {719f1a49-62b2-41e2-b500-40fad83ab12a} - - - - - - - - - - - \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/fp_tests/fp_tests.vcxproj.filters b/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/fp_tests/fp_tests.vcxproj.filters deleted file mode 100644 index ec0d183..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/Visual Studio/fp_tests/fp_tests.vcxproj.filters +++ /dev/null @@ -1,33 +0,0 @@ - - - - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx - - - {93995380-89BD-4b04-88EB-625FBE52EBFB} - h;hpp;hxx;hm;inl;inc;xsd - - - {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} - rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms - - - - - Header Files - - - Header Files - - - - - Source Files - - - Source Files - - - \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/crypto_util.c b/ffi-deps/FourQlib/FourQ_64bit_and_portable/crypto_util.c deleted file mode 100644 index 6a63402..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/crypto_util.c +++ /dev/null @@ -1,239 +0,0 @@ -/*********************************************************************************** -* FourQlib: 
a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: crypto utility functions -************************************************************************************/ - -#include "FourQ_internal.h" -#include "FourQ_params.h" -#include - -static digit_t mask4000 = (digit_t)1 << (sizeof(digit_t)*8 - 2); -static digit_t mask7fff = (digit_t)(-1) >> 1; - - -bool is_zero_ct(digit_t* a, unsigned int nwords) -{ // Check if multiprecision element is zero - digit_t x; - unsigned int i; - - x = a[0]; - for (i = 1; i < nwords; i++) { - x |= a[i]; - } - - return (bool)(1 ^ ((x | (0-x)) >> (RADIX-1))); -} - - -void encode(point_t P, unsigned char* Pencoded) -{ // Encode point P - // SECURITY NOTE: this function does not run in constant time. - digit_t temp1 = (P->x[1][NWORDS_FIELD-1] & mask4000) << 1; - digit_t temp2 = (P->x[0][NWORDS_FIELD-1] & mask4000) << 1; - - memmove(Pencoded, P->y, 32); - if (is_zero_ct((digit_t*)P->x, NWORDS_FIELD) == true) { - ((digit_t*)Pencoded)[2*NWORDS_FIELD-1] |= temp1; - } else { - ((digit_t*)Pencoded)[2*NWORDS_FIELD-1] |= temp2; - } -} - - -ECCRYPTO_STATUS decode(const unsigned char* Pencoded, point_t P) -{ // Decode point P - // SECURITY NOTE: this function does not run in constant time. - felm_t r, t, t0, t1, t2, t3, t4; - f2elm_t u, v, one = {0}; - digit_t sign_dec; - point_extproj_t R; - unsigned int i, sign; - - one[0][0] = 1; - memmove((unsigned char*)P->y, Pencoded, 32); // Decoding y-coordinate and sign - sign = (unsigned int)(Pencoded[31] >> 7); - P->y[1][NWORDS_FIELD-1] &= mask7fff; - - fp2sqr1271(P->y, u); - fp2mul1271(u, (felm_t*)&PARAMETER_d, v); - fp2sub1271(u, one, u); - fp2add1271(v, one, v); - - fpsqr1271(v[0], t0); // t0 = v0^2 - fpsqr1271(v[1], t1); // t1 = v1^2 - fpadd1271(t0, t1, t0); // t0 = t0+t1 - fpmul1271(u[0], v[0], t1); // t1 = u0*v0 - fpmul1271(u[1], v[1], t2); // t2 = u1*v1 - fpadd1271(t1, t2, t1); // t1 = t1+t2 - fpmul1271(u[1], v[0], t2); // t2 = u1*v0 - fpmul1271(u[0], v[1], t3); // t3 = u0*v1 - fpsub1271(t2, t3, t2); // t2 = t2-t3 - fpsqr1271(t1, t3); // t3 = t1^2 - fpsqr1271(t2, t4); // t4 = t2^2 - fpadd1271(t3, t4, t3); // t3 = t3+t4 - for (i = 0; i < 125; i++) { // t3 = t3^(2^125) - fpsqr1271(t3, t3); - } - - fpadd1271(t1, t3, t); // t = t1+t3 - mod1271(t); - if (is_zero_ct(t, NWORDS_FIELD) == true) { - fpsub1271(t1, t3, t); // t = t1-t3 - } - fpadd1271(t, t, t); // t = 2*t - fpsqr1271(t0, t3); // t3 = t0^2 - fpmul1271(t0, t3, t3); // t3 = t3*t0 - fpmul1271(t, t3, t3); // t3 = t3*t - fpexp1251(t3, r); // r = t3^(2^125-1) - fpmul1271(t0, r, t3); // t3 = t0*r - fpmul1271(t, t3, P->x[0]); // x0 = t*t3 - fpsqr1271(P->x[0], t1); - fpmul1271(t0, t1, t1); // t1 = t0*x0^2 - fpdiv1271(P->x[0]); // x0 = x0/2 - fpmul1271(t2, t3, P->x[1]); // x1 = t3*t2 - - fpsub1271(t, t1, t); - mod1271(t); - if (is_zero_ct(t, NWORDS_FIELD) == false) { // If t != t1 then swap x0 and x1 - fpcopy1271(P->x[0], t0); - fpcopy1271(P->x[1], P->x[0]); - fpcopy1271(t0, P->x[1]); - } - - mod1271(P->x[0]); - if (is_zero_ct((digit_t*)P->x, NWORDS_FIELD) == true) { - sign_dec = ((digit_t*)&P->x[1])[NWORDS_FIELD-1] >> (sizeof(digit_t)*8 - 2); - } else { - sign_dec = ((digit_t*)&P->x[0])[NWORDS_FIELD-1] >> (sizeof(digit_t)*8 - 2); - } - - if (sign != (unsigned int)sign_dec) { // If sign of x-coordinate decoded != input sign bit, then negate x-coordinate - fp2neg1271(P->x); - } - - point_setup(P, R); - if (ecc_point_validate(R) == false) { - fpneg1271(R->x[1]); - fpcopy1271(R->x[1], 
P->x[1]); - if (ecc_point_validate(R) == false) { // Final point validation - return ECCRYPTO_ERROR; - } - } - - return ECCRYPTO_SUCCESS; -} - - -void to_Montgomery(const digit_t* ma, digit_t* c) -{ // Converting to Montgomery representation - - Montgomery_multiply_mod_order(ma, (digit_t*)&Montgomery_Rprime, c); -} - - -void from_Montgomery(const digit_t* a, digit_t* mc) -{ // Converting from Montgomery to standard representation - digit_t one[NWORDS_ORDER] = {0}; - one[0] = 1; - - Montgomery_multiply_mod_order(a, one, mc); -} - - -void Montgomery_inversion_mod_order(const digit_t* ma, digit_t* mc) -{ // (Non-constant time) Montgomery inversion modulo the curve order using a^(-1) = a^(order-2) mod order - // This function uses the sliding-window method - sdigit_t i = 256; - unsigned int j, nwords = NWORDS_ORDER; - digit_t temp, bit = 0, count, mod2, k_EXPON = 5; // Fixing parameter k to 5 for the sliding windows method - digit_t modulus2[NWORDS_ORDER] = {0}, npoints = 16; - digit_t input_a[NWORDS_ORDER]; - digit_t table[16][NWORDS_ORDER]; // Fixing the number of precomputed elements to 16 (assuming k = 5) - digit_t mask = (digit_t)1 << (sizeof(digit_t)*8 - 1); // 0x800...000 - digit_t mask2 = ~((digit_t)(-1) >> k_EXPON); // 0xF800...000, assuming k = 5 - - // SECURITY NOTE: this function does not run in constant time because the modulus is assumed to be public. - - modulus2[0] = 2; - subtract((digit_t*)&curve_order, modulus2, modulus2, nwords); // modulus-2 - - // Precomputation stage - memmove((unsigned char*)&table[0], (unsigned char*)ma, 32); // table[0] = ma - Montgomery_multiply_mod_order(ma, ma, input_a); // ma^2 - for (j = 0; j < npoints - 1; j++) { - Montgomery_multiply_mod_order(table[j], input_a, table[j+1]); // table[j+1] = table[j] * ma^2 - } - - while (bit != 1) { // Shift (modulus-2) to the left until getting first bit 1 - i--; - temp = 0; - for (j = 0; j < nwords; j++) { - bit = (modulus2[j] & mask) >> (sizeof(digit_t)*8 - 1); - modulus2[j] = (modulus2[j] << 1) | temp; - temp = bit; - } - } - - // Evaluation stage - memmove((unsigned char*)mc, (unsigned char*)ma, 32); - bit = (modulus2[nwords-1] & mask) >> (sizeof(digit_t)*8 - 1); - while (i > 0) { - if (bit == 0) { // Square accumulated value because bit = 0 and shift (modulus-2) one bit to the left - Montgomery_multiply_mod_order(mc, mc, mc); // mc = mc^2 - i--; - for (j = (nwords - 1); j > 0; j--) { - SHIFTL(modulus2[j], modulus2[j-1], 1, modulus2[j], RADIX); - } - modulus2[0] = modulus2[0] << 1; - } else { // "temp" will store the longest odd bitstring with "count" bits s.t. 
temp <= 2^k - 1 - count = k_EXPON; - temp = (modulus2[nwords-1] & mask2) >> (sizeof(digit_t)*8 - k_EXPON); // Extracting next k bits to the left - mod2 = temp & 1; - while (mod2 == 0) { // if even then shift to the right and adjust count - temp = (temp >> 1); - mod2 = temp & 1; - count--; - } - for (j = 0; j < count; j++) { // mc = mc^count - Montgomery_multiply_mod_order(mc, mc, mc); - } - Montgomery_multiply_mod_order(mc, table[(temp-1) >> 1], mc); // mc = mc * table[(temp-1)/2] - i = i - count; - - for (j = (nwords - 1); j > 0; j--) { // Shift (modulus-2) "count" bits to the left - SHIFTL(modulus2[j], modulus2[j-1], count, modulus2[j], RADIX); - } - modulus2[0] = modulus2[0] << count; - } - bit = (modulus2[nwords - 1] & mask) >> (sizeof(digit_t)*8 - 1); - } -} - - -const char* FourQ_get_error_message(ECCRYPTO_STATUS Status) -{ // Output error/success message for a given ECCRYPTO_STATUS - struct error_mapping { - unsigned int index; - char* string; - } mapping[ECCRYPTO_STATUS_TYPE_SIZE] = { - {ECCRYPTO_ERROR, ECCRYPTO_MSG_ERROR}, - {ECCRYPTO_SUCCESS, ECCRYPTO_MSG_SUCCESS}, - {ECCRYPTO_ERROR_DURING_TEST, ECCRYPTO_MSG_ERROR_DURING_TEST}, - {ECCRYPTO_ERROR_UNKNOWN, ECCRYPTO_MSG_ERROR_UNKNOWN}, - {ECCRYPTO_ERROR_NOT_IMPLEMENTED, ECCRYPTO_MSG_ERROR_NOT_IMPLEMENTED}, - {ECCRYPTO_ERROR_NO_MEMORY, ECCRYPTO_MSG_ERROR_NO_MEMORY}, - {ECCRYPTO_ERROR_INVALID_PARAMETER, ECCRYPTO_MSG_ERROR_INVALID_PARAMETER}, - {ECCRYPTO_ERROR_SHARED_KEY, ECCRYPTO_MSG_ERROR_SHARED_KEY}, - {ECCRYPTO_ERROR_SIGNATURE_VERIFICATION, ECCRYPTO_MSG_ERROR_SIGNATURE_VERIFICATION}, - {ECCRYPTO_ERROR_HASH_TO_CURVE, ECCRYPTO_MSG_ERROR_HASH_TO_CURVE}, - }; - - if (Status >= ECCRYPTO_STATUS_TYPE_SIZE || mapping[Status].string == NULL) { - return "Unrecognized ECCRYPTO_STATUS"; - } else { - return mapping[Status].string; - } -}; \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/eccp2.c b/ffi-deps/FourQlib/FourQ_64bit_and_portable/eccp2.c deleted file mode 100644 index 6b50b9a..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/eccp2.c +++ /dev/null @@ -1,486 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: ECC operations over GF(p^2) exploiting endomorphisms -* -* This code is based on the paper "FourQ: four-dimensional decompositions on a -* Q-curve over the Mersenne prime" by Craig Costello and Patrick Longa, in Advances -* in Cryptology - ASIACRYPT, 2015. -* Preprint available at http://eprint.iacr.org/2015/565. 
-************************************************************************************/ - -#include "FourQ_internal.h" - - -#if (USE_ENDO == true) - -// Fixed GF(p^2) constants for the endomorphisms -static uint64_t ctau1[4] = {0x74DCD57CEBCE74C3, 0x1964DE2C3AFAD20C, 0x12, 0x0C}; -static uint64_t ctaudual1[4] = {0x9ECAA6D9DECDF034, 0x4AA740EB23058652, 0x11, 0x7FFFFFFFFFFFFFF4}; -static uint64_t cphi0[4] = {0xFFFFFFFFFFFFFFF7, 0x05, 0x4F65536CEF66F81A, 0x2553A0759182C329}; -static uint64_t cphi1[4] = {0x07, 0x05, 0x334D90E9E28296F9, 0x62C8CAA0C50C62CF}; -static uint64_t cphi2[4] = {0x15, 0x0F, 0x2C2CB7154F1DF391, 0x78DF262B6C9B5C98}; -static uint64_t cphi3[4] = {0x03, 0x02, 0x92440457A7962EA4, 0x5084C6491D76342A}; -static uint64_t cphi4[4] = {0x03, 0x03, 0xA1098C923AEC6855, 0x12440457A7962EA4}; -static uint64_t cphi5[4] = {0x0F, 0x0A, 0x669B21D3C5052DF3, 0x459195418A18C59E}; -static uint64_t cphi6[4] = {0x18, 0x12, 0xCD3643A78A0A5BE7, 0x0B232A8314318B3C}; -static uint64_t cphi7[4] = {0x23, 0x18, 0x66C183035F48781A, 0x3963BC1C99E2EA1A}; -static uint64_t cphi8[4] = {0xF0, 0xAA, 0x44E251582B5D0EF0, 0x1F529F860316CBE5}; -static uint64_t cphi9[4] = {0xBEF, 0x870, 0x14D3E48976E2505, 0xFD52E9CFE00375B}; -static uint64_t cpsi1[4] = {0xEDF07F4767E346EF, 0x2AF99E9A83D54A02, 0x13A, 0xDE}; -static uint64_t cpsi2[4] = {0x143, 0xE4, 0x4C7DEB770E03F372, 0x21B8D07B99A81F03}; -static uint64_t cpsi3[4] = {0x09, 0x06, 0x3A6E6ABE75E73A61, 0x4CB26F161D7D6906}; -static uint64_t cpsi4[4] = {0xFFFFFFFFFFFFFFF6, 0x7FFFFFFFFFFFFFF9, 0xC59195418A18C59E, 0x334D90E9E28296F9}; - -// Fixed integer constants for the decomposition -// Close "offset" vector -static uint64_t c1 = {0x72482C5251A4559C}; -static uint64_t c2 = {0x59F95B0ADD276F6C}; -static uint64_t c3 = {0x7DD2D17C4625FA78}; -static uint64_t c4 = {0x6BC57DEF56CE8877}; -// Optimal basis vectors -static uint64_t b11 = {0x0906FF27E0A0A196}; -static uint64_t b12 = {0x1363E862C22A2DA0}; -static uint64_t b13 = {0x07426031ECC8030F}; -static uint64_t b14 = {0x084F739986B9E651}; -static uint64_t b21 = {0x1D495BEA84FCC2D4}; -static uint64_t b24 = {0x25DBC5BC8DD167D0}; -static uint64_t b31 = {0x17ABAD1D231F0302}; -static uint64_t b32 = {0x02C4211AE388DA51}; -static uint64_t b33 = {0x2E4D21C98927C49F}; -static uint64_t b34 = {0x0A9E6F44C02ECD97}; -static uint64_t b41 = {0x136E340A9108C83F}; -static uint64_t b42 = {0x3122DF2DC3E0FF32}; -static uint64_t b43 = {0x068A49F02AA8A9B5}; -static uint64_t b44 = {0x18D5087896DE0AEA}; -// Precomputed integers for fast-Babai rounding -static uint64_t ell1[4] = {0x259686E09D1A7D4F, 0xF75682ACE6A6BD66, 0xFC5BB5C5EA2BE5DF, 0x07}; -static uint64_t ell2[4] = {0xD1BA1D84DD627AFB, 0x2BD235580F468D8D, 0x8FD4B04CAA6C0F8A, 0x03}; -static uint64_t ell3[4] = {0x9B291A33678C203C, 0xC42BD6C965DCA902, 0xD038BF8D0BFFBAF6, 0x00}; -static uint64_t ell4[4] = {0x12E5666B77E7FDC0, 0x81CBDC3714983D82, 0x1B073877A22D8410, 0x03}; - - -/***********************************************/ -/********** CURVE/SCALAR FUNCTIONS ***********/ - -static __inline void ecc_tau(point_extproj_t P) -{ // Apply tau mapping to a point, P = tau(P) - // Input: P = (X1:Y1:Z1) on E in twisted Edwards coordinates - // Output: P = (Xfinal:Yfinal:Zfinal) on Ehat in twisted Edwards coordinates - f2elm_t t0, t1; - - fp2sqr1271(P->x, t0); // t0 = X1^2 - fp2sqr1271(P->y, t1); // t1 = Y1^2 - fp2mul1271(P->x, P->y, P->x); // X = X1*Y1 - fp2sqr1271(P->z, P->y); // Y = Z1^2 - fp2add1271(t0, t1, P->z); // Z = X1^2+Y1^2 - fp2sub1271(t1, t0, t0); // t0 = Y1^2-X1^2 - fp2add1271(P->y, P->y, P->y); 
// Y = 2*Z1^2 - fp2mul1271(P->x, t0, P->x); // X = X1*Y1*(Y1^2-X1^2) - fp2sub1271(P->y, t0, P->y); // Y = 2*Z1^2-(Y1^2-X1^2) - fp2mul1271(P->x, (felm_t*)&ctau1, P->x); // Xfinal = X*ctau1 - fp2mul1271(P->y, P->z, P->y); // Yfinal = Y*Z - fp2mul1271(P->z, t0, P->z); // Zfinal = t0*Z -} - - -static __inline void ecc_tau_dual(point_extproj_t P) -{ // Apply tau_dual mapping to a point, P = tau_dual(P) - // Input: P = (X1:Y1:Z1) on Ehat in twisted Edwards coordinates - // Output: P = (Xfinal,Yfinal,Zfinal,Tafinal,Tbfinal) on E, where Tfinal = Tafinal*Tbfinal, - // corresponding to (Xfinal:Yfinal:Zfinal:Tfinal) in extended twisted Edwards coordinates - f2elm_t t0, t1; - - fp2sqr1271(P->x, t0); // t0 = X1^2 - fp2sqr1271(P->z, P->ta); // Ta = Z1^2 - fp2sqr1271(P->y, t1); // t1 = Y1^2 - fp2add1271(P->ta, P->ta, P->z); // Z = 2*Z1^2 - fp2sub1271(t1, t0, P->ta); // Tafinal = Y1^2-X1^2 - fp2add1271(t0, t1, t0); // t0 = X1^2+Y1^2 - fp2mul1271(P->x, P->y, P->x); // X = X1*Y1 - fp2sub1271(P->z, P->ta, P->z); // Z = 2*Z1^2-(Y1^2-X1^2) - fp2mul1271(P->x, (felm_t*)&ctaudual1, P->tb); // Tbfinal = ctaudual1*X1*X1 - fp2mul1271(P->z, P->ta, P->y); // Yfinal = Z*Tafinal - fp2mul1271(P->tb, t0, P->x); // Xfinal = Tbfinal*t0 - fp2mul1271(P->z, t0, P->z); // Zfinal = Z*t0 -} - - -static __inline void ecc_delphidel(point_extproj_t P) -{ // Apply delta_phi_delta mapping to a point, P = delta(phi_W(delta_inv(P))), - // where phi_W is the endomorphism on the Weierstrass form. - // Input: P = (X1:Y1:Z1) on Ehat in twisted Edwards coordinates - // Output: P = (Xfinal:Yfinal:Zfinal) on Ehat in twisted Edwards coordinates - f2elm_t t0, t1, t2, t3, t4, t5, t6; - - fp2sqr1271(P->z, t4); // t4 = Z1^2 - fp2mul1271(P->y, P->z, t3); // t3 = Y1*Z1 - fp2mul1271(t4, (felm_t*)&cphi4, t0); // t0 = cphi4*t4 - fp2sqr1271(P->y, t2); // t2 = Y1^2 - fp2add1271(t0, t2, t0); // t0 = t0+t2 - fp2mul1271(t3, (felm_t*)&cphi3, t1); // t1 = cphi3*t3 - fp2sub1271(t0, t1, t5); // t5 = t0-t1 - fp2add1271(t0, t1, t0); // t0 = t0+t1 - fp2mul1271(t0, P->z, t0); // t0 = t0*Z1 - fp2mul1271(t3, (felm_t*)&cphi1, t1); // t1 = cphi1*t3 - fp2mul1271(t0, t5, t0); // t0 = t0*t5 - fp2mul1271(t4, (felm_t*)&cphi2, t5); // t5 = cphi2*t4 - fp2add1271(t2, t5, t5); // t5 = t2+t5 - fp2sub1271(t1, t5, t6); // t6 = t1-t5 - fp2add1271(t1, t5, t1); // t1 = t1+t5 - fp2mul1271(t6, t1, t6); // t6 = t1*t6 - fp2mul1271(t6, (felm_t*)&cphi0, t6); // t6 = cphi0*t6 - fp2mul1271(P->x, t6, P->x); // X = X1*t6 - fp2sqr1271(t2, t6); // t6 = t2^2 - fp2sqr1271(t3, t2); // t2 = t3^2 - fp2sqr1271(t4, t3); // t3 = t4^2 - fp2mul1271(t2, (felm_t*)&cphi8, t1); // t1 = cphi8*t2 - fp2mul1271(t3, (felm_t*)&cphi9, t5); // t5 = cphi9*t3 - fp2add1271(t1, t6, t1); // t1 = t1+t6 - fp2mul1271(t2, (felm_t*)&cphi6, t2); // t2 = cphi6*t2 - fp2mul1271(t3, (felm_t*)&cphi7, t3); // t3 = cphi7*t3 - fp2add1271(t1, t5, t1); // t1 = t1+t5 - fp2add1271(t2, t3, t2); // t2 = t2+t3 - fp2mul1271(t1, P->y, t1); // t1 = Y1*t1 - fp2add1271(t6, t2, P->y); // Y = t6+t2 - fp2mul1271(P->x, t1, P->x); // X = X*t1 - fp2mul1271(P->y, (felm_t*)&cphi5, P->y); // Y = cphi5*Y - fpneg1271(P->x[1]); // Xfinal = X^p - fp2mul1271(P->y, P->z, P->y); // Y = Y*Z1 - fp2mul1271(t0, t1, P->z); // Z = t0*t1 - fp2mul1271(P->y, t0, P->y); // Y = Y*t0 - fpneg1271(P->z[1]); // Zfinal = Z^p - fpneg1271(P->y[1]); // Yfinal = Y^p -} - - -static __inline void ecc_delpsidel(point_extproj_t P) -{ // Apply delta_psi_delta mapping to a point, P = delta(psi_W(delta_inv(P))), - // where psi_W is the endomorphism on the Weierstrass form. 
- // Input: P = (X1:Y1:Z1) on Ehat in twisted Edwards coordinates - // Output: P = (Xfinal:Yfinal:Zfinal) on Ehat in twisted Edwards coordinates - f2elm_t t0, t1, t2; - - fpneg1271(P->x[1]); // X = X1^p - fpneg1271(P->z[1]); // Z = Z1^p - fpneg1271(P->y[1]); // Y = Y1^p - fp2sqr1271(P->z, t2); // t2 = Z1^p^2 - fp2sqr1271(P->x, t0); // t0 = X1^p^2 - fp2mul1271(P->x, t2, P->x); // X = X1^p*Z1^p^2 - fp2mul1271(t2, (felm_t*)&cpsi2, P->z); // Z = cpsi2*Z1^p^2 - fp2mul1271(t2, (felm_t*)&cpsi3, t1); // t1 = cpsi3*Z1^p^2 - fp2mul1271(t2, (felm_t*)&cpsi4, t2); // t2 = cpsi4*Z1^p^2 - fp2add1271(t0, P->z, P->z); // Z = X1^p^2 + cpsi2*Z1^p^2 - fp2add1271(t0, t2, t2); // t2 = X1^p^2 + cpsi4*Z1^p^2 - fp2add1271(t0, t1, t1); // t1 = X1^p^2 + cpsi3*Z1^p^2 - fp2neg1271(t2); // t2 = -(X1^p^2 + cpsi4*Z1^p^2) - fp2mul1271(P->z, P->y, P->z); // Z = Y1^p*(X1^p^2 + cpsi2*Z1^p^2) - fp2mul1271(P->x, t2, P->x); // X = -X1^p*Z1^p^2*(X1^p^2 + cpsi4*Z1^p^2) - fp2mul1271(t1, P->z, P->y); // Yfinal = t1*Z - fp2mul1271(P->x, (felm_t*)&cpsi1, P->x); // Xfinal = cpsi1*X - fp2mul1271(P->z, t2, P->z); // Zfinal = Z*t2 -} - - -void ecc_psi(point_extproj_t P) -{ // Apply psi mapping to a point, P = psi(P) - // Input: P = (X1:Y1:Z1) on E in twisted Edwards coordinates - // Output: P = (Xfinal,Yfinal,Zfinal,Tafinal,Tbfinal) on E, where Tfinal = Tafinal*Tbfinal, - // corresponding to (Xfinal:Yfinal:Zfinal:Tfinal) in extended twisted Edwards coordinates - - ecc_tau(P); - ecc_delpsidel(P); - ecc_tau_dual(P); -} - - -void ecc_phi(point_extproj_t P) -{ // Apply phi mapping to a point, P = phi(P) - // Input: P = (X1:Y1:Z1) on E in twisted Edwards coordinates - // Output: P = (Xfinal,Yfinal,Zfinal,Tafinal,Tbfinal) on E, where Tfinal = Tafinal*Tbfinal, - // corresponding to (Xfinal:Yfinal:Zfinal:Tfinal) in extended twisted Edwards coordinates - - ecc_tau(P); - ecc_delphidel(P); - ecc_tau_dual(P); -} - - -static __inline void mul_truncate(uint64_t* s, uint64_t* C, uint64_t* out) -{ // 256-bit multiplication with truncation for the scalar decomposition - // Outputs 64-bit value "out" = (uint64_t)((s * C) >> 256). 
- uint128_t tt1, tt2; - unsigned int carry1; - -#if defined(GENERIC_IMPLEMENTATION) || defined(SCALAR_INTRIN_SUPPORT) - unsigned int carry2; - uint64_t temp; - - MUL128(s[0], C[0], tt2); - tt2[0] = tt2[1]; - tt2[1] = 0; - MUL128(s[1], C[0], tt1); - ADD128(tt1, tt2, tt1); - MUL128(s[0], C[1], tt2); - ADC128(tt1, tt2, carry1, tt1); - tt1[0] = tt1[1]; - tt1[1] = (uint64_t)(carry1); - MUL128(s[2], C[0], tt2); - ADD128(tt1, tt2, tt1); - MUL128(s[0], C[2], tt2); - ADC128(tt1, tt2, carry1, tt1); - MUL128(s[1], C[1], tt2); - ADC128(tt1, tt2, carry2, tt1); - tt1[0] = tt1[1]; - tt1[1] = (uint64_t)carry1 + (uint64_t)carry2; - MUL128(s[0], C[3], tt2); - ADD128(tt1, tt2, tt1); - MUL128(s[3], C[0], tt2); - ADC128(tt1, tt2, carry1, tt1); - MUL128(s[1], C[2], tt2); - ADC128(tt1, tt2, carry2, tt1); - temp = (uint64_t)carry1 + (uint64_t)carry2; - MUL128(s[2], C[1], tt2); - ADC128(tt1, tt2, carry2, tt1); - tt1[0] = tt1[1]; - tt1[1] = temp + (uint64_t)carry2; - MUL128(s[1], C[3], tt2); - ADD128(tt1, tt2, tt1); - MUL128(s[3], C[1], tt2); - ADD128(tt1, tt2, tt1); - MUL128(s[2], C[2], tt2); - ADD128(tt1, tt2, tt1); - *out = tt1[0]; -#ifdef TEMP_ZEROING - clear_words((void*)tt1, sizeof(uint128_t)/sizeof(unsigned int)); - clear_words((void*)tt2, sizeof(uint128_t)/sizeof(unsigned int)); - clear_words((void*)&temp, sizeof(uint64_t)/sizeof(unsigned int)); -#endif - -#elif defined(UINT128_SUPPORT) - uint128_t tt3, tt4; - - tt2 = (uint128_t)s[0]*C[0]; - tt1 = (uint128_t)s[1]*C[0] + (uint64_t)(tt2 >> 64); - tt2 = (uint128_t)s[0]*C[1]; - carry1 = (unsigned int)(((uint128_t)((uint64_t)tt1) + (uint128_t)((uint64_t)tt2)) >> 64); - tt1 = (uint128_t)(tt1 >> 64) + (uint128_t)(tt2 >> 64) + (uint64_t)carry1; - tt1 += (uint128_t)s[2]*C[0]; - tt2 = (uint128_t)s[0]*C[2]; - tt3 = (uint128_t)s[1]*C[1]; - carry1 = (unsigned int)(((uint128_t)((uint64_t)tt1) + (uint128_t)((uint64_t)tt2) + (uint128_t)((uint64_t)tt3)) >> 64); - tt1 = (uint128_t)(tt1 >> 64) + (uint128_t)(tt2 >> 64) + (uint128_t)(tt3 >> 64) + (uint64_t)carry1; - tt1 += (uint128_t)s[0]*C[3]; - tt2 = (uint128_t)s[3]*C[0]; - tt3 = (uint128_t)s[1]*C[2]; - tt4 = (uint128_t)s[2]*C[1]; - carry1 = (unsigned int)(((uint128_t)((uint64_t)tt1) + (uint128_t)((uint64_t)tt2) + (uint128_t)((uint64_t)tt3) + (uint128_t)((uint64_t)tt4)) >> 64); - tt1 = (uint128_t)(tt1 >> 64) + (uint128_t)(tt2 >> 64) + (uint128_t)(tt3 >> 64) + (uint128_t)(tt4 >> 64) + (uint64_t)carry1; - tt1 += (uint128_t)s[1]*C[3] + (uint128_t)s[3]*C[1] + (uint128_t)s[2]*C[2]; - *out = (uint64_t)tt1; -#ifdef TEMP_ZEROING - clear_words((void*)&tt1, sizeof(uint128_t)/sizeof(unsigned int)); - clear_words((void*)&tt2, sizeof(uint128_t)/sizeof(unsigned int)); - clear_words((void*)&tt3, sizeof(uint128_t)/sizeof(unsigned int)); - clear_words((void*)&tt4, sizeof(uint128_t)/sizeof(unsigned int)); -#endif -#endif -} - - -void decompose(uint64_t* k, uint64_t* scalars) -{ // Scalar decomposition for the variable-base scalar multiplication - // Input: scalar in the range [0, 2^256-1]. - // Output: 4 64-bit sub-scalars. 
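The mul_truncate routine above computes out = (uint64_t)((s * C) >> 256) and needs two code paths in C (the MUL128/ADD128 macro path and the unsigned __int128 path). In Rust, u128 is always available, so one portable version suffices. A minimal sketch, treating 256-bit values as four little-endian u64 limbs; this is an illustration, not the code added in crypto/src/fourq/ops.rs.

// out = 64-bit truncation of (s * c) >> 256, via a plain schoolbook 256x256 multiply.
fn mul_truncate(s: &[u64; 4], c: &[u64; 4]) -> u64 {
    let mut t = [0u64; 8];                                // 512-bit product, little-endian limbs
    for i in 0..4 {
        let mut carry: u128 = 0;
        for j in 0..4 {
            let acc = (s[i] as u128) * (c[j] as u128) + t[i + j] as u128 + carry;
            t[i + j] = acc as u64;
            carry = acc >> 64;
        }
        t[i + 4] = carry as u64;
    }
    t[4]                                                  // limb 4 holds bits 256..319 of s*c
}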
- uint64_t a1, a2, a3, a4, temp, mask; - -#if (TARGET == TARGET_x86) && (COMPILER == COMPILER_VC) - uint128_t t1, t2, t3, t4; - - mul_truncate(k, ell1, &a1); - mul_truncate(k, ell2, &a2); - mul_truncate(k, ell3, &a3); - mul_truncate(k, ell4, &a4); - - MUL128(a1, b11, t1); MUL128(a2, b21, t2); MUL128(a3, b31, t3); MUL128(a4, b41, t4); - temp = k[0] - t1[0] - t2[0] - t3[0] - t4[0] + c1; - mask = ~(0 - (temp & 1)); // If temp is even then mask = 0xFF...FF, else mask = 0 - - scalars[0] = temp + (mask & b41); - MUL128(a1, b12, t1); MUL128(a3, b32, t2); MUL128(a4, b42, t3); - scalars[1] = t1[0] + (uint64_t)a2 - t2[0] - t3[0] + c2 + (mask & b42); - MUL128(a3, b33, t1); MUL128(a1, b13, t2); MUL128(a4, b43, t3); - scalars[2] = t1[0] - t2[0] - (uint64_t)a2 + t3[0] + c3 - (mask & b43); - MUL128(a1, b14, t1); MUL128(a2, b24, t2); MUL128(a3, b34, t3); MUL128(a4, b44, t4); - scalars[3] = t1[0] - t2[0] - t3[0] + t4[0] + c4 - (mask & b44); -#else - mul_truncate(k, ell1, &a1); - mul_truncate(k, ell2, &a2); - mul_truncate(k, ell3, &a3); - mul_truncate(k, ell4, &a4); - - temp = k[0] - (uint64_t)a1*b11 - (uint64_t)a2*b21 - (uint64_t)a3*b31 - (uint64_t)a4*b41 + c1; - mask = ~(0 - (temp & 1)); // If temp is even then mask = 0xFF...FF, else mask = 0 - - scalars[0] = temp + (mask & b41); - scalars[1] = (uint64_t)a1*b12 + (uint64_t)a2 - (uint64_t)a3*b32 - (uint64_t)a4*b42 + c2 + (mask & b42); - scalars[2] = (uint64_t)a3*b33 - (uint64_t)a1*b13 - (uint64_t)a2 + (uint64_t)a4*b43 + c3 - (mask & b43); - scalars[3] = (uint64_t)a1*b14 - (uint64_t)a2*b24 - (uint64_t)a3*b34 + (uint64_t)a4*b44 + c4 - (mask & b44); -#endif - -#ifdef TEMP_ZEROING - clear_words((void*)&a1, sizeof(uint64_t)/sizeof(unsigned int)); - clear_words((void*)&a2, sizeof(uint64_t)/sizeof(unsigned int)); - clear_words((void*)&a3, sizeof(uint64_t)/sizeof(unsigned int)); - clear_words((void*)&a4, sizeof(uint64_t)/sizeof(unsigned int)); - clear_words((void*)&temp, sizeof(uint64_t)/sizeof(unsigned int)); - clear_words((void*)&mask, sizeof(uint64_t)/sizeof(unsigned int)); -#endif -} - - -void ecc_precomp(point_extproj_t P, point_extproj_precomp_t *T) -{ // Generation of the precomputation table used by the variable-base scalar multiplication ecc_mul(). 
- // Input: P = (X1,Y1,Z1,Ta,Tb), where T1 = Ta*Tb, corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - // Output: table T containing 8 points: P, P+phi(P), P+psi(P), P+phi(P)+psi(P), P+psi(phi(P)), P+phi(P)+psi(phi(P)), P+psi(P)+psi(phi(P)), P+phi(P)+psi(P)+psi(phi(P)) - // Precomputed points use the representation (X+Y,Y-X,2Z,2dT) corresponding to (X:Y:Z:T) in extended twisted Edwards coordinates - point_extproj_precomp_t Q, R, S; - point_extproj_t PP; - - // Generating Q = phi(P) = (XQ+YQ,YQ-XQ,ZQ,TQ) - ecccopy(P, PP); - ecc_phi(PP); - R1_to_R3(PP, Q); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,Z,T) - - // Generating S = psi(Q) = (XS+YS,YS-XS,ZS,TS) - ecc_psi(PP); - R1_to_R3(PP, S); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,Z,T) - - // Generating T[0] = P = (XP+YP,YP-XP,2ZP,2dTP) - R1_to_R2(P, T[0]); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT) - - // Generating R = psi(P) = (XR+YR,YR-XR,ZR,TR) - ecc_psi(P); - R1_to_R3(P, R); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,Z,T) - - eccadd_core(T[0], Q, PP); // T[1] = P+Q using the representations (X,Y,Z,Ta,Tb) <- (X+Y,Y-X,2Z,2dT) + (X+Y,Y-X,Z,T) - R1_to_R2(PP, T[1]); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT) - eccadd_core(T[0], R, PP); // T[2] = P+R - R1_to_R2(PP, T[2]); - eccadd_core(T[1], R, PP); // T[3] = P+Q+R - R1_to_R2(PP, T[3]); - eccadd_core(T[0], S, PP); // T[4] = P+S - R1_to_R2(PP, T[4]); - eccadd_core(T[1], S, PP); // T[5] = P+Q+S - R1_to_R2(PP, T[5]); - eccadd_core(T[2], S, PP); // T[6] = P+R+S - R1_to_R2(PP, T[6]); - eccadd_core(T[3], S, PP); // T[7] = P+Q+R+S - R1_to_R2(PP, T[7]); -} - - -void recode(uint64_t* scalars, unsigned int* digits, unsigned int* sign_masks) -{ // Recoding sub-scalars for use in the variable-base scalar multiplication. See Algorithm 1 in "Efficient and Secure Methods for GLV-Based Scalar - // Multiplication and their Implementation on GLV-GLS Curves (Extended Version)", A. Faz-Hernandez, P. Longa, and A.H. Sanchez, in Journal - // of Cryptographic Engineering, Vol. 5(1), 2015. - // Input: 4 64-bit sub-scalars passed through "scalars", which are obtained after calling decompose(). - // Outputs: "digits" array with 65 nonzero entries. Each entry is in the range [0, 7], corresponding to one entry in the precomputed table. - // "sign_masks" array with 65 entries storing the signs for their corresponding digits in "digits". - // Notation: if the corresponding digit > 0 then sign_mask = 0xFF...FF, else if digit < 0 then sign_mask = 0. 
- unsigned int i, bit, bit0, carry; - sign_masks[64] = (unsigned int)-1; - - for (i = 0; i < 64; i++) - { - scalars[0] >>= 1; - bit0 = (unsigned int)scalars[0] & 1; - sign_masks[i] = 0 - bit0; - - bit = (unsigned int)scalars[1] & 1; - carry = (bit0 | bit) ^ bit0; - scalars[1] = (scalars[1] >> 1) + (uint64_t)carry; - digits[i] = bit; - - bit = (unsigned int)scalars[2] & 1; - carry = (bit0 | bit) ^ bit0; - scalars[2] = (scalars[2] >> 1) + (uint64_t)carry; - digits[i] += (bit << 1); - - bit = (unsigned int)scalars[3] & 1; - carry = (bit0 | bit) ^ bit0; - scalars[3] = (scalars[3] >> 1) + (uint64_t)carry; - digits[i] += (bit << 2); - } - digits[64] = (unsigned int)(scalars[1] + (scalars[2] << 1) + (scalars[3] << 2)); -} - - -bool ecc_mul(point_t P, digit_t* k, point_t Q, bool clear_cofactor) -{ // Variable-base scalar multiplication Q = k*P using a 4-dimensional decomposition - // Inputs: scalar "k" in [0, 2^256-1], - // point P = (x,y) in affine coordinates, - // clear_cofactor = 1 (TRUE) or 0 (FALSE) whether cofactor clearing is required or not, respectively. - // Output: Q = k*P in affine coordinates (x,y). - // This function performs point validation and (if selected) cofactor clearing. - point_extproj_t R; - point_extproj_precomp_t S, Table[8]; - uint64_t scalars[NWORDS64_ORDER]; - unsigned int digits[65], sign_masks[65]; - int i; - - point_setup(P, R); // Convert to representation (X,Y,1,Ta,Tb) - decompose((uint64_t*)k, scalars); // Scalar decomposition - - if (ecc_point_validate(R) == false) { // Check if point lies on the curve - return false; - } - - if (clear_cofactor == true) { - cofactor_clearing(R); - } - recode(scalars, digits, sign_masks); // Scalar recoding - ecc_precomp(R, Table); // Precomputation - table_lookup_1x8(Table, S, digits[64], sign_masks[64]); // Extract initial point in (X+Y,Y-X,2Z,2dT) representation - R2_to_R4(S, R); // Conversion to representation (2X,2Y,2Z) - - for (i = 63; i >= 0; i--) - { - table_lookup_1x8(Table, S, digits[i], sign_masks[i]); // Extract point S in (X+Y,Y-X,2Z,2dT) representation - eccdouble(R); // P = 2*P using representations (X,Y,Z,Ta,Tb) <- 2*(X,Y,Z) - eccadd(S, R); // P = P+S using representations (X,Y,Z,Ta,Tb) <- (X,Y,Z,Ta,Tb) + (X+Y,Y-X,2Z,2dT) - } - eccnorm(R, Q); // Conversion to affine coordinates (x,y) and modular correction. 
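recode above is the GLV-SAC style recoding from Faz-Hernandez, Longa and Sanchez: each step pulls one bit from every sub-scalar, turns the sign bit of the first sub-scalar into a mask, and feeds a carry back so the remaining sub-scalars stay consistent. A direct Rust transliteration follows (illustrative only; the array sizes and u32 digit type are assumptions carried over from the C signature).

// Recode four 64-bit sub-scalars into 65 table indices plus 65 sign masks.
fn recode(scalars: &mut [u64; 4]) -> ([u32; 65], [u32; 65]) {
    let mut digits = [0u32; 65];
    let mut sign_masks = [0u32; 65];
    sign_masks[64] = u32::MAX;                            // the final digit is always "positive"
    for i in 0..64 {
        scalars[0] >>= 1;
        let bit0 = (scalars[0] & 1) as u32;
        sign_masks[i] = bit0.wrapping_neg();              // all-ones for positive, zero for negative
        for j in 1..4 {
            let bit = (scalars[j] & 1) as u32;
            let carry = (bit0 | bit) ^ bit0;              // carry = bit AND NOT bit0
            scalars[j] = (scalars[j] >> 1) + carry as u64;
            digits[i] += bit << (j - 1);                  // pack the three bits into a 3-bit index
        }
    }
    digits[64] = (scalars[1] + (scalars[2] << 1) + (scalars[3] << 2)) as u32;
    (digits, sign_masks)
}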
- -#ifdef TEMP_ZEROING - clear_words((void*)digits, 65); - clear_words((void*)sign_masks, 65); - clear_words((void*)S, sizeof(point_extproj_precomp_t)/sizeof(unsigned int)); -#endif - return true; -} - - -void cofactor_clearing(point_extproj_t P) -{ // Co-factor clearing - // Input: P = (X1,Y1,Z1,Ta,Tb), where T1 = Ta*Tb, corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - // Output: P = 392*P = (Xfinal,Yfinal,Zfinal,Tafinal,Tbfinal), where Tfinal = Tafinal*Tbfinal, - // corresponding to (Xfinal:Yfinal:Zfinal:Tfinal) in extended twisted Edwards coordinates - point_extproj_precomp_t Q; - - R1_to_R2(P, Q); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT) - eccdouble(P); // P = 2*P using representations (X,Y,Z,Ta,Tb) <- 2*(X,Y,Z) - eccadd(Q, P); // P = P+Q using representations (X,Y,Z,Ta,Tb) <- (X,Y,Z,Ta,Tb) + (X+Y,Y-X,2Z,2dT) - eccdouble(P); - eccdouble(P); - eccdouble(P); - eccdouble(P); - eccadd(Q, P); - eccdouble(P); - eccdouble(P); - eccdouble(P); -} - -#endif diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/eccp2_core.c b/ffi-deps/FourQlib/FourQ_64bit_and_portable/eccp2_core.c deleted file mode 100644 index 0275eb4..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/eccp2_core.c +++ /dev/null @@ -1,727 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: core GF(p^2) and ECC operations over GF(p^2) -* -* This code is based on the paper "FourQ: four-dimensional decompositions on a -* Q-curve over the Mersenne prime" by Craig Costello and Patrick Longa, in Advances -* in Cryptology - ASIACRYPT, 2015. -* Preprint available at http://eprint.iacr.org/2015/565. 
-************************************************************************************/ - -#include "FourQ_internal.h" -#include "FourQ_params.h" -#include "FourQ_tables.h" -#if defined(GENERIC_IMPLEMENTATION) - #include "generic/fp.h" -#elif (TARGET == TARGET_AMD64) - #include "AMD64/fp_x64.h" -#elif (TARGET == TARGET_ARM64) - #include "ARM64/fp_arm64.h" -#endif - - -/***********************************************/ -/************* GF(p^2) FUNCTIONS ***************/ - -void fp2copy1271(f2elm_t a, f2elm_t c) -{// Copy of a GF(p^2) element, c = a - fpcopy1271(a[0], c[0]); - fpcopy1271(a[1], c[1]); -} - - -void fp2zero1271(f2elm_t a) -{// Zeroing a GF(p^2) element, a = 0 - fpzero1271(a[0]); - fpzero1271(a[1]); -} - - -void fp2neg1271(f2elm_t a) -{// GF(p^2) negation, a = -a in GF((2^127-1)^2) - fpneg1271(a[0]); - fpneg1271(a[1]); -} - - -void fp2sqr1271(f2elm_t a, f2elm_t c) -{// GF(p^2) squaring, c = a^2 in GF((2^127-1)^2) - -#ifdef ASM_SUPPORT - fp2sqr1271_a(a, c); -#else - felm_t t1, t2, t3; - - fpadd1271(a[0], a[1], t1); // t1 = a0+a1 - fpsub1271(a[0], a[1], t2); // t2 = a0-a1 - fpmul1271(a[0], a[1], t3); // t3 = a0*a1 - fpmul1271(t1, t2, c[0]); // c0 = (a0+a1)(a0-a1) - fpadd1271(t3, t3, c[1]); // c1 = 2a0*a1 -#ifdef TEMP_ZEROING - clear_words((void*)t1, sizeof(felm_t)/sizeof(unsigned int)); - clear_words((void*)t2, sizeof(felm_t)/sizeof(unsigned int)); - clear_words((void*)t3, sizeof(felm_t)/sizeof(unsigned int)); -#endif -#endif -} - - -void fp2mul1271(f2elm_t a, f2elm_t b, f2elm_t c) -{// GF(p^2) multiplication, c = a*b in GF((2^127-1)^2) - -#if defined(ASM_SUPPORT) - fp2mul1271_a(a, b, c); -#else - felm_t t1, t2, t3, t4; - - fpmul1271(a[0], b[0], t1); // t1 = a0*b0 - fpmul1271(a[1], b[1], t2); // t2 = a1*b1 - fpadd1271(a[0], a[1], t3); // t3 = a0+a1 - fpadd1271(b[0], b[1], t4); // t4 = b0+b1 - fpsub1271(t1, t2, c[0]); // c[0] = a0*b0 - a1*b1 - fpmul1271(t3, t4, t3); // t3 = (a0+a1)*(b0+b1) - fpsub1271(t3, t1, t3); // t3 = (a0+a1)*(b0+b1) - a0*b0 - fpsub1271(t3, t2, c[1]); // c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 -#ifdef TEMP_ZEROING - clear_words((void*)t1, sizeof(felm_t)/sizeof(unsigned int)); - clear_words((void*)t2, sizeof(felm_t)/sizeof(unsigned int)); - clear_words((void*)t3, sizeof(felm_t)/sizeof(unsigned int)); - clear_words((void*)t4, sizeof(felm_t)/sizeof(unsigned int)); -#endif -#endif -} - - -__inline void fp2add1271(f2elm_t a, f2elm_t b, f2elm_t c) -{// GF(p^2) addition, c = a+b in GF((2^127-1)^2) - fpadd1271(a[0], b[0], c[0]); - fpadd1271(a[1], b[1], c[1]); -} - - -__inline void fp2sub1271(f2elm_t a, f2elm_t b, f2elm_t c) -{// GF(p^2) subtraction, c = a-b in GF((2^127-1)^2) - fpsub1271(a[0], b[0], c[0]); - fpsub1271(a[1], b[1], c[1]); -} - - -static __inline void fp2addsub1271(f2elm_t a, f2elm_t b, f2elm_t c) -{// GF(p^2) addition followed by subtraction, c = 2a-b in GF((2^127-1)^2) - -#ifdef ASM_SUPPORT - fp2addsub1271_a(a, b, c); -#else - fp2add1271(a, a, a); - fp2sub1271(a, b, c); -#endif -} - - -void fp2inv1271(f2elm_t a) -{// GF(p^2) inversion, a = (a0-i*a1)/(a0^2+a1^2) - f2elm_t t1; - - fpsqr1271(a[0], t1[0]); // t10 = a0^2 - fpsqr1271(a[1], t1[1]); // t11 = a1^2 - fpadd1271(t1[0], t1[1], t1[0]); // t10 = a0^2+a1^2 - fpinv1271(t1[0]); // t10 = (a0^2+a1^2)^-1 - fpneg1271(a[1]); // a = a0-i*a1 - fpmul1271(a[0], t1[0], a[0]); - fpmul1271(a[1], t1[0], a[1]); // a = (a0-i*a1)*(a0^2+a1^2)^-1 -#ifdef TEMP_ZEROING - clear_words((void*)t1, sizeof(f2elm_t)/sizeof(unsigned int)); -#endif -} - - -void clear_words(void* mem, unsigned int nwords) -{ // Clear integer-size digits 
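The GF(p^2) routines above use the standard tricks for a quadratic extension GF(p)[i]/(i^2 + 1): multiplication with three base-field multiplications and squaring with two. The sketch below shows the same formulas in Rust over the stand-in Mersenne prime p = 2^61 - 1, chosen only so a single u128 holds every intermediate product; FourQ's real modulus 2^127 - 1 needs two-limb arithmetic, which this toy code does not implement.

const P: u64 = (1u64 << 61) - 1;                          // toy Mersenne prime, not FourQ's 2^127 - 1

fn fpadd(a: u64, b: u64) -> u64 { let s = a + b; if s >= P { s - P } else { s } }
fn fpsub(a: u64, b: u64) -> u64 { if a >= b { a - b } else { a + P - b } }
fn fpmul(a: u64, b: u64) -> u64 {
    let t = (a as u128) * (b as u128);
    let f = (t & (P as u128)) + (t >> 61);                // 2^61 = 1 mod p, so fold the high part
    let r = ((f & (P as u128)) + (f >> 61)) as u64;
    if r >= P { r - P } else { r }
}

// (a0 + a1*i)(b0 + b1*i) = (a0*b0 - a1*b1) + ((a0+a1)(b0+b1) - a0*b0 - a1*b1)*i, as in fp2mul1271.
fn fp2mul(a: (u64, u64), b: (u64, u64)) -> (u64, u64) {
    let t1 = fpmul(a.0, b.0);
    let t2 = fpmul(a.1, b.1);
    let t3 = fpmul(fpadd(a.0, a.1), fpadd(b.0, b.1));
    (fpsub(t1, t2), fpsub(fpsub(t3, t1), t2))
}

// (a0 + a1*i)^2 = (a0+a1)(a0-a1) + (2*a0*a1)*i, as in fp2sqr1271.
fn fp2sqr(a: (u64, u64)) -> (u64, u64) {
    let t = fpmul(a.0, a.1);
    (fpmul(fpadd(a.0, a.1), fpsub(a.0, a.1)), fpadd(t, t))
}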
from memory. "nwords" indicates the number of integer digits to be zeroed. - // This function uses the volatile type qualifier to inform the compiler not to optimize out the memory clearing. - // It has been tested with MSVS 2013 and GNU GCC 4.6.3, 4.7.3, 4.8.2 and 4.8.4. Users are responsible for verifying correctness with different compilers. - // See "Compliant Solution (C99)" at https://www.securecoding.cert.org/confluence/display/c/MSC06-C.+Beware+of+compiler+optimizations - unsigned int i; - volatile unsigned int *v = mem; - - for (i = 0; i < nwords; i++) - v[i] = 0; -} - - -/***********************************************/ -/********** CURVE/SCALAR FUNCTIONS ***********/ - -void eccset(point_t P) -{ // Set generator - // Output: P = (x,y) - - fp2copy1271((felm_t*)&GENERATOR_x, P->x); // X1 - fp2copy1271((felm_t*)&GENERATOR_y, P->y); // Y1 -} - - -void eccnorm(point_extproj_t P, point_t Q) -{ // Normalize a projective point (X1:Y1:Z1), including full reduction - // Input: P = (X1:Y1:Z1) in twisted Edwards coordinates - // Output: Q = (X1/Z1,Y1/Z1), corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - - fp2inv1271(P->z); // Z1 = Z1^-1 - fp2mul1271(P->x, P->z, Q->x); // X1 = X1/Z1 - fp2mul1271(P->y, P->z, Q->y); // Y1 = Y1/Z1 - mod1271(Q->x[0]); mod1271(Q->x[1]); - mod1271(Q->y[0]); mod1271(Q->y[1]); -} - - -__inline void R1_to_R2(point_extproj_t P, point_extproj_precomp_t Q) -{ // Conversion from representation (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT), where T = Ta*Tb - // Input: P = (X1,Y1,Z1,Ta,Tb), where T1 = Ta*Tb, corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - // Output: Q = (X1+Y1,Y1-X1,2Z1,2dT1) corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - - fp2add1271(P->ta, P->ta, Q->t2); // T = 2*Ta - fp2add1271(P->x, P->y, Q->xy); // QX = X+Y - fp2sub1271(P->y, P->x, Q->yx); // QY = Y-X - fp2mul1271(Q->t2, P->tb, Q->t2); // T = 2*T - fp2add1271(P->z, P->z, Q->z2); // QZ = 2*Z - fp2mul1271(Q->t2, (felm_t*)&PARAMETER_d, Q->t2); // QT = 2d*T -} - - -__inline void R1_to_R3(point_extproj_t P, point_extproj_precomp_t Q) -{ // Conversion from representation (X,Y,Z,Ta,Tb) to (X+Y,Y-X,Z,T), where T = Ta*Tb - // Input: P = (X1,Y1,Z1,Ta,Tb), where T1 = Ta*Tb, corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - // Output: Q = (X1+Y1,Y1-X1,Z1,T1) corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - - fp2add1271(P->x, P->y, Q->xy); // XQ = (X1+Y1) - fp2sub1271(P->y, P->x, Q->yx); // YQ = (Y1-X1) - fp2mul1271(P->ta, P->tb, Q->t2); // TQ = T1 - fp2copy1271(P->z, Q->z2); // ZQ = Z1 -} - - -void R2_to_R4(point_extproj_precomp_t P, point_extproj_t Q) -{ // Conversion from representation (X+Y,Y-X,2Z,2dT) to (2X,2Y,2Z,2dT) - // Input: P = (X1+Y1,Y1-X1,2Z1,2dT1) corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - // Output: Q = (2X1,2Y1,2Z1) corresponding to (X1:Y1:Z1) in twisted Edwards coordinates - - fp2sub1271(P->xy, P->yx, Q->x); // XQ = 2*X1 - fp2add1271(P->xy, P->yx, Q->y); // YQ = 2*Y1 - fp2copy1271(P->z2, Q->z); // ZQ = 2*Z1 -} - - -__inline void eccdouble(point_extproj_t P) -{ // Point doubling 2P - // Input: P = (X1:Y1:Z1) in twisted Edwards coordinates - // Output: 2P = (Xfinal,Yfinal,Zfinal,Tafinal,Tbfinal), where Tfinal = Tafinal*Tbfinal, - // corresponding to (Xfinal:Yfinal:Zfinal:Tfinal) in extended twisted Edwards coordinates - f2elm_t t1, t2; - - fp2sqr1271(P->x, t1); // t1 = X1^2 - fp2sqr1271(P->y, t2); // t2 = Y1^2 - fp2add1271(P->x, P->y, P->x); // t3 = X1+Y1 - 
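clear_words above goes through a volatile pointer so the compiler cannot drop the zeroing of sensitive temporaries as a dead store. The equivalent idiom in Rust is write_volatile plus a compiler fence; this is a sketch of the technique only, and a production port would more likely reach for an existing crate such as zeroize.

use core::sync::atomic::{compiler_fence, Ordering};

fn clear_words(buf: &mut [u64]) {
    for w in buf.iter_mut() {
        // Volatile store: may not be elided even though buf is never read again.
        unsafe { core::ptr::write_volatile(w, 0) };
    }
    compiler_fence(Ordering::SeqCst);                     // keep the stores from being reordered away
}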
fp2add1271(t1, t2, P->tb); // Tbfinal = X1^2+Y1^2 - fp2sub1271(t2, t1, t1); // t1 = Y1^2-X1^2 - fp2sqr1271(P->x, P->ta); // Ta = (X1+Y1)^2 - fp2sqr1271(P->z, t2); // t2 = Z1^2 - fp2sub1271(P->ta, P->tb, P->ta); // Tafinal = 2X1*Y1 = (X1+Y1)^2-(X1^2+Y1^2) - fp2addsub1271(t2, t1, t2); // t2 = 2Z1^2-(Y1^2-X1^2) - fp2mul1271(t1, P->tb, P->y); // Yfinal = (X1^2+Y1^2)(Y1^2-X1^2) - fp2mul1271(t2, P->ta, P->x); // Xfinal = 2X1*Y1*[2Z1^2-(Y1^2-X1^2)] - fp2mul1271(t1, t2, P->z); // Zfinal = (Y1^2-X1^2)[2Z1^2-(Y1^2-X1^2)] -#ifdef TEMP_ZEROING - clear_words((void*)t1, sizeof(f2elm_t)/sizeof(unsigned int)); - clear_words((void*)t2, sizeof(f2elm_t)/sizeof(unsigned int)); -#endif -} - - -__inline void eccadd_core(point_extproj_precomp_t P, point_extproj_precomp_t Q, point_extproj_t R) -{ // Basic point addition R = P+Q or R = P+P - // Inputs: P = (X1+Y1,Y1-X1,2Z1,2dT1) corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - // Q = (X2+Y2,Y2-X2,Z2,T2) corresponding to (X2:Y2:Z2:T2) in extended twisted Edwards coordinates - // Output: R = (Xfinal,Yfinal,Zfinal,Tafinal,Tbfinal), where Tfinal = Tafinal*Tbfinal, - // corresponding to (Xfinal:Yfinal:Zfinal:Tfinal) in extended twisted Edwards coordinates - f2elm_t t1, t2; - - fp2mul1271(P->t2, Q->t2, R->z); // Z = 2dT1*T2 - fp2mul1271(P->z2, Q->z2, t1); // t1 = 2Z1*Z2 - fp2mul1271(P->xy, Q->xy, R->x); // X = (X1+Y1)(X2+Y2) - fp2mul1271(P->yx, Q->yx, R->y); // Y = (Y1-X1)(Y2-X2) - fp2sub1271(t1, R->z, t2); // t2 = theta - fp2add1271(t1, R->z, t1); // t1 = alpha - fp2sub1271(R->x, R->y, R->tb); // Tbfinal = beta - fp2add1271(R->x, R->y, R->ta); // Tafinal = omega - fp2mul1271(R->tb, t2, R->x); // Xfinal = beta*theta - fp2mul1271(t1, t2, R->z); // Zfinal = theta*alpha - fp2mul1271(R->ta, t1, R->y); // Yfinal = alpha*omega -#ifdef TEMP_ZEROING - clear_words((void*)t1, sizeof(f2elm_t)/sizeof(unsigned int)); - clear_words((void*)t2, sizeof(f2elm_t)/sizeof(unsigned int)); -#endif -} - - -__inline void eccadd(point_extproj_precomp_t Q, point_extproj_t P) -{ // Complete point addition P = P+Q or P = P+P - // Inputs: P = (X1,Y1,Z1,Ta,Tb), where T1 = Ta*Tb, corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - // Q = (X2+Y2,Y2-X2,2Z2,2dT2) corresponding to (X2:Y2:Z2:T2) in extended twisted Edwards coordinates - // Output: P = (Xfinal,Yfinal,Zfinal,Tafinal,Tbfinal), where Tfinal = Tafinal*Tbfinal, - // corresponding to (Xfinal:Yfinal:Zfinal:Tfinal) in extended twisted Edwards coordinates - point_extproj_precomp_t R; - - R1_to_R3(P, R); // R = (X1+Y1,Y1-Z1,Z1,T1) - eccadd_core(Q, R, P); // P = (X2+Y2,Y2-X2,2Z2,2dT2) + (X1+Y1,Y1-Z1,Z1,T1) - -#ifdef TEMP_ZEROING - clear_words((void*)R, sizeof(point_extproj_precomp_t)/sizeof(unsigned int)); -#endif -} - - -__inline void point_setup(point_t P, point_extproj_t Q) -{ // Point conversion to representation (X,Y,Z,Ta,Tb) - // Input: P = (x,y) in affine coordinates - // Output: P = (X,Y,1,Ta,Tb), where Ta=X, Tb=Y and T=Ta*Tb, corresponding to (X:Y:Z:T) in extended twisted Edwards coordinates - - fp2copy1271(P->x, Q->x); - fp2copy1271(P->y, Q->y); - fp2copy1271(Q->x, Q->ta); // Ta = X1 - fp2copy1271(Q->y, Q->tb); // Tb = Y1 - fp2zero1271(Q->z); Q->z[0][0]=1; // Z1 = 1 -} - - -__inline bool ecc_point_validate(point_extproj_t P) -{ // Point validation: check if point lies on the curve - // Input: P = (x,y) in affine coordinates, where x, y in [0, 2^127-1]. - // Output: TRUE (1) if point lies on the curve E: -x^2+y^2-1-dx^2*y^2 = 0, FALSE (0) otherwise. 
- // SECURITY NOTE: this function does not run in constant time (input point P is assumed to be public). - f2elm_t t1, t2, t3; - - fp2sqr1271(P->y, t1); - fp2sqr1271(P->x, t2); - fp2sub1271(t1, t2, t3); // -x^2 + y^2 - fp2mul1271(t1, t2, t1); // x^2*y^2 - fp2mul1271((felm_t*)&PARAMETER_d, t1, t2); // dx^2*y^2 - fp2zero1271(t1); t1[0][0] = 1; // t1 = 1 - fp2add1271(t2, t1, t2); // 1 + dx^2*y^2 - fp2sub1271(t3, t2, t1); // -x^2 + y^2 - 1 - dx^2*y^2 - -#if defined(GENERIC_IMPLEMENTATION) - { unsigned int i, j; - mod1271(t1[0]); - mod1271(t1[1]); - - for (i = 0; i < 2; i++) { - for (j = 0; j < NWORDS_FIELD; j++) { - if (t1[i][j] != 0) return false; - } - } - - return true; } -#else - return ((is_digit_zero_ct(t1[0][0] | t1[0][1]) || is_digit_zero_ct((t1[0][0]+1) | (t1[0][1]+1))) & - (is_digit_zero_ct(t1[1][0] | t1[1][1]) || is_digit_zero_ct((t1[1][0]+1) | (t1[1][1]+1)))); -#endif -} - - -static __inline void R5_to_R1(point_precomp_t P, point_extproj_t Q) -{ // Conversion from representation (x+y,y-x,2dt) to (X,Y,Z,Ta,Tb) - // Input: P = (x1+y1,y1-x1,2dt1) corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates, where Z1=1 - // Output: Q = (x1,y1,z1,x1,y1), where z1=1, corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - - fp2sub1271(P->xy, P->yx, Q->x); // 2*x1 - fp2add1271(P->xy, P->yx, Q->y); // 2*y1 - fp2div1271(Q->x); // XQ = x1 - fp2div1271(Q->y); // YQ = y1 - fp2zero1271(Q->z); Q->z[0][0]=1; // ZQ = 1 - fp2copy1271(Q->x, Q->ta); // TaQ = x1 - fp2copy1271(Q->y, Q->tb); // TbQ = y1 -} - - -static __inline void eccmadd(point_precomp_t Q, point_extproj_t P) -{ // Mixed point addition P = P+Q or P = P+P - // Inputs: P = (X1,Y1,Z1,Ta,Tb), where T1 = Ta*Tb, corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - // Q = (x2+y2,y2-x2,2dt2) corresponding to (X2:Y2:Z2:T2) in extended twisted Edwards coordinates, where Z2=1 - // Output: P = (Xfinal,Yfinal,Zfinal,Tafinal,Tbfinal), where Tfinal = Tafinal*Tbfinal, - // corresponding to (Xfinal:Yfinal:Zfinal:Tfinal) in extended twisted Edwards coordinates - f2elm_t t1, t2; - - fp2mul1271(P->ta, P->tb, P->ta); // Ta = T1 - fp2add1271(P->z, P->z, t1); // t1 = 2Z1 - fp2mul1271(P->ta, Q->t2, P->ta); // Ta = 2dT1*t2 - fp2add1271(P->x, P->y, P->z); // Z = (X1+Y1) - fp2sub1271(P->y, P->x, P->tb); // Tb = (Y1-X1) - fp2sub1271(t1, P->ta, t2); // t2 = theta - fp2add1271(t1, P->ta, t1); // t1 = alpha - fp2mul1271(Q->xy, P->z, P->ta); // Ta = (X1+Y1)(x2+y2) - fp2mul1271(Q->yx, P->tb, P->x); // X = (Y1-X1)(y2-x2) - fp2mul1271(t1, t2, P->z); // Zfinal = theta*alpha - fp2sub1271(P->ta, P->x, P->tb); // Tbfinal = beta - fp2add1271(P->ta, P->x, P->ta); // Tafinal = omega - fp2mul1271(P->tb, t2, P->x); // Xfinal = beta*theta - fp2mul1271(P->ta, t1, P->y); // Yfinal = alpha*omega -#ifdef TEMP_ZEROING - clear_words((void*)t1, sizeof(f2elm_t)/sizeof(unsigned int)); - clear_words((void*)t2, sizeof(f2elm_t)/sizeof(unsigned int)); -#endif -} - - -void eccmadd_ni(point_precomp_t Q, point_extproj_t P) -{ - eccmadd(Q, P); -} - - -bool ecc_mul_fixed(digit_t* k, point_t Q) -{ // Fixed-base scalar multiplication Q = k*G, where G is the generator. FIXED_BASE_TABLE stores v*2^(w-1) = 80 multiples of G. - // Inputs: scalar "k" in [0, 2^256-1]. - // Output: Q = k*G in affine coordinates (x,y). - // The function is based on the modified LSB-set comb method, which converts the scalar to an odd signed representation - // with (bitlength(order)+w*v) digits. 
- unsigned int j, w = W_FIXEDBASE, v = V_FIXEDBASE, d = D_FIXEDBASE, e = E_FIXEDBASE; - unsigned int digit = 0, digits[NBITS_ORDER_PLUS_ONE+(W_FIXEDBASE*V_FIXEDBASE)-1] = {0}; - digit_t temp[NWORDS_ORDER]; - point_extproj_t R; - point_precomp_t S; - int i, ii; - - modulo_order(k, temp); // temp = k mod (order) - conversion_to_odd(temp, temp); // Converting scalar to odd using the prime subgroup order - mLSB_set_recode((uint64_t*)temp, digits); // Scalar recoding - - // Extracting initial digit - digit = digits[w*d-1]; - for (i = (int)((w-1)*d-1); i >= (int)(2*d-1); i = i-d) - { - digit = 2*digit + digits[i]; - } - // Initialize R = (x+y,y-x,2dt) with a point from the table - table_lookup_fixed_base(((point_precomp_t*)&FIXED_BASE_TABLE)+(v-1)*(1 << (w-1)), S, digit, digits[d-1]); - R5_to_R1(S, R); // Converting to representation (X:Y:1:Ta:Tb) - - for (j = 0; j < (v-1); j++) - { - digit = digits[w*d-(j+1)*e-1]; - for (i = (int)((w-1)*d-(j+1)*e-1); i >= (int)(2*d-(j+1)*e-1); i = i-d) - { - digit = 2*digit + digits[i]; - } - // Extract point in (x+y,y-x,2dt) representation - table_lookup_fixed_base(((point_precomp_t*)&FIXED_BASE_TABLE)+(v-j-2)*(1 << (w-1)), S, digit, digits[d-(j+1)*e-1]); - eccmadd(S, R); // R = R+S using representations (X,Y,Z,Ta,Tb) <- (X,Y,Z,Ta,Tb) + (x+y,y-x,2dt) - } - - for (ii = (e-2); ii >= 0; ii--) - { - eccdouble(R); // R = 2*R using representations (X,Y,Z,Ta,Tb) <- 2*(X,Y,Z) - for (j = 0; j < v; j++) - { - digit = digits[w*d-j*e+ii-e]; - for (i = (int)((w-1)*d-j*e+ii-e); i >= (int)(2*d-j*e+ii-e); i = i-d) - { - digit = 2*digit + digits[i]; - } - // Extract point in (x+y,y-x,2dt) representation - table_lookup_fixed_base(((point_precomp_t*)&FIXED_BASE_TABLE)+(v-j-1)*(1 << (w-1)), S, digit, digits[d-j*e+ii-e]); - eccmadd(S, R); // R = R+S using representations (X,Y,Z,Ta,Tb) <- (X,Y,Z,Ta,Tb) + (x+y,y-x,2dt) - } - } - eccnorm(R, Q); // Conversion to affine coordinates (x,y) and modular correction. - -#ifdef TEMP_ZEROING - clear_words((void*)digits, NBITS_ORDER_PLUS_ONE+(W_FIXEDBASE*V_FIXEDBASE)-1); - clear_words((void*)S, sizeof(point_precomp_t)/sizeof(unsigned int)); -#endif - return true; -} - - -void mLSB_set_recode(uint64_t* scalar, unsigned int *digits) -{ // Computes the modified LSB-set representation of a scalar - // Inputs: scalar in [0, order-1], where the order of FourQ's subgroup is 246 bits. - // Output: digits, where the first "d" values (from index 0 to (d-1)) store the signs for the recoded values using the convention: -1 (negative), 0 (positive), and - // the remaining values (from index d to (l-1)) store the recoded values in mLSB-set representation, excluding their sign, - // where l = d*w and d = ceil(bitlength(order)/(w*v))*v. The values v and w are fixed and must be in the range [1, 10] (see FourQ.h); they determine the size - // of the precomputed table "FIXED_BASE_TABLE" used by ecc_mul_fixed(). 
- unsigned int i, j, d = D_FIXEDBASE, l = L_FIXEDBASE; - uint64_t temp, carry; - - digits[d-1] = 0; - - // Shift scalar to the right by 1 - for (j = 0; j < (NWORDS64_ORDER-1); j++) { - SHIFTR(scalar[j+1], scalar[j], 1, scalar[j], RADIX64); - } - scalar[NWORDS64_ORDER-1] >>= 1; - - for (i = 0; i < (d-1); i++) - { - digits[i] = (unsigned int)((scalar[0] & 1) - 1); // Convention for the "sign" row: - // if scalar_(i+1) = 0 then digit_i = -1 (negative), else if scalar_(i+1) = 1 then digit_i = 0 (positive) - // Shift scalar to the right by 1 - for (j = 0; j < (NWORDS64_ORDER-1); j++) { - SHIFTR(scalar[j+1], scalar[j], 1, scalar[j], RADIX64); - } - scalar[NWORDS64_ORDER-1] >>= 1; - } - - for (i = d; i < l; i++) - { - digits[i] = (unsigned int)(scalar[0] & 1); // digits_i = k mod 2. Sign is determined by the "sign" row - - // Shift scalar to the right by 1 - for (j = 0; j < (NWORDS64_ORDER-1); j++) { - SHIFTR(scalar[j+1], scalar[j], 1, scalar[j], RADIX64); - } - scalar[NWORDS64_ORDER-1] >>= 1; - - temp = (0 - digits[i-(i/d)*d]) & digits[i]; // if (digits_i=0 \/ 1) then temp = 0, else if (digits_i=-1) then temp = 1 - - // floor(scalar/2) + temp - scalar[0] = scalar[0] + temp; - carry = (temp & (uint64_t)is_digit_zero_ct((digit_t)scalar[0])); // carry = (scalar[0] < temp); - for (j = 1; j < NWORDS64_ORDER; j++) - { - scalar[j] = scalar[j] + carry; - carry = (carry & (uint64_t)is_digit_zero_ct((digit_t)scalar[j])); // carry = (scalar[j] < temp); - } - } - return; -} - - -static __inline void eccneg_extproj_precomp(point_extproj_precomp_t P, point_extproj_precomp_t Q) -{ // Point negation - // Input : point P in coordinates (X+Y,Y-X,2Z,2dT) - // Output: point Q = -P = (Y-X,X+Y,2Z,-2dT) - fp2copy1271(P->t2, Q->t2); - fp2copy1271(P->xy, Q->yx); - fp2copy1271(P->yx, Q->xy); - fp2copy1271(P->z2, Q->z2); - fp2neg1271(Q->t2); -} - - -static __inline void eccneg_precomp(point_precomp_t P, point_precomp_t Q) -{ // Point negation - // Input : point P in coordinates (x+y,y-x,2dt) - // Output: point Q = -P = (y-x,x+y,-2dt) - fp2copy1271(P->t2, Q->t2); - fp2copy1271(P->xy, Q->yx); - fp2copy1271(P->yx, Q->xy); - fp2neg1271(Q->t2); -} - - -bool ecc_mul_double(digit_t* k, point_t Q, digit_t* l, point_t R) -{ // Double scalar multiplication R = k*G + l*Q, where the G is the generator. Uses DOUBLE_SCALAR_TABLE, which contains multiples of G, Phi(G), Psi(G) and Phi(Psi(G)). - // Inputs: point Q in affine coordinates, - // scalars "k" and "l" in [0, 2^256-1]. - // Output: R = k*G + l*Q in affine coordinates (x,y). - // The function uses wNAF with interleaving. - - // SECURITY NOTE: this function is intended for a non-constant-time operation such as signature verification. 
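mLSB_set_recode above repeatedly shifts the 256-bit scalar right by one bit across limb boundaries with the SHIFTR macro. In Rust that helper reduces to a few shifts and ors; a sketch assuming the same four-limb little-endian layout (the name is illustrative).

fn shift_right_one(scalar: &mut [u64; 4]) {
    // Move the low bit of each higher limb into the top of the limb below it.
    for j in 0..3 {
        scalar[j] = (scalar[j] >> 1) | (scalar[j + 1] << 63);
    }
    scalar[3] >>= 1;
}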
- -#if (USE_ENDO == true) - unsigned int position; - int i, digits_k1[65] = {0}, digits_k2[65] = {0}, digits_k3[65] = {0}, digits_k4[65] = {0}; - int digits_l1[65] = {0}, digits_l2[65] = {0}, digits_l3[65] = {0}, digits_l4[65] = {0}; - point_precomp_t V; - point_extproj_t Q1, Q2, Q3, Q4, T; - point_extproj_precomp_t U, Q_table1[NPOINTS_DOUBLEMUL_WQ], Q_table2[NPOINTS_DOUBLEMUL_WQ], Q_table3[NPOINTS_DOUBLEMUL_WQ], Q_table4[NPOINTS_DOUBLEMUL_WQ]; - uint64_t k_scalars[4], l_scalars[4]; - - point_setup(Q, Q1); // Convert to representation (X,Y,1,Ta,Tb) - - if (ecc_point_validate(Q1) == false) { // Check if point lies on the curve - return false; - } - - // Computing endomorphisms over point Q - ecccopy(Q1, Q2); - ecc_phi(Q2); - ecccopy(Q1, Q3); - ecc_psi(Q3); - ecccopy(Q2, Q4); - ecc_psi(Q4); - - decompose((uint64_t*)k, k_scalars); // Scalar decomposition - decompose((uint64_t*)l, l_scalars); - wNAF_recode(k_scalars[0], WP_DOUBLEBASE, digits_k1); // Scalar recoding - wNAF_recode(k_scalars[1], WP_DOUBLEBASE, digits_k2); - wNAF_recode(k_scalars[2], WP_DOUBLEBASE, digits_k3); - wNAF_recode(k_scalars[3], WP_DOUBLEBASE, digits_k4); - wNAF_recode(l_scalars[0], WQ_DOUBLEBASE, digits_l1); - wNAF_recode(l_scalars[1], WQ_DOUBLEBASE, digits_l2); - wNAF_recode(l_scalars[2], WQ_DOUBLEBASE, digits_l3); - wNAF_recode(l_scalars[3], WQ_DOUBLEBASE, digits_l4); - ecc_precomp_double(Q1, Q_table1, NPOINTS_DOUBLEMUL_WQ); // Precomputation - ecc_precomp_double(Q2, Q_table2, NPOINTS_DOUBLEMUL_WQ); - ecc_precomp_double(Q3, Q_table3, NPOINTS_DOUBLEMUL_WQ); - ecc_precomp_double(Q4, Q_table4, NPOINTS_DOUBLEMUL_WQ); - - fp2zero1271(T->x); // Initialize T as the neutral point (0:1:1) - fp2zero1271(T->y); T->y[0][0] = 1; - fp2zero1271(T->z); T->z[0][0] = 1; - - for (i = 64; i >= 0; i--) - { - eccdouble(T); // Double (X_T,Y_T,Z_T,Ta_T,Tb_T) = 2(X_T,Y_T,Z_T,Ta_T,Tb_T) - if (digits_l1[i] < 0) { - position = (-digits_l1[i])/2; - eccneg_extproj_precomp(Q_table1[position], U); // Load and negate U = (X_U,Y_U,Z_U,Td_U) <- -(X+Y,Y-X,2Z,2dT) from a point in the precomputed table - eccadd(U, T); // T = T+U = (X_T,Y_T,Z_T,Ta_T,Tb_T) = (X_T,Y_T,Z_T,Ta_T,Tb_T) + (X_U,Y_U,Z_U,Td_U) - } else if (digits_l1[i] > 0) { - position = (digits_l1[i])/2; // Take U = (X_U,Y_U,Z_U,Td_U) <- (X+Y,Y-X,2Z,2dT) from a point in the precomputed table - eccadd(Q_table1[position], T); // T = T+U = (X_T,Y_T,Z_T,Ta_T,Tb_T) = (X_T,Y_T,Z_T,Ta_T,Tb_T) + (X_U,Y_U,Z_U,Td_U) - } - if (digits_l2[i] < 0) { - position = (-digits_l2[i])/2; - eccneg_extproj_precomp(Q_table2[position], U); - eccadd(U, T); - } else if (digits_l2[i] > 0) { - position = (digits_l2[i])/2; - eccadd(Q_table2[position], T); - } - if (digits_l3[i] < 0) { - position = (-digits_l3[i])/2; - eccneg_extproj_precomp(Q_table3[position], U); - eccadd(U, T); - } else if (digits_l3[i] > 0) { - position = (digits_l3[i])/2; - eccadd(Q_table3[position], T); - } - if (digits_l4[i] < 0) { - position = (-digits_l4[i])/2; - eccneg_extproj_precomp(Q_table4[position], U); - eccadd(U, T); - } else if (digits_l4[i] > 0) { - position = (digits_l4[i])/2; - eccadd(Q_table4[position], T); - } - - if (digits_k1[i] < 0) { - position = (-digits_k1[i])/2; - eccneg_precomp(((point_precomp_t*)&DOUBLE_SCALAR_TABLE)[position], V); // Load and negate V = (X_V,Y_V,Z_V,Td_V) <- -(x+y,y-x,2dt) from a point in the precomputed table - eccmadd(V, T); // T = T+V = (X_T,Y_T,Z_T,Ta_T,Tb_T) = (X_T,Y_T,Z_T,Ta_T,Tb_T) + (X_V,Y_V,Z_V,Td_V) - } else if (digits_k1[i] > 0) { - position = (digits_k1[i])/2; // Take V = (X_V,Y_V,Z_V,Td_V) <- 
(x+y,y-x,2dt) from a point in the precomputed table - eccmadd(((point_precomp_t*)&DOUBLE_SCALAR_TABLE)[position], T); // T = T+V = (X_T,Y_T,Z_T,Ta_T,Tb_T) = (X_T,Y_T,Z_T,Ta_T,Tb_T) + (X_V,Y_V,Z_V,Td_V) - } - if (digits_k2[i] < 0) { - position = (-digits_k2[i])/2; - eccneg_precomp(((point_precomp_t*)&DOUBLE_SCALAR_TABLE)[NPOINTS_DOUBLEMUL_WP+position], V); - eccmadd(V, T); - } else if (digits_k2[i] > 0) { - position = (digits_k2[i])/2; - eccmadd(((point_precomp_t*)&DOUBLE_SCALAR_TABLE)[NPOINTS_DOUBLEMUL_WP+position], T); - } - if (digits_k3[i] < 0) { - position = (-digits_k3[i])/2; - eccneg_precomp(((point_precomp_t*)&DOUBLE_SCALAR_TABLE)[2*NPOINTS_DOUBLEMUL_WP+position], V); - eccmadd(V, T); - } else if (digits_k3[i] > 0) { - position = (digits_k3[i])/2; - eccmadd(((point_precomp_t*)&DOUBLE_SCALAR_TABLE)[2*NPOINTS_DOUBLEMUL_WP+position], T); - } - if (digits_k4[i] < 0) { - position = (-digits_k4[i])/2; - eccneg_precomp(((point_precomp_t*)&DOUBLE_SCALAR_TABLE)[3*NPOINTS_DOUBLEMUL_WP+position], V); - eccmadd(V, T); - } else if (digits_k4[i] > 0) { - position = (digits_k4[i])/2; - eccmadd(((point_precomp_t*)&DOUBLE_SCALAR_TABLE)[3*NPOINTS_DOUBLEMUL_WP+position], T); - } - } - -#else - point_t A; - point_extproj_t T; - point_extproj_precomp_t S; - - if (ecc_mul(Q, l, A, false) == false) { - return false; - } - point_setup(A, T); - R1_to_R2(T, S); - - ecc_mul_fixed(k, A); - point_setup(A, T); - eccadd(S, T); -#endif - eccnorm(T, R); // Output R = (x,y) - - return true; -} - - -void ecc_precomp_double(point_extproj_t P, point_extproj_precomp_t* Table, unsigned int npoints) -{ // Generation of the precomputation table used internally by the double scalar multiplication function ecc_mul_double(). - // Inputs: point P in representation (X,Y,Z,Ta,Tb), - // Table with storage for npoints, - // number of points "npoints". - // Output: Table containing multiples of the base point P using representation (X+Y,Y-X,2Z,2dT). 
- point_extproj_t Q; - point_extproj_precomp_t PP; - unsigned int i; - - R1_to_R2(P, Table[0]); // Precomputed point Table[0] = P in coordinates (X+Y,Y-X,2Z,2dT) - eccdouble(P); // A = 2*P in (X,Y,Z,Ta,Tb) - R1_to_R3(P, PP); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,Z,T) - - for (i = 1; i < npoints; i++) { - eccadd_core(Table[i-1], PP, Q); // Table[i] = Table[i-1]+2P using the representations (X,Y,Z,Ta,Tb) <- (X+Y,Y-X,2Z,2dT) + (X+Y,Y-X,Z,T) - R1_to_R2(Q, Table[i]); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT) - } - - return; -} - - -void wNAF_recode(uint64_t scalar, unsigned int w, int* digits) -{ // Computes wNAF recoding of a scalar, where digits are in set {0,+-1,+-3,...,+-(2^(w-1)-1)} - unsigned int i; - int digit, index = 0; - int val1 = (int)(1 << (w-1)) - 1; // 2^(w-1) - 1 - int val2 = (int)(1 << w); // 2^w; - uint64_t k = scalar, mask = (uint64_t)val2 - 1; // 2^w - 1 - - while (k != 0) - { - digit = (int)(k & 1); - - if (digit == 0) { - k >>= 1; // Shift scalar to the right by 1 - digits[index] = 0; - } else { - digit = (int)(k & mask); - k >>= w; // Shift scalar to the right by w - - if (digit > val1) { - digit -= val2; - } - if (digit < 0) { // scalar + 1 - k += 1; - } - digits[index] = digit; - - if (k != 0) { // Check if scalar != 0 - for (i = 0; i < (w-1); i++) - { - index++; - digits[index] = 0; - } - } - } - index++; - } - return; -} diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/eccp2_no_endo.c b/ffi-deps/FourQlib/FourQ_64bit_and_portable/eccp2_no_endo.c deleted file mode 100644 index 077d6ad..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/eccp2_no_endo.c +++ /dev/null @@ -1,160 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: ECC operations over GF(p^2) without exploiting endomorphisms -* -* This code is based on the paper "FourQ: four-dimensional decompositions on a -* Q-curve over the Mersenne prime" by Craig Costello and Patrick Longa, in Advances -* in Cryptology - ASIACRYPT, 2015. -* Preprint available at http://eprint.iacr.org/2015/565. -************************************************************************************/ - -#include "FourQ_internal.h" - - -#if (USE_ENDO == false) - -/***********************************************/ -/********** CURVE/SCALAR FUNCTIONS ***********/ - -void fixed_window_recode(uint64_t* scalar, unsigned int* digits, unsigned int* sign_masks) -{ // Converting scalar to the fixed window representation used by the variable-base scalar multiplication - // Inputs: scalar in [0, order-1], where the order of FourQ's subgroup is 246 bits. - // Outputs: "digits" array with (t_VARBASE+1) nonzero entries. Each entry is in the range [0, 7], corresponding to one entry in the precomputed table. - // where t_VARBASE+1 = ((bitlength(order)+w-1)/(w-1))+1 represents the fixed length of the recoded scalar using window width w. - // The value of w is fixed to W_VARBASE = 5, which corresponds to a precomputed table with 2^(W_VARBASE-2) = 8 entries (see FourQ.h) - // used by the variable base scalar multiplication ecc_mul(). - // "sign_masks" array with (t_VARBASE+1) entries storing the signs for their corresponding digits in "digits". - // Notation: if the corresponding digit > 0 then sign_mask = 0xFF...FF, else if digit < 0 then sign_mask = 0. 
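wNAF_recode above produces a signed sliding-window representation with odd digits in {0, +-1, +-3, ..., +-(2^(w-1) - 1)}, where each nonzero digit is followed by at least w-1 zeros. A direct Rust transliteration (illustrative sketch, returning a Vec rather than writing into a caller-provided array):

fn wnaf_recode(mut k: u64, w: u32) -> Vec<i32> {
    let val1 = (1i32 << (w - 1)) - 1;                     // largest allowed positive digit
    let val2 = 1i32 << w;
    let mask = (val2 as u64) - 1;                         // low w bits of the scalar
    let mut digits = Vec::new();
    while k != 0 {
        if k & 1 == 0 {
            k >>= 1;
            digits.push(0);
        } else {
            let mut d = (k & mask) as i32;                // take the window as a signed digit
            k >>= w;
            if d > val1 {
                d -= val2;                                // map to the negative representative
            }
            if d < 0 {
                k += 1;                                   // borrow back into the remaining scalar
            }
            digits.push(d);
            if k != 0 {
                for _ in 0..w - 1 {
                    digits.push(0);                       // nonzero digits are w-1 positions apart
                }
            }
        }
    }
    digits
}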
- unsigned int val1, val2, i, j; - uint64_t res, borrow; - int64_t temp; - - val1 = (1 << W_VARBASE) - 1; - val2 = (1 << (W_VARBASE-1)); - - for (i = 0; i < t_VARBASE; i++) - { - temp = (scalar[0] & val1) - val2; // ki = (k mod 2^w)/2^(w-1) - sign_masks[i] = ~((unsigned int)(temp >> (RADIX64-1))); - digits[i] = ((sign_masks[i] & (unsigned int)(temp ^ -temp)) ^ (unsigned int)-temp) >> 1; - - res = scalar[0] - temp; // k = (k - ki) / 2^(w-1) - borrow = ((temp >> (RADIX64-1)) - 1) & (uint64_t)is_digit_lessthan_ct((digit_t)scalar[0], (digit_t)temp); - scalar[0] = res; - - for (j = 1; j < NWORDS64_ORDER; j++) - { - res = scalar[j]; - scalar[j] = res - borrow; - borrow = (uint64_t)is_digit_lessthan_ct((digit_t)res, (digit_t)borrow); - } - - for (j = 0; j < (NWORDS64_ORDER-1); j++) { - SHIFTR(scalar[j+1], scalar[j], (W_VARBASE-1), scalar[j], RADIX64); - } - scalar[NWORDS64_ORDER-1] = scalar[NWORDS64_ORDER-1] >> (W_VARBASE-1); - - } - sign_masks[t_VARBASE] = ~((unsigned int)(scalar[0] >> (RADIX64-1))); - digits[t_VARBASE] = ((sign_masks[t_VARBASE] & (unsigned int)(scalar[0] ^ (0-scalar[0]))) ^ (unsigned int)(0-scalar[0])) >> 1; // kt = k (t_VARBASE+1 digits) -} - - -void ecc_precomp(point_extproj_t P, point_extproj_precomp_t *T) -{ // Generation of the precomputation table used by the variable-base scalar multiplication ecc_mul(). - // Input: P = (X1,Y1,Z1,Ta,Tb), where T1 = Ta*Tb, corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates. - // Output: table T containing NPOINTS_VARBASE points: P, 3P, 5P, ... , (2*NPOINTS_VARBASE-1)P. NPOINTS_VARBASE is fixed to 8 (see FourQ.h). - // Precomputed points use the representation (X+Y,Y-X,2Z,2dT) corresponding to (X:Y:Z:T) in extended twisted Edwards coordinates. - point_extproj_precomp_t P2; - point_extproj_t Q; - unsigned int i; - - // Generating P2 = 2(X1,Y1,Z1,T1a,T1b) = (XP2+YP2,Y2P-X2P,ZP2,TP2) and T[0] = P = (X1+Y1,Y1-X1,2*Z1,2*d*T1) - ecccopy(P, Q); - R1_to_R2(P, T[0]); - eccdouble(Q); - R1_to_R3(Q, P2); - - for (i = 1; i < NPOINTS_VARBASE; i++) { - // T[i] = 2P+T[i-1] = (2*i+1)P = (XP2+YP2,Y2P-X2P,ZP2,TP2) + (X_(2*i-1)+Y_(2*i-1), Y_(2*i-1)-X_(2*i-1), 2Z_(2*i-1), 2T_(2*i-1)) = (X_(2*i+1)+Y_(2*i+1), Y_(2*i+1)-X_(2*i+1), 2Z_(2*i+1), 2dT_(2*i+1)) - eccadd_core(P2, T[i-1], Q); - R1_to_R2(Q, T[i]); - } -} - - -void cofactor_clearing(point_extproj_t P) -{ // Co-factor clearing - // Input: P = (X1,Y1,Z1,Ta,Tb), where T1 = Ta*Tb, corresponding to (X1:Y1:Z1:T1) in extended twisted Edwards coordinates - // Output: P = 392*P = (Xfinal,Yfinal,Zfinal,Tafinal,Tbfinal), where Tfinal = Tafinal*Tbfinal, - // corresponding to (Xfinal:Yfinal:Zfinal:Tfinal) in extended twisted Edwards coordinates - point_extproj_precomp_t Q; - - R1_to_R2(P, Q); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT) - eccdouble(P); // P = 2*P using representations (X,Y,Z,Ta,Tb) <- 2*(X,Y,Z) - eccadd(Q, P); // P = P+Q using representations (X,Y,Z,Ta,Tb) <- (X,Y,Z,Ta,Tb) + (X+Y,Y-X,2Z,2dT) - eccdouble(P); - eccdouble(P); - eccdouble(P); - eccdouble(P); - eccadd(Q, P); - eccdouble(P); - eccdouble(P); - eccdouble(P); -} - - -bool ecc_mul(point_t P, digit_t* k, point_t Q, bool clear_cofactor) -{ // Scalar multiplication Q = k*P - // Inputs: scalar "k" in [0, 2^256-1], - // point P = (x,y) in affine coordinates, - // clear_cofactor = 1 (TRUE) or 0 (FALSE) whether cofactor clearing is required or not, respectively. - // Output: Q = k*P in affine coordinates (x,y). - // This function performs point validation and (if selected) cofactor clearing. 
- point_extproj_t R; - point_extproj_precomp_t S, Table[NPOINTS_VARBASE]; - unsigned int digits[t_VARBASE+1] = {0}, sign_masks[t_VARBASE+1] = {0}; - digit_t k_odd[NWORDS_ORDER]; - int i; - - point_setup(P, R); // Convert to representation (X,Y,1,Ta,Tb) - - if (ecc_point_validate(R) == false) { // Check if point lies on the curve - return false; - } - - if (clear_cofactor == true) { - cofactor_clearing(R); - } - - modulo_order(k, k_odd); // k_odd = k mod (order) - conversion_to_odd(k_odd, k_odd); // Converting scalar to odd using the prime subgroup order - ecc_precomp(R, Table); // Precomputation of points T[0],...,T[npoints-1] - fixed_window_recode((uint64_t*)k_odd, digits, sign_masks); // Scalar recoding - table_lookup_1x8(Table, S, digits[t_VARBASE], sign_masks[t_VARBASE]); - R2_to_R4(S, R); // Conversion to representation (2X,2Y,2Z) - - for (i = (t_VARBASE-1); i >= 0; i--) - { - eccdouble(R); - table_lookup_1x8(Table, S, digits[i], sign_masks[i]); // Extract point in (X+Y,Y-X,2Z,2dT) representation - eccdouble(R); - eccdouble(R); - eccdouble(R); // P = 2*P using representations (X,Y,Z,Ta,Tb) <- 2*(X,Y,Z) - eccadd(S, R); // P = P+S using representations (X,Y,Z,Ta,Tb) <- (X,Y,Z,Ta,Tb) + (X+Y,Y-X,2Z,2dT) - } - eccnorm(R, Q); // Convert to affine coordinates (x,y) - -#ifdef TEMP_ZEROING - clear_words((void*)k_odd, NWORDS_ORDER*(sizeof(digit_t)/sizeof(unsigned int))); - clear_words((void*)digits, t_VARBASE+1); - clear_words((void*)sign_masks, t_VARBASE+1); - clear_words((void*)S, sizeof(point_extproj_precomp_t)/sizeof(unsigned int)); -#endif - return true; -} - -#endif diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/generic/fp.h b/ffi-deps/FourQlib/FourQ_64bit_and_portable/generic/fp.h deleted file mode 100644 index 3083c06..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/generic/fp.h +++ /dev/null @@ -1,409 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. 
-* -* Abstract: portable modular arithmetic and other low-level operations -************************************************************************************/ - -#ifndef __FP_H__ -#define __FP_H__ - - -// For C++ -#ifdef __cplusplus -extern "C" { -#endif - - -#include "../table_lookup.h" -#include "../FourQ_params.h" - -const digit_t mask_7fff = (digit_t)(-1) >> 1; -const digit_t prime1271_0 = (digit_t)(-1); -#define prime1271_1 mask_7fff - - -void digit_x_digit(digit_t a, digit_t b, digit_t* c) -{ // Digit multiplication, digit * digit -> 2-digit result - register digit_t al, ah, bl, bh, temp; - digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; - digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t)*4), mask_high = (digit_t)(-1) << (sizeof(digit_t)*4); - - al = a & mask_low; // Low part - ah = a >> (sizeof(digit_t) * 4); // High part - bl = b & mask_low; - bh = b >> (sizeof(digit_t) * 4); - - albl = al*bl; - albh = al*bh; - ahbl = ah*bl; - ahbh = ah*bh; - c[0] = albl & mask_low; // C00 - - res1 = albl >> (sizeof(digit_t) * 4); - res2 = ahbl & mask_low; - res3 = albh & mask_low; - temp = res1 + res2 + res3; - carry = temp >> (sizeof(digit_t) * 4); - c[0] ^= temp << (sizeof(digit_t) * 4); // C01 - - res1 = ahbl >> (sizeof(digit_t) * 4); - res2 = albh >> (sizeof(digit_t) * 4); - res3 = ahbh & mask_low; - temp = res1 + res2 + res3 + carry; - c[1] = temp & mask_low; // C10 - carry = temp & mask_high; - c[1] ^= (ahbh & mask_high) + carry; // C11 -} - - -__inline void fpcopy1271(felm_t a, felm_t c) -{ // Copy of a field element, c = a - unsigned int i; - - for (i = 0; i < NWORDS_FIELD; i++) - c[i] = a[i]; -} - - -static __inline void fpzero1271(felm_t a) -{ // Zeroing a field element, a = 0 - unsigned int i; - - for (i = 0; i < NWORDS_FIELD; i++) - a[i] = 0; -} - - -__inline void fpadd1271(felm_t a, felm_t b, felm_t c) -{ // Field addition, c = a+b mod p - unsigned int i; - unsigned int carry = 0; - - for (i = 0; i < NWORDS_FIELD; i++) { - ADDC(carry, a[i], b[i], carry, c[i]); - } - carry = (unsigned int)(c[NWORDS_FIELD-1] >> (RADIX-1)); - c[NWORDS_FIELD-1] &= mask_7fff; - for (i = 0; i < NWORDS_FIELD; i++) { - ADDC(carry, c[i], 0, carry, c[i]); - } -} - - -__inline void fpsub1271(felm_t a, felm_t b, felm_t c) -{ // Field subtraction, c = a-b mod p - unsigned int i; - unsigned int borrow = 0; - - for (i = 0; i < NWORDS_FIELD; i++) { - SUBC(borrow, a[i], b[i], borrow, c[i]); - } - borrow = (unsigned int)(c[NWORDS_FIELD-1] >> (RADIX-1)); - c[NWORDS_FIELD-1] &= mask_7fff; - for (i = 0; i < NWORDS_FIELD; i++) { - SUBC(borrow, c[i], 0, borrow, c[i]); - } -} - - -__inline void fpneg1271(felm_t a) -{ // Field negation, a = -a mod p - unsigned int i; - unsigned int borrow = 0; - - for (i = 0; i < (NWORDS_FIELD-1); i++) { - SUBC(borrow, prime1271_0, a[i], borrow, a[i]); - } - a[NWORDS_FIELD-1] = prime1271_1 - a[NWORDS_FIELD-1]; -} - - -void fpmul1271(felm_t a, felm_t b, felm_t c) -{ // Field multiplication using schoolbook method, c = a*b mod p - unsigned int i, j; - digit_t u, v, UV[2], temp, bit_mask; - digit_t t[2*NWORDS_FIELD] = {0}; - unsigned int carry = 0; - - for (i = 0; i < NWORDS_FIELD; i++) { - u = 0; - for (j = 0; j < NWORDS_FIELD; j++) { - MUL(a[i], b[j], UV+1, UV[0]); - ADDC(0, UV[0], u, carry, v); - u = UV[1] + carry; - ADDC(0, t[i+j], v, carry, v); - u = u + carry; - t[i+j] = v; - } - t[NWORDS_FIELD+i] = u; - } - bit_mask = (t[NWORDS_FIELD-1] >> (RADIX-1)); - t[NWORDS_FIELD-1] &= mask_7fff; - carry = 0; - for (i = 0; i < NWORDS_FIELD; i++) { - temp = (t[NWORDS_FIELD+i] >> 
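digit_x_digit above spells out a 64x64-to-128-bit multiply with half-word products because portable C has no guaranteed 128-bit integer type. In Rust the whole routine collapses to the built-in widening multiply; a sketch:

fn digit_x_digit(a: u64, b: u64) -> (u64, u64) {
    let t = (a as u128) * (b as u128);
    (t as u64, (t >> 64) as u64)                          // (low digit, high digit)
}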
(RADIX-1)); - t[NWORDS_FIELD+i] = (t[NWORDS_FIELD+i] << 1) + bit_mask; - bit_mask = temp; - ADDC(carry, t[i], t[NWORDS_FIELD+i], carry, t[i]); - } - carry = (unsigned int)(t[NWORDS_FIELD-1] >> (RADIX-1)); - t[NWORDS_FIELD-1] &= mask_7fff; - for (i = 0; i < NWORDS_FIELD; i++) { - ADDC(carry, t[i], 0, carry, c[i]); - } -} - - -void fpsqr1271(felm_t a, felm_t c) -{ // Field squaring using schoolbook method, c = a^2 mod p - - fpmul1271(a, a, c); -} - - -void mod1271(felm_t a) -{ // Modular correction, a = a mod (2^127-1) - digit_t mask; - unsigned int i; - unsigned int borrow = 0; - - for (i = 0; i < (NWORDS_FIELD-1); i++) { - SUBC(borrow, a[i], prime1271_0, borrow, a[i]); - } - SUBC(borrow, a[NWORDS_FIELD-1], prime1271_1, borrow, a[NWORDS_FIELD-1]); - - mask = 0 - (digit_t)borrow; // If result < 0 then mask = 0xFF...F else sign = 0x00...0 - borrow = 0; - for (i = 0; i < (NWORDS_FIELD-1); i++) { - ADDC(borrow, a[i], mask, borrow, a[i]); - } - ADDC(borrow, a[NWORDS_FIELD-1], (mask >> 1), borrow, a[NWORDS_FIELD-1]); -} - - -void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) -{ // Schoolbook multiprecision multiply, c = a*b - unsigned int i, j; - digit_t u, v, UV[2]; - unsigned int carry = 0; - - for (i = 0; i < (2*nwords); i++) c[i] = 0; - - for (i = 0; i < nwords; i++) { - u = 0; - for (j = 0; j < nwords; j++) { - MUL(a[i], b[j], UV+1, UV[0]); - ADDC(0, UV[0], u, carry, v); - u = UV[1] + carry; - ADDC(0, c[i+j], v, carry, v); - u = u + carry; - c[i+j] = v; - } - c[nwords+i] = u; - } -} - - -unsigned int mp_add(digit_t* a, digit_t* b, digit_t* c, unsigned int nwords) -{ // Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit - unsigned int i, carry = 0; - - for (i = 0; i < nwords; i++) { - ADDC(carry, a[i], b[i], carry, c[i]); - } - - return carry; -} - - -__inline void fpexp1251(felm_t a, felm_t af) -{ // Exponentiation over GF(p), af = a^(125-1) - int i; - felm_t t1, t2, t3, t4, t5; - - fpsqr1271(a, t2); - fpmul1271(a, t2, t2); - fpsqr1271(t2, t3); - fpsqr1271(t3, t3); - fpmul1271(t2, t3, t3); - fpsqr1271(t3, t4); - fpsqr1271(t4, t4); - fpsqr1271(t4, t4); - fpsqr1271(t4, t4); - fpmul1271(t3, t4, t4); - fpsqr1271(t4, t5); - for (i=0; i<7; i++) fpsqr1271(t5, t5); - fpmul1271(t4, t5, t5); - fpsqr1271(t5, t2); - for (i=0; i<15; i++) fpsqr1271(t2, t2); - fpmul1271(t5, t2, t2); - fpsqr1271(t2, t1); - for (i=0; i<31; i++) fpsqr1271(t1, t1); - fpmul1271(t2, t1, t1); - for (i=0; i<32; i++) fpsqr1271(t1, t1); - fpmul1271(t1, t2, t1); - for (i=0; i<16; i++) fpsqr1271(t1, t1); - fpmul1271(t5, t1, t1); - for (i=0; i<8; i++) fpsqr1271(t1, t1); - fpmul1271(t4, t1, t1); - for (i=0; i<4; i++) fpsqr1271(t1, t1); - fpmul1271(t3, t1, t1); - fpsqr1271(t1, t1); - fpmul1271(a, t1, af); -} - - -void fpinv1271(felm_t a) -{ // Field inversion, af = a^-1 = a^(p-2) mod p - // Hardcoded for p = 2^127-1 - felm_t t; - - fpexp1251(a, t); - fpsqr1271(t, t); - fpsqr1271(t, t); - fpmul1271(a, t, a); -} - - -static __inline void multiply(const digit_t* a, const digit_t* b, digit_t* c) -{ // Schoolbook multiprecision multiply, c = a*b - - mp_mul(a, b, c, NWORDS_ORDER); -} - - -static __inline unsigned int add(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) -{ // Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. 
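fpadd1271 and mod1271 above exploit the Mersenne shape of p = 2^127 - 1: any bit that overflows position 127 is congruent to 1 and is simply added back at bit 0. A Rust sketch of the addition on a two-limb little-endian representation (the limb layout and lazy reduction mirror the C code; the names are assumptions, not the actual crypto/src/fourq types):

const MASK_7FFF: u64 = u64::MAX >> 1;                     // top-limb mask, bit 127 sits in bit 63

fn fpadd1271(a: [u64; 2], b: [u64; 2]) -> [u64; 2] {
    let (lo, c0) = a[0].overflowing_add(b[0]);
    let hi = a[1] + b[1] + c0 as u64;                     // cannot overflow: both top limbs are below 2^63
    let fold = hi >> 63;                                  // the bit at position 127 of the raw sum
    let hi = hi & MASK_7FFF;
    let (lo, c1) = lo.overflowing_add(fold);              // 2^127 = 1 mod p, so add it back at bit 0
    [lo, hi + c1 as u64]
}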
Returns the carry bit - - return mp_add((digit_t*)a, (digit_t*)b, c, (unsigned int)nwords); -} - - -unsigned int subtract(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) -{ // Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit - unsigned int i; - unsigned int borrow = 0; - - for (i = 0; i < nwords; i++) { - SUBC(borrow, a[i], b[i], borrow, c[i]); - } - - return borrow; -} - - -void subtract_mod_order(const digit_t* a, const digit_t* b, digit_t* c) -{ // Subtraction modulo the curve order, c = a-b mod order - digit_t mask, carry = 0; - digit_t* order = (digit_t*)curve_order; - unsigned int i, bout; - - bout = subtract(a, b, c, NWORDS_ORDER); // (bout, c) = a - b - mask = 0 - (digit_t)bout; // if bout = 0 then mask = 0x00..0, else if bout = 1 then mask = 0xFF..F - - for (i = 0; i < NWORDS_ORDER; i++) { // c = c + (mask & order) - ADDC(carry, c[i], mask & order[i], carry, c[i]); - } -} - - -void add_mod_order(const digit_t* a, const digit_t* b, digit_t* c) -{ // Addition modulo the curve order, c = a+b mod order - - add(a, b, c, NWORDS_ORDER); // c = a + b - subtract_mod_order(c, (digit_t*)&curve_order, c); // if c >= order then c = c - order -} - - -void Montgomery_multiply_mod_order(const digit_t* ma, const digit_t* mb, digit_t* mc) -{ // 256-bit Montgomery multiplication modulo the curve order, mc = ma*mb*r' mod order, where ma,mb,mc in [0, order-1] - // ma, mb and mc are assumed to be in Montgomery representation - // The Montgomery constant r' = -r^(-1) mod 2^(log_2(r)) is the global value "Montgomery_rprime", where r is the order - unsigned int i; - digit_t mask, P[2 * NWORDS_ORDER], Q[2 * NWORDS_ORDER], temp[2 * NWORDS_ORDER]; - digit_t* order = (digit_t*)curve_order; - unsigned int cout = 0, bout = 0; - - multiply(ma, mb, P); // P = ma * mb - multiply(P, (digit_t*)&Montgomery_rprime, Q); // Q = P * r' mod 2^(log_2(r)) - multiply(Q, (digit_t*)&curve_order, temp); // temp = Q * r - cout = add(P, temp, temp, 2 * NWORDS_ORDER); // (cout, temp) = P + Q * r - - for (i = 0; i < NWORDS_ORDER; i++) { // (cout, mc) = (P + Q * r)/2^(log_2(r)) - mc[i] = temp[NWORDS_ORDER + i]; - } - - // Final, constant-time subtraction - bout = subtract(mc, (digit_t*)&curve_order, mc, NWORDS_ORDER); // (cout, mc) = (cout, mc) - r - mask = (digit_t)cout - (digit_t)bout; // if (cout, mc) >= 0 then mask = 0x00..0, else if (cout, mc) < 0 then mask = 0xFF..F - - for (i = 0; i < NWORDS_ORDER; i++) { // temp = mask & r - temp[i] = (order[i] & mask); - } - add(mc, temp, mc, NWORDS_ORDER); // mc = mc + (mask & r) - - return; -} - - -void modulo_order(digit_t* a, digit_t* c) -{ // Reduction modulo the order using Montgomery arithmetic - // ma = a*Montgomery_Rprime mod r, where a,ma in [0, r-1], a,ma,r < 2^256 - // c = ma*1*Montgomery_Rprime^(-1) mod r, where ma,c in [0, r-1], ma,c,r < 2^256 - digit_t ma[NWORDS_ORDER], one[NWORDS_ORDER] = { 0 }; - - one[0] = 1; - Montgomery_multiply_mod_order(a, (digit_t*)&Montgomery_Rprime, ma); - Montgomery_multiply_mod_order(ma, one, c); -} - - -void conversion_to_odd(digit_t* k, digit_t* k_odd) -{// Convert scalar to odd if even using the prime subgroup order r - digit_t i, mask; - digit_t* order = (digit_t*)curve_order; - unsigned int carry = 0; - - mask = ~(0 - (k[0] & 1)); - - for (i = 0; i < NWORDS_ORDER; i++) { // If (k is odd) then k_odd = k else k_odd = k + r - ADDC(carry, order[i] & mask, k[i], carry, k_odd[i]); - } -} - - -__inline void fpdiv1271(felm_t a) -{ // Field division by two, c = a/2 mod p - digit_t 
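conversion_to_odd above forces the scalar to be odd by conditionally adding the (odd) subgroup order, with the condition turned into a mask so there is no secret-dependent branch. A Rust sketch of the same trick; the order is passed in as a parameter here purely for illustration.

fn conversion_to_odd(k: &[u64; 4], order: &[u64; 4]) -> [u64; 4] {
    let mask = (k[0] & 1).wrapping_sub(1);                // all-ones when k is even, zero when odd
    let mut out = [0u64; 4];
    let mut carry = 0u64;
    for i in 0..4 {
        let (s, c1) = k[i].overflowing_add(order[i] & mask);
        let (s, c2) = s.overflowing_add(carry);
        out[i] = s;
        carry = (c1 | c2) as u64;
    }
    out
}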
mask; - unsigned int carry = 0; - unsigned int i; - - mask = 0 - (a[0] & 1); // if a is odd then mask = 0xFF...FF, else mask = 0 - - for (i = 0; i < (NWORDS_FIELD-1); i++) { - ADDC(carry, mask, a[i], carry, a[i]); - } - ADDC(carry, (mask >> 1), a[NWORDS_FIELD-1], carry, a[NWORDS_FIELD-1]); - - for (i = 0; i < (NWORDS_FIELD-1); i++) { - SHIFTR(a[i+1], a[i], 1, a[i], RADIX); - } - a[NWORDS_FIELD-1] = (a[NWORDS_FIELD-1] >> 1); -} - - -void fp2div1271(f2elm_t a) -{ // GF(p^2) division by two c = a/2 mod p - fpdiv1271(a[0]); - fpdiv1271(a[1]); -} - - -#ifdef __cplusplus -} -#endif - - -#endif diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/hash_to_curve.c b/ffi-deps/FourQlib/FourQ_64bit_and_portable/hash_to_curve.c deleted file mode 100644 index e86aa66..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/hash_to_curve.c +++ /dev/null @@ -1,237 +0,0 @@ -/********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: hash to FourQ -***********************************************************************************/ - -#include "FourQ_internal.h" -#include "FourQ_params.h" - - -static digit_t fpeq1271(digit_t* a, digit_t* b) -{ // Constant-time comparison of two field elements, ai=bi? : (0) equal, (-1) unequal - digit_t c = 0; - - for (unsigned int i = 0; i < NWORDS_FIELD; i++) - c |= a[i] ^ b[i]; - - return (digit_t)((-(sdigit_t)(c >> 1) | -(sdigit_t)(c & 1)) >> (8*sizeof(digit_t) - 1)); -} - - -static void fpselect(digit_t* a, digit_t* b, digit_t* c, digit_t selector) -{ // Constant-time selection of field elements - // If selector = 0 do c <- a, else if selector =-1 do c <- b - - for (unsigned int i = 0; i < NWORDS_FIELD; i++) - c[i] = (selector & (a[i] ^ b[i])) ^ a[i]; -} - - -ECCRYPTO_STATUS HashToCurve(f2elm_t r, point_t out) -{ - digit_t *r0 = (digit_t*)r[0], *r1 = (digit_t*)r[1]; - felm_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16; - felm_t one = {0}; - one[0] = 1; - - digit_t* x0 = (digit_t*)out->x[0]; - digit_t* x1 = (digit_t*)out->x[1]; - digit_t* y0 = (digit_t*)out->y[0]; - digit_t* y1 = (digit_t*)out->y[1]; - digit_t selector; - - fpadd1271(r0, r1, t0); - fpsub1271(r0, r1, t1); - fpmul1271(t0, t1, t0); - fpmul1271(r0, r1, t1); - fpadd1271(t1, t1, t1); - fpadd1271(t1, t1, t2); - fpadd1271(t0, t2, t2); - fpadd1271(t0, t0, t0); - fpsub1271(t0, t1, t3); - fpadd1271(t3, one, t0); - fpmul1271(A0, t0, t4); - fpmul1271(A1, t2, t1); - fpsub1271(t1, t4, t4); - fpmul1271(A1, t0, t5); - fpmul1271(A0, t2, t1); - fpadd1271(t1, t5, t1); - fpadd1271(t0, t2, t5); - fpsub1271(t0, t2, t6); - fpmul1271(t5, t6, t6); - fpmul1271(t2, t0, t5); - fpadd1271(t5, t5, t5); - fpmul1271(con1, t3, t7); - fpsub1271(t6, t7, t8); - fpmul1271(con2, t2, t7); - fpadd1271(t7, t8, t8); - fpmul1271(con1, t2, t7); - fpsub1271(t5, t7, t9); - fpmul1271(con2, t3, t7); - fpsub1271(t9, t7, t9); - fpmul1271(t4, t8, t5); - fpmul1271(t1, t9, t7); - fpadd1271(t5, t7, t7); - fpmul1271(t4, t9, t5); - fpmul1271(t1, t8, t10); - fpsub1271(t5, t10, t10); - fpsqr1271(t7, t5); - fpsqr1271(t10, t7); - fpadd1271(t5, t7, t5); - fpexp1251(t5, t7); - fpsqr1271(t7, t7); - fpmul1271(t5, t7, t7); - fpcopy1271(A0, t8); - fpcopy1271(A1, t9); - fpneg1271(t8); - fpneg1271(t9); - fpadd1271(A0, t4, t5); - fpsub1271(A1, t1, t11); - - selector = fpeq1271(t7, one); - fpselect(t8, t5, t3, selector); - fpselect(t9, t11, t10, selector); - - 
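The fpeq1271/fpselect helpers just above, and the masked XOR updates used throughout the removed table-lookup and scalar-recoding code, all rely on the same branch-free compare-and-select idiom. Below is a sketch of that idiom in Rust over a two-limb element; the `[u64; 2]` type and the mask convention (all-ones means equal, the opposite of fpeq1271's 0-means-equal) are illustrative choices, not the Rust port's actual API.

```rust
/// Two-limb field element, stand-in for the felm_t used by the deleted C code.
type Felm = [u64; 2];

/// Constant-time equality: all-ones if a == b, zero otherwise.
/// (Note the removed fpeq1271 uses the opposite convention: 0 means equal.)
fn ct_eq(a: &Felm, b: &Felm) -> u64 {
    let mut diff = 0u64;
    for i in 0..a.len() {
        diff |= a[i] ^ b[i];
    }
    // For nonzero diff, (diff | -diff) has its top bit set; shift that bit down and
    // subtract 1, so the result wraps to all-ones only when diff was zero.
    ((diff | diff.wrapping_neg()) >> 63).wrapping_sub(1)
}

/// Constant-time select: returns a when mask == 0, b when mask == all-ones
/// (the same XOR-mask trick fpselect uses).
fn ct_select(a: &Felm, b: &Felm, mask: u64) -> Felm {
    let mut c = [0u64; 2];
    for i in 0..c.len() {
        c[i] = a[i] ^ (mask & (a[i] ^ b[i]));
    }
    c
}

fn main() {
    let (a, b): (Felm, Felm) = ([1, 2], [3, 4]);
    assert_eq!(ct_eq(&a, &b), 0);
    assert_eq!(ct_eq(&a, &a), u64::MAX);
    assert_eq!(ct_select(&a, &b, 0), a);
    assert_eq!(ct_select(&a, &b, u64::MAX), b);
}
```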
fpmul1271(t0, t3, t5); - fpmul1271(t2, t10, t8); - fpsub1271(t5, t8, t8); - fpmul1271(t2, t3, t5); - fpmul1271(t0, t10, t9); - fpadd1271(t5, t9, t9); - fpadd1271(t3, t10, t5); - fpsub1271(t3, t10, t11); - fpmul1271(t5, t11, t5); - fpmul1271(t3, t10, t11); - fpadd1271(t11, t11, t11); - fpmul1271(t3, t4, t12); - fpmul1271(t1, t10, t13); - fpadd1271(t12, t13, t13); - fpmul1271(t4, t10, t14); - fpmul1271(t1, t3, t12); - fpsub1271(t14, t12, t12); - fpsub1271(t5, t13, t5); - fpsub1271(t11, t12, t11); - fpadd1271(t5, t6, t5); - fpmul1271(t0, t2, t6); - fpadd1271(t6, t6, t6); - fpadd1271(t11, t6, t11); - fpmul1271(t5, t8, t6); - fpmul1271(t9, t11, t12); - fpsub1271(t6, t12, t6); - fpmul1271(t5, t9, t12); - fpmul1271(t8, t11, t8); - fpadd1271(t12, t8, t12); - fpadd1271(t6, t6, t6); - fpadd1271(t6, t6, t6); - fpadd1271(t6, t6, t6); - fpadd1271(t6, t6, t6); - fpadd1271(t12, t12, t12); - fpadd1271(t12, t12, t12); - fpadd1271(t12, t12, t12); - fpadd1271(t12, t12, t12); - fpadd1271(t0, t3, t14); - fpadd1271(t14, t14, t14); - fpadd1271(t2, t10, t8); - fpadd1271(t8, t8, t8); - fpmul1271(t6, t14, t4); - fpmul1271(t8, t12, t1); - fpsub1271(t4, t1, t4); - fpmul1271(t12, t14, t9); - fpmul1271(t6, t8, t1); - fpadd1271(t1, t9, t1); - fpsqr1271(t12, t5); - fpsqr1271(t6, t9); - fpadd1271(t5, t9, t9); - fpsqr1271(t1, t5); - fpsqr1271(t4, t11); - fpadd1271(t11, t5, t11); - fpsqr1271(t11, t5); - fpmul1271(t5, t9, t5); - fpexp1251(t5, t7); - fpsqr1271(t7, t13); - fpsqr1271(t13, t13); - fpmul1271(t11, t13, t13); - fpmul1271(t9, t13, t13); - fpmul1271(t5, t13, t13); - fpmul1271(t13, t7, t7); - fpmul1271(t5, t7, t7); - fpadd1271(t6, t7, t5); - fpdiv1271(t5); - fpexp1251(t5, t9); - fpsqr1271(t9, t11); - fpsqr1271(t11, t11); - fpmul1271(t5, t11, t11); - fpmul1271(t5, t9, t9); - fpmul1271(t11, t12, t11); - fpsqr1271(t9, t7); - fpadd1271(one, one, t15); - fpcopy1271(t11, t16); - fpcopy1271(t15, x0); - fpneg1271(x0); - - selector = fpeq1271(t5, t7); - fpselect(t15, t16, t7, selector); - fpselect(t16, x0, t11, selector); - - fpadd1271(t13, t13, t13); - fpsub1271(t3, t0, y0); - fpsub1271(t10, t2, y1); - fpmul1271(y0, t6, t16); - fpmul1271(y1, t12, t15); - fpsub1271(t16, t15, t15); - fpmul1271(y0, t12, y0); - fpmul1271(t6, y1, t16); - fpadd1271(t16, y0, t16); - fpmul1271(t15, t4, x0); - fpmul1271(t1, t16, y0); - fpadd1271(x0, y0, y0); - fpmul1271(t4, t16, y1); - fpmul1271(t1, t15, x0); - fpsub1271(y1, x0, y1); - fpmul1271(y0, t13, y0); - fpmul1271(y1, t13, y1); - fpmul1271(b0, t3, t15); - fpmul1271(b1, t10, x0); - fpsub1271(t15, x0, t15); - fpmul1271(b0, t10, t16); - fpmul1271(b1, t3, x0); - fpadd1271(t16, x0, t16); - fpmul1271(t15, t4, t5); - fpmul1271(t1, t16, x0); - fpadd1271(x0, t5, x0); - fpmul1271(t4, t16, x1); - fpmul1271(t1, t15, t5); - fpsub1271(x1, t5, x1); - fpmul1271(x0, t0, t5); - fpmul1271(x1, t2, t15); - fpsub1271(t5, t15, t15); - fpmul1271(x1, t0, t5); - fpmul1271(x0, t2, t16); - fpadd1271(t5, t16, t16); - fpmul1271(t15, t14, t5); - fpmul1271(t16, t8, x0); - fpsub1271(t5, x0, x0); - fpmul1271(t15, t8, t5); - fpmul1271(t16, t14, x1); - fpadd1271(x1, t5, x1); - fpmul1271(x0, t7, t5); - fpmul1271(x1, t11, t15); - fpsub1271(t5, t15, t15); - fpmul1271(t7, x1, t5); - fpmul1271(t11, x0, t16); - fpadd1271(t16, t5, t16); - fpmul1271(t13, t9, t13); - fpmul1271(t15, t13, x0); - fpmul1271(t16, t13, x1); - - // Clear cofactor - point_extproj_t P; - point_setup(out, P); - cofactor_clearing(P); - eccnorm(P, out); - - return ECCRYPTO_SUCCESS; -} diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/kex.c 
b/ffi-deps/FourQlib/FourQ_64bit_and_portable/kex.c deleted file mode 100644 index e4a03cf..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/kex.c +++ /dev/null @@ -1,181 +0,0 @@ -/******************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: Diffie-Hellman key exchange based on FourQ -* option 1: co-factor ecdh using compressed 32-byte public keys, -* (see https://datatracker.ietf.org/doc/draft-ladd-cfrg-4q/). -* option 2: co-factor ecdh using uncompressed, 64-byte public keys. -*********************************************************************************/ - -#include "FourQ_internal.h" -#include "FourQ_params.h" -#include "../random/random.h" -#include - - -static __inline bool is_neutral_point(point_t P) -{ // Is P the neutral point (0,1)? - // SECURITY NOTE: this function does not run in constant time (input point P is assumed to be public). - - if (is_zero_ct((digit_t*)P->x, 2*NWORDS_FIELD) && is_zero_ct(&((digit_t*)P->y)[1], 2*NWORDS_FIELD-1) && is_digit_zero_ct(P->y[0][0] - 1)) { - return true; - } - return false; -} - - -/*************** ECDH USING COMPRESSED, 32-BYTE PUBLIC KEYS ***************/ - -ECCRYPTO_STATUS CompressedPublicKeyGeneration(const unsigned char* SecretKey, unsigned char* PublicKey) -{ // Compressed public key generation for key exchange - // It produces a public key PublicKey, which is the encoding of P = SecretKey*G (G is the generator). - // Input: 32-byte SecretKey - // Output: 32-byte PublicKey - point_t P; - - ecc_mul_fixed((digit_t*)SecretKey, P); // Compute public key - encode(P, PublicKey); // Encode public key - - return ECCRYPTO_SUCCESS; -} - - -ECCRYPTO_STATUS CompressedKeyGeneration(unsigned char* SecretKey, unsigned char* PublicKey) -{ // Keypair generation for key exchange. Public key is compressed to 32 bytes - // It produces a private key SecretKey and a public key PublicKey, which is the encoding of P = SecretKey*G (G is the generator). - // Outputs: 32-byte SecretKey and 32-byte PublicKey - ECCRYPTO_STATUS Status = ECCRYPTO_ERROR_UNKNOWN; - - Status = RandomBytesFunction(SecretKey, 32); - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - Status = CompressedPublicKeyGeneration(SecretKey, PublicKey); - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - return ECCRYPTO_SUCCESS; - -cleanup: - clear_words((unsigned int*)SecretKey, 256/(sizeof(unsigned int)*8)); - clear_words((unsigned int*)PublicKey, 256/(sizeof(unsigned int)*8)); - - return Status; -} - - -ECCRYPTO_STATUS CompressedSecretAgreement(const unsigned char* SecretKey, const unsigned char* PublicKey, unsigned char* SharedSecret) -{ // Secret agreement computation for key exchange using a compressed, 32-byte public key - // The output is the y-coordinate of SecretKey*A, where A is the decoding of the public key PublicKey. - // Inputs: 32-byte SecretKey and 32-byte PublicKey - // Output: 32-byte SharedSecret - point_t A; - ECCRYPTO_STATUS Status = ECCRYPTO_ERROR_UNKNOWN; - - if ((PublicKey[15] & 0x80) != 0) { // Is bit128(PublicKey) = 0? - Status = ECCRYPTO_ERROR_INVALID_PARAMETER; - goto cleanup; - } - - Status = decode(PublicKey, A); // Also verifies that A is on the curve. 
If it is not, it fails - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - Status = ecc_mul(A, (digit_t*)SecretKey, A, true); - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - if (is_neutral_point(A)) { // Is output = neutral point (0,1)? - Status = ECCRYPTO_ERROR_SHARED_KEY; - goto cleanup; - } - - memmove(SharedSecret, (unsigned char*)A->y, 32); - - return ECCRYPTO_SUCCESS; - -cleanup: - clear_words((unsigned int*)SharedSecret, 256/(sizeof(unsigned int)*8)); - - return Status; -} - - -/*************** ECDH USING UNCOMPRESSED PUBLIC KEYS ***************/ - -ECCRYPTO_STATUS PublicKeyGeneration(const unsigned char* SecretKey, unsigned char* PublicKey) -{ // Public key generation for key exchange - // It produces the public key PublicKey = SecretKey*G, where G is the generator. - // Input: 32-byte SecretKey - // Output: 64-byte PublicKey - - ecc_mul_fixed((digit_t*)SecretKey, (point_affine*)PublicKey); // Compute public key - - return ECCRYPTO_SUCCESS; -} - - -ECCRYPTO_STATUS KeyGeneration(unsigned char* SecretKey, unsigned char* PublicKey) -{ // Keypair generation for key exchange - // It produces a private key SecretKey and computes the public key PublicKey = SecretKey*G, where G is the generator. - // Outputs: 32-byte SecretKey and 64-byte PublicKey - ECCRYPTO_STATUS Status = ECCRYPTO_ERROR_UNKNOWN; - - Status = RandomBytesFunction(SecretKey, 32); - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - Status = PublicKeyGeneration(SecretKey, PublicKey); - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - return ECCRYPTO_SUCCESS; - -cleanup: - clear_words((unsigned int*)SecretKey, 256/(sizeof(unsigned int)*8)); - clear_words((unsigned int*)PublicKey, 512/(sizeof(unsigned int)*8)); - - return Status; -} - - -ECCRYPTO_STATUS SecretAgreement(const unsigned char* SecretKey, const unsigned char* PublicKey, unsigned char* SharedSecret) -{ // Secret agreement computation for key exchange - // The output is the y-coordinate of SecretKey*PublicKey. - // Inputs: 32-byte SecretKey and 64-byte PublicKey - // Output: 32-byte SharedSecret - point_t A; - ECCRYPTO_STATUS Status = ECCRYPTO_ERROR_UNKNOWN; - - if (((PublicKey[15] & 0x80) != 0) || ((PublicKey[31] & 0x80) != 0) || ((PublicKey[47] & 0x80) != 0) || ((PublicKey[63] & 0x80) != 0)) { // Are PublicKey_x[i] and PublicKey_y[i] < 2^127? - Status = ECCRYPTO_ERROR_INVALID_PARAMETER; - goto cleanup; - } - - Status = ecc_mul((point_affine*)PublicKey, (digit_t*)SecretKey, A, true); // Also verifies that PublicKey is a point on the curve. If it is not, it fails - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - if (is_neutral_point(A)) { // Is output = neutral point (0,1)? 
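Alongside the curve-level validation done inside decode()/ecc_mul(), the kex.c being removed performs two cheap sanity checks that the port also needs: the top bit of each encoded field element of an incoming public key must be clear, and the computed shared point must not be the neutral point (0,1). A sketch of both checks in Rust follows; the byte offsets mirror the C code, while the four-limb coordinate type is illustrative and may not match the port's representation.

```rust
/// Compressed 32-byte public key: the C code rejects it when the top bit of byte 15
/// is set ("bit128(PublicKey) != 0").
fn compressed_pk_ok(pk: &[u8; 32]) -> bool {
    pk[15] & 0x80 == 0
}

/// Uncompressed 64-byte public key: all four 16-byte field elements (x0, x1, y0, y1)
/// must be < 2^127, i.e. bytes 15, 31, 47 and 63 must have their top bit clear.
fn uncompressed_pk_ok(pk: &[u8; 64]) -> bool {
    [15usize, 31, 47, 63].iter().all(|&i| pk[i] & 0x80 == 0)
}

/// Is an affine point the neutral element (0, 1)? Mirrors is_neutral_point; as in the
/// C code this check need not be constant time because the point is public.
fn is_neutral(x: &[u64; 4], y: &[u64; 4]) -> bool {
    x.iter().all(|&w| w == 0) && y[0] == 1 && y[1..].iter().all(|&w| w == 0)
}

fn main() {
    assert!(uncompressed_pk_ok(&[0u8; 64]));
    let mut bad = [0u8; 32];
    bad[15] = 0x80;
    assert!(!compressed_pk_ok(&bad));
    assert!(is_neutral(&[0; 4], &[1, 0, 0, 0]));
}
```

When any of these checks fails, the C code clears the output buffer and returns an error status; the Rust replacement will want the same fail-closed behavior.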
- Status = ECCRYPTO_ERROR_SHARED_KEY; - goto cleanup; - } - - memmove(SharedSecret, (unsigned char*)A->y, 32); - - return ECCRYPTO_SUCCESS; - -cleanup: - clear_words((unsigned int*)SharedSecret, 256/(sizeof(unsigned int)*8)); - - return Status; -} \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/makefile b/ffi-deps/FourQlib/FourQ_64bit_and_portable/makefile deleted file mode 100644 index 7b6fd07..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/makefile +++ /dev/null @@ -1,188 +0,0 @@ -#### Makefile for compilation using GNU GCC or clang #### - -OPT=-O3 # Optimization option by default - -CC=gcc -ifeq "$(CC)" "gcc" - COMPILER=gcc -else ifeq "$(CC)" "clang" - COMPILER=clang -endif - -ifeq "$(ARCH)" "x64" - ARCHITECTURE=_AMD64_ -ifeq "$(GENERIC)" "TRUE" - USE_GENERIC=-D _GENERIC_ -endif -ifeq "$(ASM)" "FALSE" -else ifeq "$(ASM)" "TRUE" - USE_ASM=-D _ASM_ - ASM_var=yes -else -ifneq "$(GENERIC)" "TRUE" - USE_ASM=-D _ASM_ - ASM_var=yes -endif -endif -ifeq "$(AVX)" "FALSE" -else ifeq "$(AVX)" "TRUE" - USE_AVX=-D _AVX_ - SIMD=-mavx -else -ifneq "$(GENERIC)" "TRUE" - USE_AVX=-D _AVX_ - SIMD=-mavx -endif -endif -ifeq "$(AVX2)" "FALSE" -else ifeq "$(AVX2)" "TRUE" - USE_AVX2=-D _AVX2_ - SIMD=-mavx2 - AVX2_var=yes -else -ifneq "$(GENERIC)" "TRUE" - USE_AVX2=-D _AVX2_ - SIMD=-mavx2 - AVX2_var=yes -endif -endif - -else ifeq "$(ARCH)" "ARM64" - ARCHITECTURE=_ARM64_ - ARM_SETTING=-lrt -ifeq "$(GENERIC)" "TRUE" - USE_GENERIC=-D _GENERIC_ -endif - -else - -USE_GENERIC=-D _GENERIC_ -ifeq "$(GENERIC)" "FALSE" - USE_GENERIC= -endif -ifeq "$(ASM)" "TRUE" - USE_ASM=-D _ASM_ -endif -ifeq "$(AVX)" "TRUE" - USE_ASM=-D _ASM_ -endif -ifeq "$(AVX2)" "TRUE" - USE_ASM=-D _ASM_ -endif -ifeq "$(ARCH)" "x86" - ARCHITECTURE=_X86_ -else ifeq "$(ARCH)" "ARM" - ARCHITECTURE=_ARM_ - ARM_SETTING=-lrt -endif -endif - -ADDITIONAL_SETTINGS=-fwrapv -fomit-frame-pointer -march=native -ifeq "$(EXTENDED_SET)" "FALSE" - ADDITIONAL_SETTINGS= -endif - -USE_ENDOMORPHISMS=-D USE_ENDO -ifeq "$(USE_ENDO)" "FALSE" - USE_ENDOMORPHISMS= -endif - -ifeq "$(SERIAL_PUSH)" "TRUE" - USE_SERIAL_PUSH=-D PUSH_SET -endif - -SHARED_LIB_TARGET=libFourQ.so -ifeq "$(SHARED_LIB)" "TRUE" - DO_MAKE_SHARED_LIB=-fPIC - SHARED_LIB_O=$(SHARED_LIB_TARGET) -endif - -cc=$(COMPILER) -CFLAGS=-c $(OPT) $(ADDITIONAL_SETTINGS) $(SIMD) -D $(ARCHITECTURE) -D __LINUX__ $(USE_AVX) $(USE_AVX2) $(USE_ASM) $(USE_GENERIC) $(USE_ENDOMORPHISMS) $(USE_SERIAL_PUSH) $(DO_MAKE_SHARED_LIB) -LDFLAGS= -ifdef ASM_var -ifdef AVX2_var - ASM_OBJECTS=fp2_1271_AVX2.o -else - ASM_OBJECTS=fp2_1271.o -endif -endif -OBJECTS=eccp2.o eccp2_no_endo.o eccp2_core.o $(ASM_OBJECTS) crypto_util.o schnorrq.o hash_to_curve.o kex.o sha512.o random.o -OBJECTS_FP_TEST=fp_tests.o $(OBJECTS) test_extras.o -OBJECTS_ECC_TEST=ecc_tests.o $(OBJECTS) test_extras.o -OBJECTS_CRYPTO_TEST=crypto_tests.o $(OBJECTS) test_extras.o -OBJECTS_ALL=$(OBJECTS) $(OBJECTS_FP_TEST) $(OBJECTS_ECC_TEST) $(OBJECTS_CRYPTO_TEST) - -all: crypto_test ecc_test fp_test $(SHARED_LIB_O) - -ifeq "$(SHARED_LIB)" "TRUE" - $(SHARED_LIB_O): $(OBJECTS) - $(CC) -shared -o $(SHARED_LIB_O) $(OBJECTS) -endif - -crypto_test: $(OBJECTS_CRYPTO_TEST) - $(CC) -o crypto_test $(OBJECTS_CRYPTO_TEST) $(ARM_SETTING) - -ecc_test: $(OBJECTS_ECC_TEST) - $(CC) -o ecc_test $(OBJECTS_ECC_TEST) $(ARM_SETTING) - -fp_test: $(OBJECTS_FP_TEST) - $(CC) -o fp_test $(OBJECTS_FP_TEST) $(ARM_SETTING) - -eccp2_core.o: eccp2_core.c AMD64/fp_x64.h - $(CC) $(CFLAGS) eccp2_core.c - -eccp2.o: eccp2.c - $(CC) $(CFLAGS) eccp2.c - -eccp2_no_endo.o: 
eccp2_no_endo.c - $(CC) $(CFLAGS) eccp2_no_endo.c - -ifdef ASM_var -ifdef AVX2_var - AMD64/consts.s: AMD64/consts.c - $(CC) $(CFLAGS) -S -o $@ $< - sed '/.globl/d' -i $@ - fp2_1271_AVX2.o: AMD64/fp2_1271_AVX2.S AMD64/consts.s - $(CC) $(CFLAGS) -o $@ $< -else - fp2_1271.o: AMD64/fp2_1271.S - $(CC) $(CFLAGS) AMD64/fp2_1271.S -endif -endif - -schnorrq.o: schnorrq.c - $(CC) $(CFLAGS) schnorrq.c - -hash_to_curve.o: hash_to_curve.c - $(CC) $(CFLAGS) hash_to_curve.c - -kex.o: kex.c - $(CC) $(CFLAGS) kex.c - -crypto_util.o: crypto_util.c - $(CC) $(CFLAGS) crypto_util.c - -sha512.o: ../sha512/sha512.c - $(CC) $(CFLAGS) ../sha512/sha512.c - -random.o: ../random/random.c - $(CC) $(CFLAGS) ../random/random.c - -test_extras.o: tests/test_extras.c - $(CC) $(CFLAGS) tests/test_extras.c - -crypto_tests.o: tests/crypto_tests.c - $(CC) $(CFLAGS) tests/crypto_tests.c - -ecc_tests.o: tests/ecc_tests.c - $(CC) $(CFLAGS) tests/ecc_tests.c - -fp_tests.o: tests/fp_tests.c - $(CC) $(CFLAGS) tests/fp_tests.c - -.PHONY: clean - -clean: - rm -rf $(SHARED_LIB_TARGET) crypto_test ecc_test fp_test *.o AMD64/consts.s - diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/schnorrq.c b/ffi-deps/FourQlib/FourQ_64bit_and_portable/schnorrq.c deleted file mode 100644 index da89f86..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/schnorrq.c +++ /dev/null @@ -1,190 +0,0 @@ -/********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: digital signature SchnorrQ -* -* See "SchnorrQ: Schnorr signatures on FourQ" by Craig Costello and Patrick Longa, -* MSR Technical Report, 2016. Available at: -* https://www.microsoft.com/en-us/research/wp-content/uploads/2016/07/SchnorrQ.pdf. -***********************************************************************************/ - -#include "FourQ_internal.h" -#include "FourQ_params.h" -#include "../random/random.h" -#include "../sha512/sha512.h" -#include -#include - - -ECCRYPTO_STATUS SchnorrQ_KeyGeneration(const unsigned char* SecretKey, unsigned char* PublicKey) -{ // SchnorrQ public key generation - // It produces a public key PublicKey, which is the encoding of P = s*G, where G is the generator and - // s is the output of hashing SecretKey and taking the least significant 32 bytes of the result. - // Input: 32-byte SecretKey - // Output: 32-byte PublicKey - point_t P; - unsigned char k[64]; - ECCRYPTO_STATUS Status = ECCRYPTO_ERROR_UNKNOWN; - - if (CryptoHashFunction(SecretKey, 32, k) != 0) { - Status = ECCRYPTO_ERROR; - goto cleanup; - } - - ecc_mul_fixed((digit_t*)k, P); // Compute public key - encode(P, PublicKey); // Encode public key - - return ECCRYPTO_SUCCESS; - -cleanup: - clear_words((unsigned int*)k, 512/(sizeof(unsigned int)*8)); - clear_words((unsigned int*)PublicKey, 256/(sizeof(unsigned int)*8)); - - return Status; -} - - -ECCRYPTO_STATUS SchnorrQ_FullKeyGeneration(unsigned char* SecretKey, unsigned char* PublicKey) -{ // SchnorrQ keypair generation - // It produces a private key SecretKey and computes the public key PublicKey, which is the encoding of P = s*G, - // where G is the generator and s is the output of hashing SecretKey and taking the least significant 32 bytes of the result. 
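The SchnorrQ code being removed derives every scalar by hashing with SHA-512: k = H(SecretKey) (whose low 32 bytes are the signing scalar), the nonce r = H(k[32..64] || message), and the challenge h = H(enc(R) || PublicKey || message). The sketch below reproduces just that hashing layout in Rust, using the sha2 crate purely for illustration; the curve operations (fixed-base multiplication, point encoding) and the mod-order arithmetic that turn these digests into a signature are left out, and the real port may organize this differently.

```rust
use sha2::{Digest, Sha512};

/// k = SHA-512(secret_key). Its low 32 bytes are the signing scalar s (used for the
/// public key P = s*G); its high 32 bytes seed the deterministic nonce.
fn expand_secret(secret_key: &[u8; 32]) -> [u8; 64] {
    let mut out = [0u8; 64];
    out.copy_from_slice(&Sha512::digest(secret_key));
    out
}

/// r = SHA-512(k[32..64] || message): the per-message nonce scalar.
fn nonce_hash(k: &[u8; 64], message: &[u8]) -> [u8; 64] {
    let mut h = Sha512::new();
    h.update(&k[32..64]);
    h.update(message);
    let mut out = [0u8; 64];
    out.copy_from_slice(&h.finalize());
    out
}

/// h = SHA-512(enc(R) || public_key || message): the challenge scalar, where enc(R)
/// is the 32-byte encoding of R = r*G produced by the curve code (not shown here).
fn challenge_hash(enc_r: &[u8; 32], public_key: &[u8; 32], message: &[u8]) -> [u8; 64] {
    let mut h = Sha512::new();
    h.update(enc_r);
    h.update(public_key);
    h.update(message);
    let mut out = [0u8; 64];
    out.copy_from_slice(&h.finalize());
    out
}

fn main() {
    let k = expand_secret(&[7u8; 32]);
    let r = nonce_hash(&k, b"message");
    let h = challenge_hash(&[0u8; 32], &[0u8; 32], b"message");
    assert_ne!(r, h);
}
```

Signing then reduces r and h modulo the curve order and computes the second half of the signature as r − s·h mod the order, which is exactly what the Montgomery order arithmetic (to_Montgomery, Montgomery_multiply_mod_order, subtract_mod_order) removed earlier in this patch provides.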
- // Outputs: 32-byte SecretKey and 32-byte PublicKey - ECCRYPTO_STATUS Status = ECCRYPTO_ERROR_UNKNOWN; - - Status = RandomBytesFunction(SecretKey, 32); - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - Status = SchnorrQ_KeyGeneration(SecretKey, PublicKey); - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - return ECCRYPTO_SUCCESS; - -cleanup: - clear_words((unsigned int*)SecretKey, 256/(sizeof(unsigned int)*8)); - clear_words((unsigned int*)PublicKey, 256/(sizeof(unsigned int)*8)); - - return Status; -} - - -ECCRYPTO_STATUS SchnorrQ_Sign(const unsigned char* SecretKey, const unsigned char* PublicKey, const unsigned char* Message, const unsigned int SizeMessage, unsigned char* Signature) -{ // SchnorrQ signature generation - // It produces the signature Signature of a message Message of size SizeMessage in bytes - // Inputs: 32-byte SecretKey, 32-byte PublicKey, and Message of size SizeMessage in bytes - // Output: 64-byte Signature - point_t R; - unsigned char k[64], r[64], h[64], *temp = NULL; - digit_t* H = (digit_t*)h; - digit_t* S = (digit_t*)(Signature+32); - ECCRYPTO_STATUS Status = ECCRYPTO_ERROR_UNKNOWN; - - if (CryptoHashFunction(SecretKey, 32, k) != 0) { - Status = ECCRYPTO_ERROR; - goto cleanup; - } - - temp = (unsigned char*)calloc(1, SizeMessage+64); - if (temp == NULL) { - Status = ECCRYPTO_ERROR_NO_MEMORY; - goto cleanup; - } - - memmove(temp+32, k+32, 32); - memmove(temp+64, Message, SizeMessage); - - if (CryptoHashFunction(temp+32, SizeMessage+32, r) != 0) { - Status = ECCRYPTO_ERROR; - goto cleanup; - } - - ecc_mul_fixed((digit_t*)r, R); - encode(R, Signature); // Encode lowest 32 bytes of signature - memmove(temp, Signature, 32); - memmove(temp+32, PublicKey, 32); - - if (CryptoHashFunction(temp, SizeMessage+64, h) != 0) { - Status = ECCRYPTO_ERROR; - goto cleanup; - } - modulo_order((digit_t*)r, (digit_t*)r); - modulo_order(H, H); - to_Montgomery((digit_t*)k, S); // Converting to Montgomery representation - to_Montgomery(H, H); // Converting to Montgomery representation - Montgomery_multiply_mod_order(S, H, S); - from_Montgomery(S, S); // Converting back to standard representation - subtract_mod_order((digit_t*)r, S, S); - Status = ECCRYPTO_SUCCESS; - -cleanup: - if (temp != NULL) - free(temp); - clear_words((unsigned int*)k, 512/(sizeof(unsigned int)*8)); - clear_words((unsigned int*)r, 512/(sizeof(unsigned int)*8)); - - return Status; -} - - -ECCRYPTO_STATUS SchnorrQ_Verify(const unsigned char* PublicKey, const unsigned char* Message, const unsigned int SizeMessage, const unsigned char* Signature, unsigned int* valid) -{ // SchnorrQ signature verification - // It verifies the signature Signature of a message Message of size SizeMessage in bytes - // Inputs: 32-byte PublicKey, 64-byte Signature, and Message of size SizeMessage in bytes - // Output: true (valid signature) or false (invalid signature) - point_t A; - unsigned char *temp, h[64]; - unsigned int i; - ECCRYPTO_STATUS Status = ECCRYPTO_ERROR_UNKNOWN; - - *valid = false; - - temp = (unsigned char*)calloc(1, SizeMessage+64); - if (temp == NULL) { - Status = ECCRYPTO_ERROR_NO_MEMORY; - goto cleanup; - } - - if (((PublicKey[15] & 0x80) != 0) || ((Signature[15] & 0x80) != 0) || (Signature[63] != 0) || ((Signature[62] & 0xC0) != 0)) { // Are bit128(PublicKey) = bit128(Signature) = 0 and Signature+32 < 2^246? - Status = ECCRYPTO_ERROR_INVALID_PARAMETER; - goto cleanup; - } - - Status = decode(PublicKey, A); // Also verifies that A is on the curve. 
If it is not, it fails - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - memmove(temp, Signature, 32); - memmove(temp+32, PublicKey, 32); - memmove(temp+64, Message, SizeMessage); - - if (CryptoHashFunction(temp, SizeMessage+64, h) != 0) { - Status = ECCRYPTO_ERROR; - goto cleanup; - } - - Status = ecc_mul_double((digit_t*)(Signature+32), A, (digit_t*)h, A); - if (Status != ECCRYPTO_SUCCESS) { - goto cleanup; - } - - encode(A, (unsigned char*)A); - - for (i = 0; i < NWORDS_ORDER; i++) { - if (((digit_t*)A)[i] != ((digit_t*)Signature)[i]) { - goto cleanup; - } - } - *valid = true; - -cleanup: - if (temp != NULL) - free(temp); - - return Status; -} \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/table_lookup.h b/ffi-deps/FourQlib/FourQ_64bit_and_portable/table_lookup.h deleted file mode 100644 index 935ccbf..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/table_lookup.h +++ /dev/null @@ -1,290 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: table lookup functions -************************************************************************************/ - -#ifndef __TABLE_LOOKUP_H__ -#define __TABLE_LOOKUP_H__ - - -// For C++ -#ifdef __cplusplus -extern "C" { -#endif - - -#include "FourQ_internal.h" -#if (TARGET == TARGET_AMD64) && !defined(GENERIC_IMPLEMENTATION) - #include -#endif - - -void table_lookup_1x8(point_extproj_precomp_t* table, point_extproj_precomp_t P, unsigned int digit, unsigned int sign_mask) -{ // Constant-time table lookup to extract a point represented as (X+Y,Y-X,2Z,2dT) corresponding to extended twisted Edwards coordinates (X:Y:Z:T) - // Inputs: sign_mask, digit, table containing 8 points - // Output: P = sign*table[digit], where sign=1 if sign_mask=0xFF...FF and sign=-1 if sign_mask=0 - -#if (SIMD_SUPPORT == AVX2_SUPPORT) -#if defined(ASM_SUPPORT) - table_lookup_1x8_a(table, P, &digit, &sign_mask); -#else - __m256i point[4], temp_point[4], full_mask; - unsigned int i; - int mask; - - point[0] = _mm256_loadu_si256((__m256i*)table[0]->xy); // point = table[0] - point[1] = _mm256_loadu_si256((__m256i*)table[0]->yx); - point[2] = _mm256_loadu_si256((__m256i*)table[0]->z2); - point[3] = _mm256_loadu_si256((__m256i*)table[0]->t2); - - for (i = 1; i < 8; i++) - { - digit--; - // While digit>=0 mask = 0xFF...F else mask = 0x00...0 - mask = (int)(digit >> (8*sizeof(digit)-1)) - 1; - temp_point[0] = _mm256_loadu_si256((__m256i*)table[i]->xy); // temp_point = table[i] - temp_point[1] = _mm256_loadu_si256((__m256i*)table[i]->yx); - temp_point[2] = _mm256_loadu_si256((__m256i*)table[i]->z2); - temp_point[3] = _mm256_loadu_si256((__m256i*)table[i]->t2); - // If mask = 0x00...0 then point = point, else if mask = 0xFF...F then point = temp_point - full_mask = _mm256_set1_epi32(mask); - temp_point[0] = _mm256_xor_si256(point[0], temp_point[0]); - temp_point[1] = _mm256_xor_si256(point[1], temp_point[1]); - temp_point[2] = _mm256_xor_si256(point[2], temp_point[2]); - temp_point[3] = _mm256_xor_si256(point[3], temp_point[3]); - point[0] = _mm256_xor_si256(_mm256_and_si256(temp_point[0], full_mask), point[0]); - point[1] = _mm256_xor_si256(_mm256_and_si256(temp_point[1], full_mask), point[1]); - point[2] = _mm256_xor_si256(_mm256_and_si256(temp_point[2], full_mask), point[2]); - point[3] = 
_mm256_xor_si256(_mm256_and_si256(temp_point[3], full_mask), point[3]); - } - - temp_point[3] = _mm256_loadu_si256((__m256i*)point+3); - temp_point[0] = _mm256_loadu_si256((__m256i*)point+1); // point: x+y,y-x,2dt coordinate, temp_point: y-x,x+y,-2dt coordinate - temp_point[1] = _mm256_loadu_si256((__m256i*)point); - full_mask = _mm256_set1_epi32((int)sign_mask); - fpneg1271((digit_t*)temp_point+12); // Negate 2dt coordinate - fpneg1271((digit_t*)temp_point+14); // If sign_mask = 0 then choose negative of the point - point[0] = _mm256_xor_si256(_mm256_and_si256(_mm256_xor_si256(point[0], temp_point[0]), full_mask), temp_point[0]); - point[1] = _mm256_xor_si256(_mm256_and_si256(_mm256_xor_si256(point[1], temp_point[1]), full_mask), temp_point[1]); - point[3] = _mm256_xor_si256(_mm256_and_si256(_mm256_xor_si256(point[3], temp_point[3]), full_mask), temp_point[3]); - _mm256_storeu_si256((__m256i*)P->xy, point[0]); - _mm256_storeu_si256((__m256i*)P->yx, point[1]); - _mm256_storeu_si256((__m256i*)P->z2, point[2]); - _mm256_storeu_si256((__m256i*)P->t2, point[3]); -#endif -#elif (SIMD_SUPPORT == AVX_SUPPORT) - __m256d point[4], temp_point[4], full_mask; - unsigned int i; - int mask; - - point[0] = _mm256_loadu_pd((double const*)table[0]->xy); // point = table[0] - point[1] = _mm256_loadu_pd((double const*)table[0]->yx); - point[2] = _mm256_loadu_pd((double const*)table[0]->z2); - point[3] = _mm256_loadu_pd((double const*)table[0]->t2); - - for (i = 1; i < 8; i++) - { - digit--; - // While digit>=0 mask = 0xFF...F else sign = 0x00...0 - mask = (int)(digit >> (8*sizeof(digit)-1)) - 1; - full_mask = _mm256_set1_pd ((double)mask); - temp_point[0] = _mm256_loadu_pd((double const*)table[i]->xy); // temp_point = table[i] - temp_point[1] = _mm256_loadu_pd((double const*)table[i]->yx); - temp_point[2] = _mm256_loadu_pd((double const*)table[i]->z2); - temp_point[3] = _mm256_loadu_pd((double const*)table[i]->t2); - // If mask = 0x00...0 then point = point, else if mask = 0xFF...F then point = temp_point - point[0] = _mm256_blendv_pd(point[0], temp_point[0], full_mask); - point[1] = _mm256_blendv_pd(point[1], temp_point[1], full_mask); - point[2] = _mm256_blendv_pd(point[2], temp_point[2], full_mask); - point[3] = _mm256_blendv_pd(point[3], temp_point[3], full_mask); - } - - temp_point[3] = _mm256_loadu_pd((double const*)point+12); - temp_point[0] = _mm256_loadu_pd((double const*)point+4); // point: x+y,y-x,2dt coordinate, temp_point: y-x,x+y,-2dt coordinate - temp_point[1] = _mm256_loadu_pd((double const*)point); - full_mask = _mm256_set1_pd((double)((int)sign_mask)); - fpneg1271((digit_t*)temp_point+12); // Negate 2dt coordinate - fpneg1271((digit_t*)temp_point+14); - point[0] = _mm256_blendv_pd(temp_point[0], point[0], full_mask); // If sign_mask = 0 then choose negative of the point - point[1] = _mm256_blendv_pd(temp_point[1], point[1], full_mask); - point[3] = _mm256_blendv_pd(temp_point[3], point[3], full_mask); - _mm256_storeu_pd((double*)P->xy, point[0]); - _mm256_storeu_pd((double*)P->yx, point[1]); - _mm256_storeu_pd((double*)P->z2, point[2]); - _mm256_storeu_pd((double*)P->t2, point[3]); -#else - point_extproj_precomp_t point, temp_point; - unsigned int i, j; - digit_t mask; - - ecccopy_precomp(table[0], point); // point = table[0] - - for (i = 1; i < 8; i++) - { - digit--; - // While digit>=0 mask = 0xFF...F else sign = 0x00...0 - mask = (digit_t)(digit >> (8*sizeof(digit)-1)) - 1; - ecccopy_precomp(table[i], temp_point); // temp_point = table[i] - // If mask = 0x00...0 then point = point, else 
if mask = 0xFF...F then point = temp_point - for (j = 0; j < NWORDS_FIELD; j++) { - point->xy[0][j] = (mask & (point->xy[0][j] ^ temp_point->xy[0][j])) ^ point->xy[0][j]; - point->xy[1][j] = (mask & (point->xy[1][j] ^ temp_point->xy[1][j])) ^ point->xy[1][j]; - point->yx[0][j] = (mask & (point->yx[0][j] ^ temp_point->yx[0][j])) ^ point->yx[0][j]; - point->yx[1][j] = (mask & (point->yx[1][j] ^ temp_point->yx[1][j])) ^ point->yx[1][j]; - point->z2[0][j] = (mask & (point->z2[0][j] ^ temp_point->z2[0][j])) ^ point->z2[0][j]; - point->z2[1][j] = (mask & (point->z2[1][j] ^ temp_point->z2[1][j])) ^ point->z2[1][j]; - point->t2[0][j] = (mask & (point->t2[0][j] ^ temp_point->t2[0][j])) ^ point->t2[0][j]; - point->t2[1][j] = (mask & (point->t2[1][j] ^ temp_point->t2[1][j])) ^ point->t2[1][j]; - } - } - - fp2copy1271(point->t2, temp_point->t2); - fp2copy1271(point->xy, temp_point->yx); // point: x+y,y-x,2dt coordinate, temp_point: y-x,x+y,-2dt coordinate - fp2copy1271(point->yx, temp_point->xy); - fpneg1271(temp_point->t2[0]); // Negate 2dt coordinate - fpneg1271(temp_point->t2[1]); - for (j = 0; j < NWORDS_FIELD; j++) { // If sign_mask = 0 then choose negative of the point - point->xy[0][j] = ((digit_t)((int)sign_mask) & (point->xy[0][j] ^ temp_point->xy[0][j])) ^ temp_point->xy[0][j]; - point->xy[1][j] = ((digit_t)((int)sign_mask) & (point->xy[1][j] ^ temp_point->xy[1][j])) ^ temp_point->xy[1][j]; - point->yx[0][j] = ((digit_t)((int)sign_mask) & (point->yx[0][j] ^ temp_point->yx[0][j])) ^ temp_point->yx[0][j]; - point->yx[1][j] = ((digit_t)((int)sign_mask) & (point->yx[1][j] ^ temp_point->yx[1][j])) ^ temp_point->yx[1][j]; - point->t2[0][j] = ((digit_t)((int)sign_mask) & (point->t2[0][j] ^ temp_point->t2[0][j])) ^ temp_point->t2[0][j]; - point->t2[1][j] = ((digit_t)((int)sign_mask) & (point->t2[1][j] ^ temp_point->t2[1][j])) ^ temp_point->t2[1][j]; - } - ecccopy_precomp(point, P); -#endif -} - - -void table_lookup_fixed_base(point_precomp_t* table, point_precomp_t P, unsigned int digit, unsigned int sign) -{ // Constant-time table lookup to extract a point represented as (x+y,y-x,2t) corresponding to extended twisted Edwards coordinates (X:Y:Z:T) with Z=1 - // Inputs: sign, digit, table containing VPOINTS_FIXEDBASE = 2^(W_FIXEDBASE-1) points - // Output: if sign=0 then P = table[digit], else if (sign=-1) then P = -table[digit] - -#if (SIMD_SUPPORT == AVX2_SUPPORT) - __m256i point[3], temp_point[3], full_mask; - unsigned int i; - int mask; - - point[0] = _mm256_loadu_si256((__m256i*)table[0]->xy); // point = table[0] - point[1] = _mm256_loadu_si256((__m256i*)table[0]->yx); - point[2] = _mm256_loadu_si256((__m256i*)table[0]->t2); - - for (i = 1; i < VPOINTS_FIXEDBASE; i++) - { - digit--; - // While digit>=0 mask = 0xFF...F else sign = 0x00...0 - mask = (int)(digit >> (8*sizeof(digit)-1)) - 1; - temp_point[0] = _mm256_loadu_si256((__m256i*)table[i]->xy); // temp_point = table[i] - temp_point[1] = _mm256_loadu_si256((__m256i*)table[i]->yx); - temp_point[2] = _mm256_loadu_si256((__m256i*)table[i]->t2); - // If mask = 0x00...0 then point = point, else if mask = 0xFF...F then point = temp_point - full_mask = _mm256_set1_epi32(mask); - temp_point[0] = _mm256_xor_si256(point[0], temp_point[0]); - temp_point[1] = _mm256_xor_si256(point[1], temp_point[1]); - temp_point[2] = _mm256_xor_si256(point[2], temp_point[2]); - point[0] = _mm256_xor_si256(_mm256_and_si256(temp_point[0], full_mask), point[0]); - point[1] = _mm256_xor_si256(_mm256_and_si256(temp_point[1], full_mask), point[1]); - point[2] = 
_mm256_xor_si256(_mm256_and_si256(temp_point[2], full_mask), point[2]); - } - - temp_point[2] = _mm256_loadu_si256((__m256i*)point+2); - temp_point[0] = _mm256_loadu_si256((__m256i*)point+1); // point: x+y,y-x,2dt coordinate, temp_point: y-x,x+y,-2dt coordinate - temp_point[1] = _mm256_loadu_si256((__m256i*)point); - full_mask = _mm256_set1_epi32((int)sign); - fpneg1271((digit_t*)temp_point+8); // Negate 2dt coordinate - fpneg1271((digit_t*)temp_point+10); // If sign = 0xFF...F then choose negative of the point - point[0] = _mm256_xor_si256(_mm256_and_si256(_mm256_xor_si256(point[0], temp_point[0]), full_mask), point[0]); - point[1] = _mm256_xor_si256(_mm256_and_si256(_mm256_xor_si256(point[1], temp_point[1]), full_mask), point[1]); - point[2] = _mm256_xor_si256(_mm256_and_si256(_mm256_xor_si256(point[2], temp_point[2]), full_mask), point[2]); - _mm256_storeu_si256((__m256i*)P->xy, point[0]); - _mm256_storeu_si256((__m256i*)P->yx, point[1]); - _mm256_storeu_si256((__m256i*)P->t2, point[2]); - -#elif (SIMD_SUPPORT >= AVX_SUPPORT) - __m256d point[3], temp_point[3], full_mask; - unsigned int i; - int mask; - - point[0] = _mm256_loadu_pd((double const*)table[0]->xy); // point = table[0] - point[1] = _mm256_loadu_pd((double const*)table[0]->yx); - point[2] = _mm256_loadu_pd((double const*)table[0]->t2); - - for (i = 1; i < VPOINTS_FIXEDBASE; i++) - { - digit--; - // While digit>=0 mask = 0xFF...F else sign = 0x00...0 - mask = (int)(digit >> (8*sizeof(digit)-1)) - 1; - full_mask = _mm256_set1_pd((double)mask); - temp_point[0] = _mm256_loadu_pd((double const*)table[i]->xy); // temp_point = table[i+1] - temp_point[1] = _mm256_loadu_pd((double const*)table[i]->yx); - temp_point[2] = _mm256_loadu_pd((double const*)table[i]->t2); - // If mask = 0x00...0 then point = point, else if mask = 0xFF...F then point = temp_point - point[0] = _mm256_blendv_pd(point[0], temp_point[0], full_mask); - point[1] = _mm256_blendv_pd(point[1], temp_point[1], full_mask); - point[2] = _mm256_blendv_pd(point[2], temp_point[2], full_mask); - } - - temp_point[2] = _mm256_loadu_pd((double const*)point+2*4); // point: x+y,y-x,2dt coordinate, temp_point: y-x,x+y,-2dt coordinate - temp_point[0] = _mm256_loadu_pd((double const*)point+1*4); - temp_point[1] = _mm256_loadu_pd((double const*)point); - full_mask = _mm256_set1_pd((double)((int)sign)); - fpneg1271((digit_t*)temp_point+8); // Negate 2dt coordinate - fpneg1271((digit_t*)temp_point+10); - point[0] = _mm256_blendv_pd(point[0], temp_point[0], full_mask); // If sign = 0xFF...F then choose negative of the point - point[1] = _mm256_blendv_pd(point[1], temp_point[1], full_mask); - point[2] = _mm256_blendv_pd(point[2], temp_point[2], full_mask); - _mm256_storeu_pd((double*)P->xy, point[0]); - _mm256_storeu_pd((double*)P->yx, point[1]); - _mm256_storeu_pd((double*)P->t2, point[2]); -#else - point_precomp_t point, temp_point; - unsigned int i, j; - digit_t mask; - - ecccopy_precomp_fixed_base(table[0], point); // point = table[0] - - for (i = 1; i < VPOINTS_FIXEDBASE; i++) - { - digit--; - // While digit>=0 mask = 0xFF...F else sign = 0x00...0 - mask = (digit_t)(digit >> (8*sizeof(digit)-1)) - 1; - ecccopy_precomp_fixed_base(table[i], temp_point); // temp_point = table[i] - // If mask = 0x00...0 then point = point, else if mask = 0xFF...F then point = temp_point - for (j = 0; j < NWORDS_FIELD; j++) { - point->xy[0][j] = (mask & (point->xy[0][j] ^ temp_point->xy[0][j])) ^ point->xy[0][j]; - point->xy[1][j] = (mask & (point->xy[1][j] ^ temp_point->xy[1][j])) ^ point->xy[1][j]; - 
point->yx[0][j] = (mask & (point->yx[0][j] ^ temp_point->yx[0][j])) ^ point->yx[0][j]; - point->yx[1][j] = (mask & (point->yx[1][j] ^ temp_point->yx[1][j])) ^ point->yx[1][j]; - point->t2[0][j] = (mask & (point->t2[0][j] ^ temp_point->t2[0][j])) ^ point->t2[0][j]; - point->t2[1][j] = (mask & (point->t2[1][j] ^ temp_point->t2[1][j])) ^ point->t2[1][j]; - } - } - - fp2copy1271(point->t2, temp_point->t2); - fp2copy1271(point->xy, temp_point->yx); // point: x+y,y-x,2dt coordinate, temp_point: y-x,x+y,-2dt coordinate - fp2copy1271(point->yx, temp_point->xy); - fpneg1271(temp_point->t2[0]); // Negate 2dt coordinate - fpneg1271(temp_point->t2[1]); - for (j = 0; j < NWORDS_FIELD; j++) { // If sign = 0xFF...F then choose negative of the point - point->xy[0][j] = ((digit_t)((int)sign) & (point->xy[0][j] ^ temp_point->xy[0][j])) ^ point->xy[0][j]; - point->xy[1][j] = ((digit_t)((int)sign) & (point->xy[1][j] ^ temp_point->xy[1][j])) ^ point->xy[1][j]; - point->yx[0][j] = ((digit_t)((int)sign) & (point->yx[0][j] ^ temp_point->yx[0][j])) ^ point->yx[0][j]; - point->yx[1][j] = ((digit_t)((int)sign) & (point->yx[1][j] ^ temp_point->yx[1][j])) ^ point->yx[1][j]; - point->t2[0][j] = ((digit_t)((int)sign) & (point->t2[0][j] ^ temp_point->t2[0][j])) ^ point->t2[0][j]; - point->t2[1][j] = ((digit_t)((int)sign) & (point->t2[1][j] ^ temp_point->t2[1][j])) ^ point->t2[1][j]; - } - ecccopy_precomp_fixed_base(point, P); -#endif -} - - -#ifdef __cplusplus -} -#endif - - -#endif \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/tests/crypto_tests.c b/ffi-deps/FourQlib/FourQ_64bit_and_portable/tests/crypto_tests.c deleted file mode 100644 index 44f9952..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/tests/crypto_tests.c +++ /dev/null @@ -1,456 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. 
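The table_lookup.h removed just above is the constant-time heart of the scalar multiplication: rather than indexing the precomputation table with a secret digit, it scans the whole table and folds each entry in under a mask, so the memory access pattern is independent of the secret. A Rust sketch of that scan pattern follows; the flat `[u64; 8]` entry type is a stand-in for the point_precomp/point_extproj_precomp layouts, and the conditional point negation driven by the sign mask is omitted.

```rust
/// Stand-in for a precomputed-point table entry (the real entries are groups of
/// GF(p^2) coordinates such as x+y, y-x, 2t).
type Entry = [u64; 8];

/// Constant-time lookup of table[digit]: every entry is read, and a mask derived
/// from a running counter decides, branch-free, whether it replaces the result.
fn ct_table_lookup(table: &[Entry], digit: u32) -> Entry {
    let mut out = table[0];
    let mut d = digit as i64;
    for entry in table.iter().skip(1) {
        d -= 1;
        // all-ones while the requested index has not been passed yet, zero afterwards
        let mask = !(d >> 63) as u64;
        for j in 0..out.len() {
            out[j] ^= mask & (out[j] ^ entry[j]);
        }
    }
    out
}

fn main() {
    let table: Vec<Entry> = (0..8u64).map(|i| [i; 8]).collect();
    for i in 0..8u64 {
        assert_eq!(ct_table_lookup(&table, i as u32), [i; 8]);
    }
}
```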
-* -* Abstract: testing code for cryptographic functions based on FourQ -************************************************************************************/ - -#include "../FourQ_api.h" -#include "../FourQ_params.h" -#include "../../random/random.h" -#include "../../sha512/sha512.h" -#include "test_extras.h" -#include - - -// Benchmark and test parameters -#if defined(GENERIC_IMPLEMENTATION) - #define BENCH_LOOPS 100 // Number of iterations per bench - #define TEST_LOOPS 100 // Number of iterations per test -#else - #define BENCH_LOOPS 10000 - #define TEST_LOOPS 1000 -#endif - - -ECCRYPTO_STATUS SchnorrQ_test() -{ // Test the SchnorrQ digital signature scheme - int n, passed; - void *msg = NULL; - unsigned int len, valid = false; - unsigned char SecretKey[32], PublicKey[32], Signature[64]; - ECCRYPTO_STATUS Status = ECCRYPTO_SUCCESS; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Testing the SchnorrQ signature scheme: \n\n"); - - passed = 1; - for (n = 0; n < TEST_LOOPS; n++) - { - // Signature key generation - Status = SchnorrQ_FullKeyGeneration(SecretKey, PublicKey); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - - // Signature computation - msg = "a"; - len = 1; - Status = SchnorrQ_Sign(SecretKey, PublicKey, msg, len, Signature); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - - // Valid signature test - Status = SchnorrQ_Verify(PublicKey, msg, len, Signature, &valid); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - if (valid == false) { - passed = 0; - break; - } - - // Invalid signature test (flipping one bit of the message) - msg = "b"; - Status = SchnorrQ_Verify(PublicKey, msg, len, Signature, &valid); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - if (valid == true) { - passed = 0; - break; - } - } - if (passed==1) printf(" Signature tests.................................................................. PASSED"); - else { printf(" Signature tests... FAILED"); printf("\n"); Status = ECCRYPTO_ERROR_SIGNATURE_VERIFICATION; } - printf("\n"); - - return Status; -} - - -ECCRYPTO_STATUS SchnorrQ_run() -{ // Benchmark the SchnorrQ digital signature scheme - int n; - unsigned long long cycles, cycles1, cycles2; - void *msg = NULL; - unsigned int len = 0, valid = false; - unsigned char SecretKey[32], PublicKey[32], Signature[64]; - ECCRYPTO_STATUS Status = ECCRYPTO_SUCCESS; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Benchmarking the SchnorrQ signature scheme: \n\n"); - - cycles = 0; - for (n = 0; n < BENCH_LOOPS; n++) - { - cycles1 = cpucycles(); - Status = SchnorrQ_FullKeyGeneration(SecretKey, PublicKey); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - cycles2 = cpucycles(); - cycles = cycles+(cycles2-cycles1); - } - printf(" SchnorrQ's key generation runs in ............................................... %8lld ", cycles/BENCH_LOOPS); print_unit; - printf("\n"); - - cycles = 0; - for (n = 0; n < BENCH_LOOPS; n++) - { - cycles1 = cpucycles(); - Status = SchnorrQ_Sign(SecretKey, PublicKey, msg, len, Signature); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - cycles2 = cpucycles(); - cycles = cycles+(cycles2-cycles1); - } - printf(" SchnorrQ's signing runs in ...................................................... 
%8lld ", cycles/BENCH_LOOPS); print_unit; - printf("\n"); - - cycles = 0; - for (n = 0; n < BENCH_LOOPS; n++) - { - cycles1 = cpucycles(); - Status = SchnorrQ_Verify(PublicKey, msg, len, Signature, &valid); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - cycles2 = cpucycles(); - cycles = cycles+(cycles2-cycles1); - } - printf(" SchnorrQ's verification runs in ................................................. %8lld ", cycles/BENCH_LOOPS); print_unit; - printf("\n"); - - return Status; -} - - -ECCRYPTO_STATUS compressedkex_test() -{ // Test ECDH key exchange based on FourQ - int n, passed; - unsigned int i; - unsigned char SecretKeyA[32], PublicKeyA[32], SecretAgreementA[32]; - unsigned char SecretKeyB[32], PublicKeyB[32], SecretAgreementB[32]; - ECCRYPTO_STATUS Status = ECCRYPTO_SUCCESS; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Testing DH key exchange using compressed, 32-byte public keys: \n\n"); - - passed = 1; - for (n = 0; n < TEST_LOOPS; n++) - { - // Alice's keypair generation - Status = CompressedKeyGeneration(SecretKeyA, PublicKeyA); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - // Bob's keypair generation - Status = CompressedKeyGeneration(SecretKeyB, PublicKeyB); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - - // Alice's shared secret computation - Status = CompressedSecretAgreement(SecretKeyA, PublicKeyB, SecretAgreementA); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - // Bob's shared secret computation - Status = CompressedSecretAgreement(SecretKeyB, PublicKeyA, SecretAgreementB); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - - for (i = 0; i < 32; i++) { - if (SecretAgreementA[i] != SecretAgreementB[i]) { - passed = 0; - break; - } - } - } - if (passed==1) printf(" DH key exchange tests............................................................ PASSED"); - else { printf(" DH key exchange tests... FAILED"); printf("\n"); Status = ECCRYPTO_ERROR_SHARED_KEY; } - printf("\n"); - - return Status; -} - - -ECCRYPTO_STATUS compressedkex_run() -{ // Benchmark ECDH key exchange based on FourQ - int n; - unsigned long long cycles, cycles1, cycles2; - unsigned char SecretKeyA[32], PublicKeyA[32], SecretAgreementA[32]; - unsigned char SecretKeyB[32], PublicKeyB[32]; - ECCRYPTO_STATUS Status = ECCRYPTO_SUCCESS; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Benchmarking DH key exchange using compressed, 32-byte public keys: \n\n"); - - cycles = 0; - for (n = 0; n < BENCH_LOOPS; n++) - { - cycles1 = cpucycles(); - Status = CompressedKeyGeneration(SecretKeyA, PublicKeyA); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - cycles2 = cpucycles(); - cycles = cycles + (cycles2 - cycles1); - } - printf(" Keypair generation runs in ...................................................... %8lld ", cycles/BENCH_LOOPS); print_unit; - printf("\n"); - - Status = CompressedKeyGeneration(SecretKeyB, PublicKeyB); - cycles = 0; - for (n = 0; n < BENCH_LOOPS; n++) - { - cycles1 = cpucycles(); - Status = CompressedSecretAgreement(SecretKeyA, PublicKeyB, SecretAgreementA); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - cycles2 = cpucycles(); - cycles = cycles + (cycles2 - cycles1); - } - printf(" Secret agreement runs in ........................................................ 
%8lld ", cycles/BENCH_LOOPS); print_unit; - printf("\n"); - - return Status; -} - - -ECCRYPTO_STATUS kex_test() -{ // Test ECDH key exchange based on FourQ - int n, passed; - unsigned int i; - unsigned char SecretKeyA[32], PublicKeyA[64], SecretAgreementA[32]; - unsigned char SecretKeyB[32], PublicKeyB[64], SecretAgreementB[32]; - ECCRYPTO_STATUS Status = ECCRYPTO_SUCCESS; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Testing DH key exchange using uncompressed, 64-byte public keys: \n\n"); - - passed = 1; - for (n = 0; n < TEST_LOOPS; n++) - { - // Alice's keypair generation - Status = KeyGeneration(SecretKeyA, PublicKeyA); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - // Bob's keypair generation - Status = KeyGeneration(SecretKeyB, PublicKeyB); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - - // Alice's shared secret computation - Status = SecretAgreement(SecretKeyA, PublicKeyB, SecretAgreementA); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - // Bob's shared secret computation - Status = SecretAgreement(SecretKeyB, PublicKeyA, SecretAgreementB); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - - for (i = 0; i < 32; i++) { - if (SecretAgreementA[i] != SecretAgreementB[i]) { - passed = 0; - break; - } - } - } - if (passed==1) printf(" DH key exchange tests............................................................ PASSED"); - else { printf(" DH key exchange tests... FAILED"); printf("\n"); Status = ECCRYPTO_ERROR_SHARED_KEY; } - printf("\n"); - - return Status; -} - - -ECCRYPTO_STATUS kex_run() -{ // Benchmark ECDH key exchange based on FourQ - int n; - unsigned long long cycles, cycles1, cycles2; - unsigned char SecretKeyA[32], PublicKeyA[64], SecretAgreementA[32]; - unsigned char SecretKeyB[32], PublicKeyB[64]; - ECCRYPTO_STATUS Status = ECCRYPTO_SUCCESS; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Benchmarking DH key exchange using uncompressed, 64-byte public keys: \n\n"); - - cycles = 0; - for (n = 0; n < BENCH_LOOPS; n++) - { - cycles1 = cpucycles(); - Status = KeyGeneration(SecretKeyA, PublicKeyA); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - cycles2 = cpucycles(); - cycles = cycles + (cycles2 - cycles1); - } - printf(" Keypair generation runs in ...................................................... %8lld ", cycles/BENCH_LOOPS); print_unit; - printf("\n"); - - Status = KeyGeneration(SecretKeyB, PublicKeyB); - cycles = 0; - for (n = 0; n < BENCH_LOOPS; n++) - { - cycles1 = cpucycles(); - Status = SecretAgreement(SecretKeyA, PublicKeyB, SecretAgreementA); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - cycles2 = cpucycles(); - cycles = cycles + (cycles2 - cycles1); - } - printf(" Secret agreement runs in ........................................................ 
%8lld ", cycles/BENCH_LOOPS); print_unit; - printf("\n"); - - return Status; -} - - -ECCRYPTO_STATUS hash2curve_test() -{ // Test hashing to FourQ - int n, passed; - point_t P, Q; - point_extproj_t R; - unsigned char Value[32], HashedValue[64]; - f2elm_t* f2elmt = (f2elm_t*)&HashedValue[0]; - ECCRYPTO_STATUS Status = ECCRYPTO_SUCCESS; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Testing hashing to FourQ: \n\n"); - - passed = 1; - for (n = 0; n < TEST_LOOPS; n++) - { - RandomBytesFunction(Value, 32); - CryptoHashFunction(Value, 32, HashedValue); - mod1271(((felm_t*)f2elmt)[0]); - mod1271(((felm_t*)f2elmt)[1]); - - // Hash GF(p^2) element to curve - Status = HashToCurve((felm_t*)f2elmt, P); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - hash2curve_unsafe((felm_t*)f2elmt, Q); // Non-constant-time version for testing - if (fp2compare64((uint64_t*)P->x,(uint64_t*)Q->x)!=0 || fp2compare64((uint64_t*)P->y,(uint64_t*)Q->y)!=0) { passed=0; break; } - - // Check if point is on the curve - point_setup(P, R); - if (!ecc_point_validate(R)) { passed=0; break; } - } - if (passed==1) printf(" Hash to FourQ tests.............................................................. PASSED"); - else { printf(" Hash to FourQ tests... FAILED"); printf("\n"); Status = ECCRYPTO_ERROR_HASH_TO_CURVE; } - printf("\n"); - - return Status; -} - - -ECCRYPTO_STATUS hash2curve_run() -{ // Benchmark hashing to FourQ - int n; - unsigned long long cycles, cycles1, cycles2; - point_t P; - unsigned char Value[32], HashedValue[64]; - f2elm_t* f2elmt = (f2elm_t*)&HashedValue[0]; - ECCRYPTO_STATUS Status = ECCRYPTO_SUCCESS; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Benchmarking hashing to FourQ: \n\n"); - - cycles = 0; - for (n = 0; n < BENCH_LOOPS; n++) - { - RandomBytesFunction(Value, 32); - CryptoHashFunction(Value, 32, HashedValue); - mod1271(((felm_t*)f2elmt)[0]); - mod1271(((felm_t*)f2elmt)[1]); - - cycles1 = cpucycles(); - Status = HashToCurve((felm_t*)f2elmt, P); - if (Status != ECCRYPTO_SUCCESS) { - return Status; - } - cycles2 = cpucycles(); - cycles = cycles + (cycles2 - cycles1); - } - printf(" Hashing to FourQ runs in ....................................................... 
%8lld ", cycles/BENCH_LOOPS); print_unit; - printf("\n"); - - return Status; -} - - -int main() -{ - ECCRYPTO_STATUS Status = ECCRYPTO_SUCCESS; - - Status = SchnorrQ_test(); // Test SchnorrQ signature scheme - if (Status != ECCRYPTO_SUCCESS) { - printf("\n\n Error detected: %s \n\n", FourQ_get_error_message(Status)); - return false; - } - Status = SchnorrQ_run(); // Benchmark SchnorrQ signature scheme - if (Status != ECCRYPTO_SUCCESS) { - printf("\n\n Error detected: %s \n\n", FourQ_get_error_message(Status)); - return false; - } - - Status = compressedkex_test(); // Test Diffie-Hellman key exchange using compressed public keys - if (Status != ECCRYPTO_SUCCESS) { - printf("\n\n Error detected: %s \n\n", FourQ_get_error_message(Status)); - return false; - } - Status = compressedkex_run(); // Benchmark Diffie-Hellman key exchange using compressed public keys - if (Status != ECCRYPTO_SUCCESS) { - printf("\n\n Error detected: %s \n\n", FourQ_get_error_message(Status)); - return false; - } - - Status = kex_test(); // Test Diffie-Hellman key exchange using uncompressed public keys - if (Status != ECCRYPTO_SUCCESS) { - printf("\n\n Error detected: %s \n\n", FourQ_get_error_message(Status)); - return false; - } - Status = kex_run(); // Benchmark Diffie-Hellman key exchange using uncompressed public keys - if (Status != ECCRYPTO_SUCCESS) { - printf("\n\n Error detected: %s \n\n", FourQ_get_error_message(Status)); - return false; - } - - Status = hash2curve_test(); // Test hash to FourQ function - if (Status != ECCRYPTO_SUCCESS) { - printf("\n\n Error detected: %s \n\n", FourQ_get_error_message(Status)); - return false; - } - Status = hash2curve_run(); // Benchmark hash to FourQ function - if (Status != ECCRYPTO_SUCCESS) { - printf("\n\n Error detected: %s \n\n", FourQ_get_error_message(Status)); - return false; - } - - return true; -} \ No newline at end of file diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/tests/ecc_tests.c b/ffi-deps/FourQlib/FourQ_64bit_and_portable/tests/ecc_tests.c deleted file mode 100644 index fa534d5..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/tests/ecc_tests.c +++ /dev/null @@ -1,718 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. 
-* -* Abstract: testing code for FourQ's curve arithmetic -************************************************************************************/ - -#include "../FourQ_api.h" -#include "../FourQ_params.h" -#include "../FourQ_tables.h" -#include "test_extras.h" -#include - - -// Benchmark and test parameters -#if defined(GENERIC_IMPLEMENTATION) - #define BENCH_LOOPS 10000 // Number of iterations per bench - #define SHORT_BENCH_LOOPS 1000 // Number of iterations per bench (for expensive operations) -#else - #define BENCH_LOOPS 100000 - #define SHORT_BENCH_LOOPS 10000 -#endif -#define TEST_LOOPS 1000 // Number of iterations per test - - -bool ecc_test() -{ - bool clear_cofactor, OK = true; - unsigned int n; - int passed; - point_t A; - point_extproj_t P; - point_extproj_precomp_t Q; - f2elm_t t1; - uint64_t scalar[4], res_x[4], res_y[4]; - - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Testing FourQ's curve arithmetic: \n\n"); - - // Point doubling - passed = 1; - eccset(A); - point_setup(A, P); - - for (n=0; nx[0]); mod1271(A->x[1]); // Fully reduced P - mod1271(A->y[0]); mod1271(A->y[1]); - - // Result - res_x[0] = 0xC9099C54855859D6; res_x[1] = 0x2C3FD8822C82270F; res_x[2] = 0xA7B3F6E2043E8E68; res_x[3] = 0x4DA5B9E83AA7A1B2; - res_y[0] = 0x3EE089F0EB49AA14; res_y[1] = 0x2001EB3A57688396; res_y[2] = 0x1FEE5617A7E954CD; res_y[3] = 0x0FFDB0D761421F50; - - if (fp2compare64((uint64_t*)A->x, res_x)!=0 || fp2compare64((uint64_t*)A->y, res_y)!=0) passed=0; - if (passed==1) printf(" Point doubling tests .................................................................... PASSED"); - else { printf(" Point doubling tests ... FAILED"); printf("\n"); return false; } - printf("\n"); - - // Point addition - eccset(A); - point_setup(A, P); - - for (n=0; nta, t1); // d*ta - fp2add1271(t1, t1, t1); // 2*d*ta - fp2mul1271(t1, P->tb, Q->t2); // 2*d*t - fp2add1271(P->x, P->y, Q->xy); // x+y - fp2sub1271(P->y, P->x, Q->yx); // y-x - fp2copy1271(P->z, Q->z2); - fp2add1271(Q->z2, Q->z2, Q->z2); // 2*z - eccadd(Q, P); // 2*P - } - eccnorm(P, A); - mod1271(A->x[0]); mod1271(A->x[1]); // Fully reduced P - mod1271(A->y[0]); mod1271(A->y[1]); - - // Result - res_x[0] = 0xC9099C54855859D6; res_x[1] = 0x2C3FD8822C82270F; res_x[2] = 0xA7B3F6E2043E8E68; res_x[3] = 0x4DA5B9E83AA7A1B2; - res_y[0] = 0x3EE089F0EB49AA14; res_y[1] = 0x2001EB3A57688396; res_y[2] = 0x1FEE5617A7E954CD; res_y[3] = 0x0FFDB0D761421F50; - - if (fp2compare64((uint64_t*)A->x, res_x)!=0 || fp2compare64((uint64_t*)A->y, res_y)!=0) passed=0; - - eccset(A); - point_setup(A, P); - fp2copy1271((felm_t*)&PARAMETER_d, t1); - fp2mul1271(t1, P->x, t1); // d*x - fp2add1271(t1, t1, t1); // 2*d*x - fp2mul1271(t1, P->y, Q->t2); // 2*d*t - fp2add1271(P->x, P->y, Q->xy); // x+y - fp2sub1271(P->y, P->x, Q->yx); // y-x - fp2zero1271(Q->z2); *Q->z2[0] = 2; // 2*z - eccdouble(P); // P = 2P - - for (n=0; nx[0]); mod1271(A->x[1]); // Fully reduced P - mod1271(A->y[0]); mod1271(A->y[1]); - - // Result - res_x[0] = 0x6480B1EF0A151DB0; res_x[1] = 0x3E243958590C4D90; res_x[2] = 0xAA270F644A65D473; res_x[3] = 0x5327AF7D84238CD0; - res_y[0] = 0x5E06003D73C43EB1; res_y[1] = 0x3EF69A49CB7E0237; res_y[2] = 0x4E752648AC2EF0AB; res_y[3] = 0x293EB1E26DD23B4E; - - if (fp2compare64((uint64_t*)A->x, res_x)!=0 || fp2compare64((uint64_t*)A->y, res_y)!=0) passed=0; - - if (passed==1) printf(" Point addition tests .................................................................... 
PASSED"); - else { printf(" Point addition tests ... FAILED"); printf("\n"); return false; } - printf("\n"); - -#if (USE_ENDO == true) - // Psi endomorphism - eccset(A); - point_setup(A, P); - - for (n=0; nx[0]); mod1271(A->x[1]); // Fully reduced P - mod1271(A->y[0]); mod1271(A->y[1]); - - // Result - res_x[0] = 0xD8F3C8C24A2BC7E2; res_x[1] = 0x75AF54EDB41A2B93; res_x[2] = 0x4DE2466701F009A9; res_x[3] = 0x065249F9EDE0C798; - res_y[0] = 0x1C6E119ADD608104; res_y[1] = 0x06DBB85BFFB7C21E; res_y[2] = 0xFD234D6C4CFA3EC1; res_y[3] = 0x060A30903424BF13; - - if (fp2compare64((uint64_t*)A->x, res_x)!=0 || fp2compare64((uint64_t*)A->y, res_y)!=0) passed=0; - - if (passed==1) printf(" Psi endomorphism tests .................................................................. PASSED"); - else { printf(" Psi endomorphism tests ... FAILED"); printf("\n"); return false; } - printf("\n"); - - // Phi endomorphism - { - eccset(A); - point_setup(A, P); - - for (n=0; nx[0]); mod1271(A->x[1]); // Fully reduced P - mod1271(A->y[0]); mod1271(A->y[1]); - - // Result - res_x[0] = 0xD5B5A3061287DB16; res_x[1] = 0x5550AAB9E7A620EE; res_x[2] = 0xEC321E6CF33610FC; res_x[3] = 0x3E61EBB9A1CB0210; - res_y[0] = 0x7E2851D5A8E83FB9; res_y[1] = 0x5474BF8EC55603AE; res_y[2] = 0xA5077613491788D5; res_y[3] = 0x5476093DBF8BF6BF; - - if (fp2compare64((uint64_t*)A->x, res_x)!=0 || fp2compare64((uint64_t*)A->y, res_y)!=0) passed=0; - if (passed==1) printf(" Phi endomorphism tests .................................................................. PASSED"); - else { printf(" Phi endomorphism tests ... FAILED"); printf("\n"); return false; } - printf("\n"); - - // Scalar decomposition and recoding - { - uint64_t acc1, acc2, acc3, acc4, scalars[4]; - unsigned int digits[65], sign_masks[65]; - uint64_t k[4]; - int i; - - for (n=0; n= 0; i--) - { - acc1 = 2*acc1; acc2 = 2*acc2; acc3 = 2*acc3; acc4 = 2*acc4; - if (sign_masks[i] == (unsigned int)-1) { - acc1 += 1; - acc2 += (digits[i] & 1); - acc3 += ((digits[i] >> 1) & 1); - acc4 += ((digits[i] >> 2) & 1); - } else if (sign_masks[i] == 0) { - acc1 -= 1; - acc2 -= (digits[i] & 1); - acc3 -= ((digits[i] >> 1) & 1); - acc4 -= ((digits[i] >> 2) & 1); - } - } - if (scalar[0] != acc1 || scalar[1] != acc2 || scalar[2] != acc3 || scalar[3] != acc4) { passed=0; break; } - } - - if (passed==1) printf(" Recoding and decomposition tests ........................................................ PASSED"); - else { printf(" Recoding and decomposition tests ... FAILED"); printf("\n"); return false; } - printf("\n"); - } - } -#endif - - // Scalar multiplication - eccset(A); - clear_cofactor = false; - scalar[0] = 0x3AD457AB55456230; scalar[1] = 0x3A8B3C2C6FD86E0C; scalar[2] = 0x7E38F7C9CFBB9166; scalar[3] = 0x0028FD6CBDA458F0; - - for (n=0; nx, res_x)!=0 || fp2compare64((uint64_t*)A->y, res_y)!=0) passed=0; - - - eccset(A); - clear_cofactor = true; - scalar[0] = 0x3AD457AB55456230; scalar[1] = 0x3A8B3C2C6FD86E0C; scalar[2] = 0x7E38F7C9CFBB9166; scalar[3] = 0x0028FD6CBDA458F0; - - for (n=0; nx, res_x)!=0 || fp2compare64((uint64_t*)A->y, res_y)!=0) passed=0; - - if (passed==1) printf(" Scalar multiplication tests ............................................................. PASSED"); - else { printf(" Scalar multiplication tests ... 
FAILED"); printf("\n"); return false; } - printf("\n"); - - { - point_t AA, B, C; - unsigned int j, w, v, e, d; - uint64_t k[4]; - unsigned int digits_fixed[NBITS_ORDER_PLUS_ONE+(W_FIXEDBASE*V_FIXEDBASE)-1] = {0}; - - // Scalar recoding using the mLSB-set representation - w = W_FIXEDBASE; - v = V_FIXEDBASE; - e = E_FIXEDBASE; - d = D_FIXEDBASE; - - for (n=0; nx,(uint64_t*)C->x)!=0 || fp2compare64((uint64_t*)B->y,(uint64_t*)C->y)!=0) { passed=0; break; } - } - - if (passed==1) printf(" Fixed-base scalar multiplication tests .................................................. PASSED"); - else { printf(" Fixed-base scalar multiplication tests ... FAILED"); printf("\n"); return false; } - printf("\n"); - } - - { - point_t PP, QQ, RR, UU, TT; - point_extproj_precomp_t AA; - point_extproj_t BB; - uint64_t k[4], l[4], kk[4]; - - // Double scalar multiplication - eccset(QQ); - eccset(PP); - - for (n=0; nx, UU->y, AA->xy); - fp2sub1271(UU->y, UU->x, AA->yx); - fp2mul1271(UU->x, UU->y, AA->t2); - fp2add1271(AA->t2, AA->t2, AA->t2); - fp2mul1271(AA->t2, (felm_t*)&PARAMETER_d, AA->t2); - fp2zero1271(AA->z2); AA->z2[0][0] = 2; - point_setup(TT, BB); - - eccadd(AA, BB); - eccnorm(BB, UU); - - if (fp2compare64((uint64_t*)UU->x,(uint64_t*)RR->x)!=0 || fp2compare64((uint64_t*)UU->y,(uint64_t*)RR->y)!=0) { passed=0; break; } - } - - if (passed==1) printf(" Double scalar multiplication tests ...................................................... PASSED"); - else { printf(" Double scalar multiplication tests ... FAILED"); printf("\n"); return false; } - printf("\n"); - } - - return OK; -} - - -bool ecc_run() -{ - bool OK = true; - unsigned int n; - unsigned long long cycles, cycles1, cycles2; - point_t A, B; - point_extproj_t P; - point_extproj_precomp_t Q, Table[8]; - f2elm_t t1; - uint64_t scalar[4]; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Benchmarking FourQ's curve arithmetic \n\n"); - - // Point doubling (twisted Edwards a=-1) - eccset(A); - point_setup(A, P); - - cycles = 0; - for (n=0; nx, t1); // d*x - fp2add1271(t1, t1, t1); // 2*d*x - fp2mul1271(t1, P->y, Q->t2); // 2*d*t - fp2add1271(P->x, P->y, Q->xy); // x+y - fp2sub1271(P->y, P->x, Q->yx); // y-x - fp2zero1271(Q->z2); *Q->z2[0] = 2; // 2*z - eccdouble(P); // P = 2P - - cycles = 0; - for (n=0; n -#include - - -// Benchmark and test parameters -#define BENCH_LOOPS 10000 // Number of iterations per bench -#define SHORT_BENCH_LOOPS 1000 // Number of iterations per bench (for expensive operations) -#define TEST_LOOPS 1000 // Number of iterations per test - - -bool fp2_test() -{ // Tests for the quadratic extension field arithmetic - bool OK = true; - int n, passed; - f2elm_t a, b, c, d, e, f; - digit_t ma[NWORDS_ORDER], mb[NWORDS_ORDER], mc[NWORDS_ORDER], md[NWORDS_ORDER], me[NWORDS_ORDER], mf[NWORDS_ORDER], one[NWORDS_ORDER] = {0}; - one[0] = 1; - - printf("\n--------------------------------------------------------------------------------------------------------\n\n"); - printf("Testing quadratic extension field arithmetic over GF((2^127-1)^2): \n\n"); - - // GF(p^2) multiplication using p = 2^127-1 - passed = 1; - for (n=0; n - #include -#endif -#if (OS_TARGET == OS_LINUX) && (TARGET == TARGET_ARM || TARGET == TARGET_ARM64) - #include -#endif -#include -#include - - -int64_t cpucycles(void) -{ // Access system counter for benchmarking -#if (OS_TARGET == OS_WIN) && (TARGET == TARGET_AMD64 || TARGET == TARGET_x86) - return __rdtsc(); -#elif (OS_TARGET == 
OS_WIN) && (TARGET == TARGET_ARM) - return __rdpmccntr64(); -#elif (OS_TARGET == OS_LINUX) && (TARGET == TARGET_AMD64 || TARGET == TARGET_x86) - unsigned int hi, lo; - - asm volatile ("rdtsc\n\t" : "=a" (lo), "=d"(hi)); - return ((int64_t)lo) | (((int64_t)hi) << 32); -#elif (OS_TARGET == OS_LINUX) && (TARGET == TARGET_ARM || TARGET == TARGET_ARM64) - struct timespec time; - - clock_gettime(CLOCK_REALTIME, &time); - return (int64_t)(time.tv_sec*1e9 + time.tv_nsec); -#else - return 0; -#endif -} - - -int fp2compare64(uint64_t* a, uint64_t* b) -{ // Comparing uint64_t digits of two quadratic extension field elements, ai=bi? : (0) equal, (1) unequal - // NOTE: this function does not have constant-time execution. TO BE USED FOR TESTING ONLY. - unsigned int i; - - for (i = 0; i < (2*NWORDS64_FIELD); i++) { - if (a[i] != b[i]) return 1; - } - - return 0; -} - - -void random_scalar_test(uint64_t* a) -{ // Generating a pseudo-random scalar value in [0, 2^256-1] - // NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY. - unsigned char* string = (unsigned char*)&a[0]; - unsigned int i; - - for (i = 0; i < (sizeof(uint64_t)*NWORDS64_ORDER); i++) { - string[i] = (unsigned char)rand(); - } -} - - -void fp2random1271_test(f2elm_t a) -{ // Generating a pseudo-random GF(p^2) element a+b*i, where a,b in [0, 2^127-1] - // NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY. - digit_t mask_7fff = (digit_t)-1 >> 1; - - random_scalar_test((uint64_t*)&a[0]); - a[0][NWORDS_FIELD - 1] &= mask_7fff; - a[1][NWORDS_FIELD - 1] &= mask_7fff; -} - - -void random_order_test(digit_t* a) -{ // Generating a pseudo-random element in [0, order-1] - // SECURITY NOTE: distribution is not fully uniform. TO BE USED FOR TESTING ONLY. - int i; - unsigned char* string = (unsigned char*)a; - - for (i = 0; i < 31; i++) { - string[i] = (unsigned char)rand(); // Obtain 246-bit number - } - string[30] &= 0x3F; - string[31] = 0; - subtract_mod_order(a, (digit_t*)&curve_order, a); - - return; -} - - -bool verify_mLSB_recoding(uint64_t* scalar, int* digits) -{ // Verification of the mLSB-set's recoding algorithm used in fixed-base scalar multiplication - unsigned int j, l = L_FIXEDBASE, d = D_FIXEDBASE; - uint64_t temp, temp2, carry, borrow, generated_scalar[NWORDS64_ORDER] = {0}; - int i, digit; - - for (i = (l-1); i >= 0; i--) - { - // Shift generated scalar to the left by 1 (multiply by 2) - temp = ((generated_scalar[0] >> (RADIX64-1)) & 1) ; - generated_scalar[0] = generated_scalar[0] << 1; - - for (j = 1; j < NWORDS64_ORDER; j++) { - temp2 = ((generated_scalar[j] >> (RADIX64-1)) & 1) ; - generated_scalar[j] = (generated_scalar[j] << 1) | temp; - temp = temp2; - } - - // generated scalar + digit_i - if (i < (int)d) { - digit = digits[i] | 1; - if (digit >= 0) { - generated_scalar[0] = generated_scalar[0] + digit; - carry = (generated_scalar[0] < (unsigned int)digit); - for (j = 1; j < NWORDS64_ORDER; j++) - { - generated_scalar[j] = generated_scalar[j] + carry; - carry = (generated_scalar[j] < carry); - } - } else { - borrow = 0; - temp = (uint64_t)(-digit); - for (j = 0; j < NWORDS64_ORDER; j++) - { - temp2 = generated_scalar[j] - temp; - carry = (generated_scalar[j] < temp); - generated_scalar[j] = temp2 - borrow; - borrow = carry || (temp2 < borrow); - temp = 0; - } - } - } else { - digit = digits[i]*(digits[i-(i/d)*d] | 1); - if (digit >= 0) { - generated_scalar[0] = generated_scalar[0] + digit; - carry = (generated_scalar[0] < (unsigned int)digit); - for (j = 1; j < NWORDS64_ORDER; j++) - { - 
generated_scalar[j] = generated_scalar[j] + carry; - carry = (generated_scalar[j] < carry); - } - } else { - borrow = 0; - temp = (uint64_t)(-digit); - for (j = 0; j < NWORDS64_ORDER; j++) - { - temp2 = generated_scalar[j] - temp; - carry = (generated_scalar[j] < temp); - generated_scalar[j] = temp2 - borrow; - borrow = carry || (temp2 < borrow); - temp = 0; - } - } - } - } - - for (j = 0; j < NWORDS64_ORDER; j++) - { - if (scalar[j] != generated_scalar[j]) - return false; - } - - return true; -} - - -static inline bool fpeq1271_unsafe(felm_t in1, felm_t in2) -{ - return memcmp(in1, in2, sizeof(felm_t)) == 0; -} - - -void hash2curve_unsafe(f2elm_t r, point_t out) -{ // (Unsafe, non-constant-time version of) hash to curve function for testing - digit_t *r0 = (digit_t*)r[0], *r1 = (digit_t*)r[1]; - felm_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16, t17, t18; - felm_t one = {0}; - one[0] = 1; - - digit_t* x0 = (digit_t*)out->x[0]; - digit_t* x1 = (digit_t*)out->x[1]; - digit_t* y0 = (digit_t*)out->y[0]; - digit_t* y1 = (digit_t*)out->y[1]; - - fpadd1271(r0, r1, t0); - fpsub1271(r0, r1, t1); - fpmul1271(t0, t1, t0); - fpmul1271(r0, r1, t1); - fpadd1271(t1, t1, t1); - fpadd1271(t1, t1, t2); - fpadd1271(t0, t2, t2); - fpadd1271(t0, t0, t0); - fpsub1271(t0, t1, t3); - fpadd1271(t3, one, t0); - fpmul1271(A0, t0, t4); - fpmul1271(A1, t2, t1); - fpsub1271(t1, t4, t4); - fpmul1271(A1, t0, t5); - fpmul1271(A0, t2, t1); - fpadd1271(t1, t5, t1); - fpadd1271(t0, t2, t5); - fpsub1271(t0, t2, t6); - fpmul1271(t5, t6, t6); - fpmul1271(t2, t0, t5); - fpadd1271(t5, t5, t5); - fpmul1271(con1, t3, t7); - fpsub1271(t6, t7, t8); - fpmul1271(con2, t2, t7); - fpadd1271(t7, t8, t8); - fpmul1271(con1, t2, t7); - fpsub1271(t5, t7, t9); - fpmul1271(con2, t3, t7); - fpsub1271(t9, t7, t9); - fpmul1271(t4, t8, t5); - fpmul1271(t1, t9, t7); - fpadd1271(t5, t7, t7); - fpmul1271(t4, t9, t5); - fpmul1271(t1, t8, t10); - fpsub1271(t5, t10, t10); - fpsqr1271(t7, t5); - fpsqr1271(t10, t7); - fpadd1271(t5, t7, t5); - fpexp1251(t5, t7); - fpsqr1271(t7, t7); - fpmul1271(t5, t7, t7); - fpcopy1271(A0, t8); - fpcopy1271(A1, t9); - fpneg1271(t8); - fpneg1271(t9); - fpadd1271(A0, t4, t5); - fpsub1271(A1, t1, t11); - - if (fpeq1271_unsafe(t7, one)) { - fpcopy1271(t8, t3); - fpcopy1271(t9, t10); - } else { - fpcopy1271(t5, t3); - fpcopy1271(t11, t10); - } - - fpmul1271(t0, t3, t5); - fpmul1271(t2, t10, t8); - fpsub1271(t5, t8, t8); - fpmul1271(t2, t3, t5); - fpmul1271(t0, t10, t9); - fpadd1271(t5, t9, t9); - fpadd1271(t3, t10, t5); - fpsub1271(t3, t10, t11); - fpmul1271(t5, t11, t5); - fpmul1271(t3, t10, t11); - fpadd1271(t11, t11, t11); - fpmul1271(t3, t4, t12); - fpmul1271(t1, t10, t13); - fpadd1271(t12, t13, t13); - fpmul1271(t4, t10, t14); - fpmul1271(t1, t3, t12); - fpsub1271(t14, t12, t12); - fpsub1271(t5, t13, t5); - fpsub1271(t11, t12, t11); - fpadd1271(t5, t6, t5); - fpmul1271(t0, t2, t6); - fpadd1271(t6, t6, t6); - fpadd1271(t11, t6, t11); - fpmul1271(t5, t8, t6); - fpmul1271(t9, t11, t12); - fpsub1271(t6, t12, t6); - fpmul1271(t5, t9, t12); - fpmul1271(t8, t11, t8); - fpadd1271(t12, t8, t12); - fpadd1271(t6, t6, t6); - fpadd1271(t6, t6, t6); - fpadd1271(t6, t6, t6); - fpadd1271(t6, t6, t6); - fpadd1271(t12, t12, t12); - fpadd1271(t12, t12, t12); - fpadd1271(t12, t12, t12); - fpadd1271(t12, t12, t12); - fpadd1271(t0, t3, t14); - fpadd1271(t14, t14, t14); - fpadd1271(t2, t10, t8); - fpadd1271(t8, t8, t8); - fpmul1271(t6, t14, t4); - fpmul1271(t8, t12, t1); - fpsub1271(t4, t1, t4); - fpmul1271(t12, 
t14, t9); - fpmul1271(t6, t8, t1); - fpadd1271(t1, t9, t1); - fpsqr1271(t12, t5); - fpsqr1271(t6, t9); - fpadd1271(t5, t9, t9); - fpsqr1271(t1, t5); - fpsqr1271(t4, t11); - fpadd1271(t11, t5, t11); - fpsqr1271(t11, t5); - fpmul1271(t5, t9, t5); - fpexp1251(t5, t7); - fpsqr1271(t7, t13); - fpsqr1271(t13, t13); - fpmul1271(t11, t13, t13); - fpmul1271(t9, t13, t13); - fpmul1271(t5, t13, t13); - fpmul1271(t13, t7, t7); - fpmul1271(t5, t7, t7); - fpadd1271(t6, t7, t5); - fpdiv1271(t5); - fpexp1251(t5, t9); - fpsqr1271(t9, t11); - fpsqr1271(t11, t11); - fpmul1271(t5, t11, t11); - fpmul1271(t5, t9, t9); - fpmul1271(t11, t12, t11); - fpsqr1271(t9, t7); - fpadd1271(one, one, t15); - fpcopy1271(t11, t16); - fpcopy1271(t15, x0); - fpneg1271(x0); - - if (fpeq1271_unsafe(t5, t7)) { - fpcopy1271(t15, t17); - fpcopy1271(t16, t18); - } else { - fpcopy1271(t16, t17); - fpcopy1271(x0, t18); - } - - fpadd1271(t13, t13, t13); - fpsub1271(t3, t0, y0); - fpsub1271(t10, t2, y1); - fpmul1271(y0, t6, t16); - fpmul1271(y1, t12, t15); - fpsub1271(t16, t15, t15); - fpmul1271(y0, t12, y0); - fpmul1271(t6, y1, t16); - fpadd1271(t16, y0, t16); - fpmul1271(t15, t4, x0); - fpmul1271(t1, t16, y0); - fpadd1271(x0, y0, y0); - fpmul1271(t4, t16, y1); - fpmul1271(t1, t15, x0); - fpsub1271(y1, x0, y1); - fpmul1271(y0, t13, y0); - fpmul1271(y1, t13, y1); - fpmul1271(b0, t3, t15); - fpmul1271(b1, t10, x0); - fpsub1271(t15, x0, t15); - fpmul1271(b0, t10, t16); - fpmul1271(b1, t3, x0); - fpadd1271(t16, x0, t16); - fpmul1271(t15, t4, t5); - fpmul1271(t1, t16, x0); - fpadd1271(x0, t5, x0); - fpmul1271(t4, t16, x1); - fpmul1271(t1, t15, t5); - fpsub1271(x1, t5, x1); - fpmul1271(x0, t0, t5); - fpmul1271(x1, t2, t15); - fpsub1271(t5, t15, t15); - fpmul1271(x1, t0, t5); - fpmul1271(x0, t2, t16); - fpadd1271(t5, t16, t16); - fpmul1271(t15, t14, t5); - fpmul1271(t16, t8, x0); - fpsub1271(t5, x0, x0); - fpmul1271(t15, t8, t5); - fpmul1271(t16, t14, x1); - fpadd1271(x1, t5, x1); - fpmul1271(x0, t17, t5); - fpmul1271(x1, t18, t15); - fpsub1271(t5, t15, t15); - fpmul1271(t17, x1, t5); - fpmul1271(t18, x0, t16); - fpadd1271(t16, t5, t16); - fpmul1271(t13, t9, t13); - fpmul1271(t15, t13, x0); - fpmul1271(t16, t13, x1); - - // Clear cofactor - point_extproj_t P; - point_setup(out, P); - cofactor_clearing(P); - eccnorm(P, out); -} diff --git a/ffi-deps/FourQlib/FourQ_64bit_and_portable/tests/test_extras.h b/ffi-deps/FourQlib/FourQ_64bit_and_portable/tests/test_extras.h deleted file mode 100644 index 267f9ef..0000000 --- a/ffi-deps/FourQlib/FourQ_64bit_and_portable/tests/test_extras.h +++ /dev/null @@ -1,53 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. -* -* Abstract: utility header file for tests -************************************************************************************/ - -#ifndef __TEST_EXTRAS_H__ -#define __TEST_EXTRAS_H__ - - -// For C++ -#ifdef __cplusplus -extern "C" { -#endif - - -#if (TARGET == TARGET_ARM || TARGET == TARGET_ARM64) - #define print_unit printf("nsec"); -#else - #define print_unit printf("cycles"); -#endif - - -// Access system counter for benchmarking -int64_t cpucycles(void); - -// Comparing uint64_t digits of two quadratic extension field elements, ai=bi? 
: (0) equal, (1) unequal -int fp2compare64(uint64_t* a, uint64_t* b); - -// Generating a pseudo-random scalar value in [0, 2^256-1] -void random_scalar_test(uint64_t* a); - -// Generating a pseudo-random GF(p^2) element a+b*i, where a,b in [0, 2^127-1] -void fp2random1271_test(f2elm_t a); - -// Generating a pseudo-random element in [0, order-1] -void random_order_test(digit_t* a); - -// Verification of the mLSB-set's recoding algorithm used in fixed-base scalar multiplication -bool verify_mLSB_recoding(uint64_t* scalar, int* digits); - -// (Unsafe, non-constant-time version of) hash to curve function for testing -void hash2curve_unsafe(f2elm_t r, point_t out); - - -#ifdef __cplusplus -} -#endif - - -#endif \ No newline at end of file diff --git a/ffi-deps/FourQlib/LICENSE b/ffi-deps/FourQlib/LICENSE deleted file mode 100644 index 4b1ad51..0000000 --- a/ffi-deps/FourQlib/LICENSE +++ /dev/null @@ -1,21 +0,0 @@ - MIT License - - Copyright (c) Microsoft Corporation. All rights reserved. - - Permission is hereby granted, free of charge, to any person obtaining a copy - of this software and associated documentation files (the "Software"), to deal - in the Software without restriction, including without limitation the rights - to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - copies of the Software, and to permit persons to whom the Software is - furnished to do so, subject to the following conditions: - - The above copyright notice and this permission notice shall be included in all - copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE diff --git a/ffi-deps/FourQlib/README.md b/ffi-deps/FourQlib/README.md deleted file mode 100644 index 79ff89f..0000000 --- a/ffi-deps/FourQlib/README.md +++ /dev/null @@ -1,128 +0,0 @@ -## FourQlib v3.1 (C Edition) - -**FourQlib** implements essential elliptic curve and cryptographic functions based on FourQ, -a high-security, high-performance elliptic curve that targets the 128-bit security level [1]. At the -high level, **FourQlib** consists of a set of implementations targeting different platforms with different -levels of portability and performance. The cryptographic and elliptic curve API is common to all the -implementations. - -The library was developed by [Microsoft Research](http://research.microsoft.com/) and is available under the MIT License. - -## Contents - -Version 3.1 includes the following implementations: - -* [`FourQ_32bit`](FourQ_32bit/): a portable implementation especially tailored for 32-bit platforms. -* [`FourQ_64bit_and_portable`](FourQ_64bit_and_portable/): a portable implementation for 32-bit and 64-bit platforms with optional - optimizations for x64 and 64-bit ARMv8 platforms. -* [`FourQ_ARM`](FourQ_ARM/): an optimized implementation for 32-bit ARMv6, ARMv7 and ARMv7-M (Cortex-M4) platforms. -* [`FourQ_ARM_side_channel`](FourQ_ARM_side_channel/): an optimized implementation for 32-bit ARMv6, ARMv7 and ARMv7-M (Cortex-M4), - including strong countermeasures against a wide variety of side-channel attacks. 
-* [`FourQ_ARM_NEON`](FourQ_ARM_NEON/): an optimized implementation for 32-bit ARM platforms with NEON support. - -The elliptic curve and crypto API can be found in `FourQ_api.h`, which is available per implementation. - -The [`FourQ-Magma`](FourQ-Magma/) folder includes easy-to-read scripts written in Magma. - -### Complementary cryptographic functions - -Random values are generated with `/dev/urandom` in the case of Linux, and with the function `BCryptGenRandom()` -in the case of Windows. Check the [`random`](random/) folder for details. - -The library includes an implementation of SHA-512 which is used by default by SchnorrQ signatures (see [`sha512`](sha512/)). - -Users can provide their own PRNG and hash implementations by replacing the functions in the [`random`](random/) and [`sha512`](sha512/)folders, and applying the corresponding changes to the settings in `FourQ.h` (in a given implementation). -Refer to [2] for the security requirements for the cryptographic hash function. - -## What's new - -### In version 3.0 - -* New support for co-factor ECDH and SchnorrQ signatures. -* New implementations for 32-bit processors, 32-bit ARMv6 and ARMv7 processors, and 32-bit ARM Cortex-M4 microcontroller. -* New implementation for ARMv6/ARMv7/ARMv7-M with strong countermeasures against several side-channel attacks. -* New support for 64-bit ARMv8 processors. - -### In version 3.1 - -* New hash to curve functionality (only supported in the portable implementation FourQ_64bit_and_portable). - -## Main features - -* Support for co-factor Elliptic Curve Diffie-Hellman (ECDH) key exchange [3], the SchnorrQ digital signature scheme [2], and - hash to curve conversion. -* Support for 3 core elliptic curve operations: variable-base, fixed-base and double-scalar multiplications. -* Support for Windows using Microsoft Visual Studio and Linux using GNU GCC or clang. -* Includes a basic implementation using portable C to enable support on a wide range of platforms including x64, x86 - and ARM, Windows and Linux. -* Includes optimized implementations for 64-bit ARMv8 and x64 platforms with optional, high-performance x64 assembly for Linux [1]. -* Includes high-performance implementations for 32-bit ARM processors with NEON support [4], for 32-bit ARMv6 and - ARMv7 processors, and for 32-bit ARM Cortex-M4 microcontrollers [5]. -* Includes side-channel secure implementations for 32-bit ARMv6/ARMv7 and for ARMv7-M (Cortex-M4) microcontrollers [5]. -* Includes testing and benchmarking code for field arithmetic, elliptic curve and cryptographic functions. -* All functions evaluating secret data have regular, constant-time execution, protecting against timing and cache attacks. -* Includes an option to disable the use of the fast endomorphisms. - -## Quick start - -### Building the library and executing the tests on Linux - -One can quickly test a given implementation by executing from the corresponding folder and using a supported architecture: - -```sh -$ make ARCH=[x64/x86/ARM/ARM64] -``` - -GNU GCC is used by default. After compilation, run `fp_tests`, `ecc_tests` or `crypto_tests`. 
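Editor's note on the "Complementary cryptographic functions" section above: it states that users can swap in their own PRNG and hash by replacing the code in the `random` and `sha512` folders. Purely as an illustrative sketch (not part of this patch or of FourQlib), a drop-in PRNG only has to match the `random_bytes()` prototype declared in `random/random.h` (also removed by this patch) and return true on success; the `/dev/urandom` path and error handling below are assumptions for a Linux-like system.

```c
#include <stdio.h>

// Illustrative stand-in PRNG matching the random_bytes() prototype from
// random/random.h; reads from /dev/urandom (assumes a Linux-like system).
int random_bytes(unsigned char* random_array, unsigned int nbytes)
{
    FILE* f = fopen("/dev/urandom", "rb");
    if (f == NULL) return 0;                        // failure: no entropy source
    size_t got = fread(random_array, 1, nbytes, f); // fill the caller's buffer
    fclose(f);
    return got == (size_t)nbytes;                   // true on success, as in random.c
}
```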
- -Below are the architectures supported by each implementation: - -* [`FourQ_32bit`](FourQ_32bit/): x86 and ARM -* [`FourQ_64bit_and_portable`](FourQ_64bit_and_portable/): x64, x86, ARM and ARM64 -* [`FourQ_ARM`](FourQ_ARM/), [`FourQ_ARM_side_channel`](FourQ_ARM_side_channel/) and [`FourQ_ARM_NEON`](FourQ_ARM_NEON/): ARM - -For example, to compile the optimized x64 implementation using assembly with GNU GCC, using the efficient endomorphisms on a machine with AVX2 support (e.g, Intel's Haswell or Skylake), execute: - -```sh -$ cd FourQ_64bit_and_portable -$ make ARCH=x64 -``` - -Additional compilation options are available. Refer to the `README` files in a given implementation folder for complete details. - -**NOTE:** the above instructions apply to all the "processor-class" implementations. For instructions on how to compile on an ARM Cortex-M (ARMv7-M) microcontroller, refer to the `README` files in [`FourQ_ARM_side_channel`](FourQ_ARM_side_channel/) or [`FourQ_ARM`](FourQ_ARM/). - -### Building the library and executing the tests on Windows - -`FourQ_32bit` and `FourQ_64bit_and_portable` include Visual Studio solutions for compilation on Windows. Refer to the corresponding `README` files for instructions. - -## License - -**FourQlib** is licensed under the MIT License; see [`License`](LICENSE) for details. - -Files `stm32f4_wrapper.c` and `stm32f4_wrapper.h` in the [`FourQ_ARM`](FourQ_ARM/) and [`FourQ_ARM_side_channel`](FourQ_ARM_side_channel/) folders are by Joost Rijneveld and are released under the CC0 1.0 Universal license. - -Files in the folder [`FourQ_ARM_side_channel/libopencm3`](FourQ_ARM_side_channel/libopencm3/) and [`FourQ_ARM/libopencm3`](FourQ_ARM/libopencm3/) are from the `libopencm3` project and are under the GNU LGPL v3.0 license. - -The SHA-512 implementation is by D.J. Bernstein and is released to the public domain. - -# References - -[1] Craig Costello and Patrick Longa, "FourQ: four-dimensional decompositions on a Q-curve over the Mersenne prime". Advances in Cryptology - ASIACRYPT 2015, 2015. -The extended version is available [`here`](http://eprint.iacr.org/2015/565). - -[2] Craig Costello and Patrick Longa. "SchnorrQ: Schnorr signatures on FourQ". MSR Technical Report, 2016. -Available [`here`](https://www.microsoft.com/en-us/research/wp-content/uploads/2016/07/SchnorrQ.pdf). - -[3] Watson Ladd, Patrick Longa and Richard Barnes, "Curve4Q". Internet-Draft, draft-ladd-cfrg-4q-01, 2017. -Available [`here`](https://www.ietf.org/id/draft-ladd-cfrg-4q-01.txt). - -[4] Patrick Longa, "FourQNEON: faster elliptic curve scalar multiplications on ARM processors". Selected Areas in Cryptography (SAC 2016), 2016. -Preprint available [`here`](http://eprint.iacr.org/2016/645). - -[5] Zhe Liu, Patrick Longa, Geovandro Pereira, Oscar Reparaz and Hwajeong Seo, "FourQ on embedded devices with strong countermeasures against side-channel attacks". -Preprint available [`here`](http://eprint.iacr.org/2017/434). - -# Contributing - -This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 
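Editor's note: since this patch removes the header that defines the API surface described in the README above, the sketch below gives a rough picture of how the SchnorrQ functions are typically called. The function names and parameter lists are recalled from `FourQ_api.h` and should be treated as assumptions rather than a verified excerpt; only `ECCRYPTO_SUCCESS` and the keygen/sign/verify flow are confirmed by the test code elsewhere in this patch.

```c
#include <stdio.h>
#include "FourQ_api.h"   // removed by this patch; names below are assumptions

int main(void)
{
    unsigned char sk[32], pk[32], sig[64];          // key and signature sizes as recalled
    unsigned char msg[] = "test message";
    unsigned int valid = 0;

    // Generate a key pair, sign the message, then verify the signature.
    if (SchnorrQ_FullKeyGeneration(sk, pk) != ECCRYPTO_SUCCESS) return 1;
    if (SchnorrQ_Sign(sk, pk, msg, sizeof(msg), sig) != ECCRYPTO_SUCCESS) return 1;
    if (SchnorrQ_Verify(pk, msg, sizeof(msg), sig, &valid) != ECCRYPTO_SUCCESS) return 1;

    printf("signature %s\n", valid ? "valid" : "invalid");
    return valid ? 0 : 1;
}
```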
diff --git a/ffi-deps/FourQlib/SECURITY.md b/ffi-deps/FourQlib/SECURITY.md deleted file mode 100644 index 869fdfe..0000000 --- a/ffi-deps/FourQlib/SECURITY.md +++ /dev/null @@ -1,41 +0,0 @@ - - -## Security - -Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). - -If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/opensource/security/definition), please report it to us as described below. - -## Reporting Security Issues - -**Please do not report security vulnerabilities through public GitHub issues.** - -Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/opensource/security/create-report). - -If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/opensource/security/pgpkey). - -You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://aka.ms/opensource/security/msrc). - -Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: - - * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) - * Full paths of source file(s) related to the manifestation of the issue - * The location of the affected source code (tag/branch/commit or direct URL) - * Any special configuration required to reproduce the issue - * Step-by-step instructions to reproduce the issue - * Proof-of-concept or exploit code (if possible) - * Impact of the issue, including how an attacker might exploit the issue - -This information will help us triage your report more quickly. - -If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/opensource/security/bounty) page for more details about our active programs. - -## Preferred Languages - -We prefer all communications to be in English. - -## Policy - -Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/opensource/security/cvd). - - diff --git a/ffi-deps/FourQlib/random/random.c b/ffi-deps/FourQlib/random/random.c deleted file mode 100644 index db17aeb..0000000 --- a/ffi-deps/FourQlib/random/random.c +++ /dev/null @@ -1,86 +0,0 @@ -/*********************************************************************************** -* FourQlib: a high-performance crypto library based on the elliptic curve FourQ -* -* Copyright (c) Microsoft Corporation. All rights reserved. 
-* -* Abstract: pseudo-random function -************************************************************************************/ - -#include "random.h" -#include -#include -#if defined(__WINDOWS__) - #include - #include - #define RTL_GENRANDOM "SystemFunction036" - NTSTATUS last_bcrypt_error = 0; -#elif defined(__LINUX__) - #include - #include - static int lock = -1; -#endif - - -static __inline void delay(unsigned int count) -{ - while (count--) {} -} - - -int random_bytes(unsigned char* random_array, unsigned int nbytes) -{ // Generation of "nbytes" of random values - -#if defined(__WINDOWS__) - if (BCRYPT_SUCCESS(last_bcrypt_error)) { - NTSTATUS status = BCryptGenRandom(NULL, random_array, nbytes, BCRYPT_USE_SYSTEM_PREFERRED_RNG); - - if (BCRYPT_SUCCESS(status)) { - return true; - } - last_bcrypt_error = status; - } - - HMODULE hAdvApi = LoadLibraryA("ADVAPI32.DLL"); - if (!hAdvApi) { - return false; - } - - BOOLEAN(APIENTRY * RtlGenRandom)(void*, ULONG) = (BOOLEAN(APIENTRY*)(void*, ULONG))GetProcAddress(hAdvApi, RTL_GENRANDOM); - - BOOLEAN genrand_result = FALSE; - if (RtlGenRandom) { - genrand_result = RtlGenRandom(random_array, nbytes); - } - - FreeLibrary(hAdvApi); - - if (!genrand_result) { - return false; - } - -#elif defined(__LINUX__) - int r, n = nbytes, count = 0; - - if (lock == -1) { - do { - lock = open("/dev/urandom", O_RDONLY); - if (lock == -1) { - delay(0xFFFFF); - } - } while (lock == -1); - } - - while (n > 0) { - do { - r = read(lock, random_array+count, n); - if (r == -1) { - delay(0xFFFF); - } - } while (r == -1); - count += r; - n -= r; - } -#endif - - return true; -} \ No newline at end of file diff --git a/ffi-deps/FourQlib/random/random.h b/ffi-deps/FourQlib/random/random.h deleted file mode 100644 index e5f48df..0000000 --- a/ffi-deps/FourQlib/random/random.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef __RANDOM_H__ -#define __RANDOM_H__ - - -// For C++ -#ifdef __cplusplus -extern "C" { -#endif - - -// Generate random bytes and output the result to random_array -int random_bytes(unsigned char* random_array, unsigned int nbytes); - - -#ifdef __cplusplus -} -#endif - - -#endif diff --git a/ffi-deps/FourQlib/sha512/sha512.c b/ffi-deps/FourQlib/sha512/sha512.c deleted file mode 100644 index 92b1f7e..0000000 --- a/ffi-deps/FourQlib/sha512/sha512.c +++ /dev/null @@ -1,306 +0,0 @@ -/* -20080913 -D. J. Bernstein -Public domain. 
-*/ - -#include "sha512.h" - -typedef unsigned long long uint64; - -static uint64 load_bigendian(const unsigned char *x) -{ - return - (uint64) (x[7]) \ - | (((uint64) (x[6])) << 8) \ - | (((uint64) (x[5])) << 16) \ - | (((uint64) (x[4])) << 24) \ - | (((uint64) (x[3])) << 32) \ - | (((uint64) (x[2])) << 40) \ - | (((uint64) (x[1])) << 48) \ - | (((uint64) (x[0])) << 56) - ; -} - -static void store_bigendian(unsigned char *x,uint64 u) -{ - x[7] = (unsigned char)u; u >>= 8; - x[6] = (unsigned char)u; u >>= 8; - x[5] = (unsigned char)u; u >>= 8; - x[4] = (unsigned char)u; u >>= 8; - x[3] = (unsigned char)u; u >>= 8; - x[2] = (unsigned char)u; u >>= 8; - x[1] = (unsigned char)u; u >>= 8; - x[0] = (unsigned char)u; -} - -#define SHR(x,c) ((x) >> (c)) -#define ROTR(x,c) (((x) >> (c)) | ((x) << (64 - (c)))) - -#define Ch(x,y,z) ((x & y) ^ (~x & z)) -#define Maj(x,y,z) ((x & y) ^ (x & z) ^ (y & z)) -#define Sigma0(x) (ROTR(x,28) ^ ROTR(x,34) ^ ROTR(x,39)) -#define Sigma1(x) (ROTR(x,14) ^ ROTR(x,18) ^ ROTR(x,41)) -#define sigma0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x,7)) -#define sigma1(x) (ROTR(x,19) ^ ROTR(x,61) ^ SHR(x,6)) - -#define M(w0,w14,w9,w1) w0 = sigma1(w14) + w9 + sigma0(w1) + w0; - -#define EXPAND \ - M(w0 ,w14,w9 ,w1 ) \ - M(w1 ,w15,w10,w2 ) \ - M(w2 ,w0 ,w11,w3 ) \ - M(w3 ,w1 ,w12,w4 ) \ - M(w4 ,w2 ,w13,w5 ) \ - M(w5 ,w3 ,w14,w6 ) \ - M(w6 ,w4 ,w15,w7 ) \ - M(w7 ,w5 ,w0 ,w8 ) \ - M(w8 ,w6 ,w1 ,w9 ) \ - M(w9 ,w7 ,w2 ,w10) \ - M(w10,w8 ,w3 ,w11) \ - M(w11,w9 ,w4 ,w12) \ - M(w12,w10,w5 ,w13) \ - M(w13,w11,w6 ,w14) \ - M(w14,w12,w7 ,w15) \ - M(w15,w13,w8 ,w0 ) - -#define F(w,k) \ - T1 = h + Sigma1(e) + Ch(e,f,g) + k + w; \ - T2 = Sigma0(a) + Maj(a,b,c); \ - h = g; \ - g = f; \ - f = e; \ - e = d + T1; \ - d = c; \ - c = b; \ - b = a; \ - a = T1 + T2; - -static int crypto_hashblocks_sha512(unsigned char *statebytes,const unsigned char *in,unsigned long long inlen) -{ - uint64 state[8]; - uint64 a; - uint64 b; - uint64 c; - uint64 d; - uint64 e; - uint64 f; - uint64 g; - uint64 h; - uint64 T1; - uint64 T2; - - a = load_bigendian(statebytes + 0); state[0] = a; - b = load_bigendian(statebytes + 8); state[1] = b; - c = load_bigendian(statebytes + 16); state[2] = c; - d = load_bigendian(statebytes + 24); state[3] = d; - e = load_bigendian(statebytes + 32); state[4] = e; - f = load_bigendian(statebytes + 40); state[5] = f; - g = load_bigendian(statebytes + 48); state[6] = g; - h = load_bigendian(statebytes + 56); state[7] = h; - - while (inlen >= 128) { - uint64 w0 = load_bigendian(in + 0); - uint64 w1 = load_bigendian(in + 8); - uint64 w2 = load_bigendian(in + 16); - uint64 w3 = load_bigendian(in + 24); - uint64 w4 = load_bigendian(in + 32); - uint64 w5 = load_bigendian(in + 40); - uint64 w6 = load_bigendian(in + 48); - uint64 w7 = load_bigendian(in + 56); - uint64 w8 = load_bigendian(in + 64); - uint64 w9 = load_bigendian(in + 72); - uint64 w10 = load_bigendian(in + 80); - uint64 w11 = load_bigendian(in + 88); - uint64 w12 = load_bigendian(in + 96); - uint64 w13 = load_bigendian(in + 104); - uint64 w14 = load_bigendian(in + 112); - uint64 w15 = load_bigendian(in + 120); - - F(w0 ,0x428a2f98d728ae22ULL) - F(w1 ,0x7137449123ef65cdULL) - F(w2 ,0xb5c0fbcfec4d3b2fULL) - F(w3 ,0xe9b5dba58189dbbcULL) - F(w4 ,0x3956c25bf348b538ULL) - F(w5 ,0x59f111f1b605d019ULL) - F(w6 ,0x923f82a4af194f9bULL) - F(w7 ,0xab1c5ed5da6d8118ULL) - F(w8 ,0xd807aa98a3030242ULL) - F(w9 ,0x12835b0145706fbeULL) - F(w10,0x243185be4ee4b28cULL) - F(w11,0x550c7dc3d5ffb4e2ULL) - F(w12,0x72be5d74f27b896fULL) - 
F(w13,0x80deb1fe3b1696b1ULL) - F(w14,0x9bdc06a725c71235ULL) - F(w15,0xc19bf174cf692694ULL) - - EXPAND - - F(w0 ,0xe49b69c19ef14ad2ULL) - F(w1 ,0xefbe4786384f25e3ULL) - F(w2 ,0x0fc19dc68b8cd5b5ULL) - F(w3 ,0x240ca1cc77ac9c65ULL) - F(w4 ,0x2de92c6f592b0275ULL) - F(w5 ,0x4a7484aa6ea6e483ULL) - F(w6 ,0x5cb0a9dcbd41fbd4ULL) - F(w7 ,0x76f988da831153b5ULL) - F(w8 ,0x983e5152ee66dfabULL) - F(w9 ,0xa831c66d2db43210ULL) - F(w10,0xb00327c898fb213fULL) - F(w11,0xbf597fc7beef0ee4ULL) - F(w12,0xc6e00bf33da88fc2ULL) - F(w13,0xd5a79147930aa725ULL) - F(w14,0x06ca6351e003826fULL) - F(w15,0x142929670a0e6e70ULL) - - EXPAND - - F(w0 ,0x27b70a8546d22ffcULL) - F(w1 ,0x2e1b21385c26c926ULL) - F(w2 ,0x4d2c6dfc5ac42aedULL) - F(w3 ,0x53380d139d95b3dfULL) - F(w4 ,0x650a73548baf63deULL) - F(w5 ,0x766a0abb3c77b2a8ULL) - F(w6 ,0x81c2c92e47edaee6ULL) - F(w7 ,0x92722c851482353bULL) - F(w8 ,0xa2bfe8a14cf10364ULL) - F(w9 ,0xa81a664bbc423001ULL) - F(w10,0xc24b8b70d0f89791ULL) - F(w11,0xc76c51a30654be30ULL) - F(w12,0xd192e819d6ef5218ULL) - F(w13,0xd69906245565a910ULL) - F(w14,0xf40e35855771202aULL) - F(w15,0x106aa07032bbd1b8ULL) - - EXPAND - - F(w0 ,0x19a4c116b8d2d0c8ULL) - F(w1 ,0x1e376c085141ab53ULL) - F(w2 ,0x2748774cdf8eeb99ULL) - F(w3 ,0x34b0bcb5e19b48a8ULL) - F(w4 ,0x391c0cb3c5c95a63ULL) - F(w5 ,0x4ed8aa4ae3418acbULL) - F(w6 ,0x5b9cca4f7763e373ULL) - F(w7 ,0x682e6ff3d6b2b8a3ULL) - F(w8 ,0x748f82ee5defb2fcULL) - F(w9 ,0x78a5636f43172f60ULL) - F(w10,0x84c87814a1f0ab72ULL) - F(w11,0x8cc702081a6439ecULL) - F(w12,0x90befffa23631e28ULL) - F(w13,0xa4506cebde82bde9ULL) - F(w14,0xbef9a3f7b2c67915ULL) - F(w15,0xc67178f2e372532bULL) - - EXPAND - - F(w0 ,0xca273eceea26619cULL) - F(w1 ,0xd186b8c721c0c207ULL) - F(w2 ,0xeada7dd6cde0eb1eULL) - F(w3 ,0xf57d4f7fee6ed178ULL) - F(w4 ,0x06f067aa72176fbaULL) - F(w5 ,0x0a637dc5a2c898a6ULL) - F(w6 ,0x113f9804bef90daeULL) - F(w7 ,0x1b710b35131c471bULL) - F(w8 ,0x28db77f523047d84ULL) - F(w9 ,0x32caab7b40c72493ULL) - F(w10,0x3c9ebe0a15c9bebcULL) - F(w11,0x431d67c49c100d4cULL) - F(w12,0x4cc5d4becb3e42b6ULL) - F(w13,0x597f299cfc657e2aULL) - F(w14,0x5fcb6fab3ad6faecULL) - F(w15,0x6c44198c4a475817ULL) - - a += state[0]; - b += state[1]; - c += state[2]; - d += state[3]; - e += state[4]; - f += state[5]; - g += state[6]; - h += state[7]; - - state[0] = a; - state[1] = b; - state[2] = c; - state[3] = d; - state[4] = e; - state[5] = f; - state[6] = g; - state[7] = h; - - in += 128; - inlen -= 128; - } - - store_bigendian(statebytes + 0,state[0]); - store_bigendian(statebytes + 8,state[1]); - store_bigendian(statebytes + 16,state[2]); - store_bigendian(statebytes + 24,state[3]); - store_bigendian(statebytes + 32,state[4]); - store_bigendian(statebytes + 40,state[5]); - store_bigendian(statebytes + 48,state[6]); - store_bigendian(statebytes + 56,state[7]); - - return (int)inlen; -} - -static const unsigned char iv[64] = { - 0x6a,0x09,0xe6,0x67,0xf3,0xbc,0xc9,0x08, - 0xbb,0x67,0xae,0x85,0x84,0xca,0xa7,0x3b, - 0x3c,0x6e,0xf3,0x72,0xfe,0x94,0xf8,0x2b, - 0xa5,0x4f,0xf5,0x3a,0x5f,0x1d,0x36,0xf1, - 0x51,0x0e,0x52,0x7f,0xad,0xe6,0x82,0xd1, - 0x9b,0x05,0x68,0x8c,0x2b,0x3e,0x6c,0x1f, - 0x1f,0x83,0xd9,0xab,0xfb,0x41,0xbd,0x6b, - 0x5b,0xe0,0xcd,0x19,0x13,0x7e,0x21,0x79 -} ; - -typedef unsigned long long uint64; - -int crypto_sha512(const unsigned char *in, unsigned long long inlen, unsigned char *out) -{ - unsigned char h[64]; - unsigned char padded[256]; - int i; - unsigned long long bytes = inlen; - - for (i = 0;i < 64;++i) h[i] = iv[i]; - - crypto_hashblocks_sha512(h,in,inlen); - in += inlen; - inlen &= 127; - in 
-= inlen; - - for (i = 0;i < inlen;++i) padded[i] = in[i]; - padded[inlen] = 0x80; - - if (inlen < 112) { - for (i = (int)inlen + 1;i < 119;++i) padded[i] = 0; - padded[119] = (unsigned char)(bytes >> 61); - padded[120] = (unsigned char)(bytes >> 53); - padded[121] = (unsigned char)(bytes >> 45); - padded[122] = (unsigned char)(bytes >> 37); - padded[123] = (unsigned char)(bytes >> 29); - padded[124] = (unsigned char)(bytes >> 21); - padded[125] = (unsigned char)(bytes >> 13); - padded[126] = (unsigned char)(bytes >> 5); - padded[127] = (unsigned char)(bytes << 3); - crypto_hashblocks_sha512(h,padded,128); - } else { - for (i = (int)inlen + 1;i < 247;++i) padded[i] = 0; - padded[247] = (unsigned char)(bytes >> 61); - padded[248] = (unsigned char)(bytes >> 53); - padded[249] = (unsigned char)(bytes >> 45); - padded[250] = (unsigned char)(bytes >> 37); - padded[251] = (unsigned char)(bytes >> 29); - padded[252] = (unsigned char)(bytes >> 21); - padded[253] = (unsigned char)(bytes >> 13); - padded[254] = (unsigned char)(bytes >> 5); - padded[255] = (unsigned char)(bytes << 3); - crypto_hashblocks_sha512(h,padded,256); - } - - for (i = 0;i < 64;++i) out[i] = h[i]; - - return 0; -} diff --git a/ffi-deps/FourQlib/sha512/sha512.h b/ffi-deps/FourQlib/sha512/sha512.h deleted file mode 100644 index 4ac8bd8..0000000 --- a/ffi-deps/FourQlib/sha512/sha512.h +++ /dev/null @@ -1,20 +0,0 @@ -#ifndef __SHA512_H__ -#define __SHA512_H__ - - -// For C++ -#ifdef __cplusplus -extern "C" { -#endif - - -// Hashing using SHA-512. Output is 64 bytes long -int crypto_sha512(const unsigned char *in, unsigned long long inlen, unsigned char *out); - - -#ifdef __cplusplus -} -#endif - - -#endif diff --git a/ffi-deps/K12/README.markdown b/ffi-deps/K12/README.markdown deleted file mode 100644 index 4a85e1b..0000000 --- a/ffi-deps/K12/README.markdown +++ /dev/null @@ -1,84 +0,0 @@ -# What is KangarooTwelve ? - -[**KangarooTwelve**][k12] (or **K12**) is a fast and secure extendable-output function (XOF), the generalization of hash functions to arbitrary output lengths. -Derived from Keccak, it aims at higher speeds than FIPS 202's SHA-3 and SHAKE functions, while retaining their flexibility and basis of security. - -On high-end platforms, it can exploit a high degree of parallelism, whether using multiple cores or the single-instruction multiple-data (SIMD) instruction set of modern processors. -On Intel's Haswell and Skylake architectures, KangarooTwelve tops at less than 1.5 cycles/byte for long messages on a single core, and at 0.51 cycles/byte on the SkylakeX and Cascade Lake architectures. -On the latest Apple A14 and M1 processors, KangarooTwelve can take advantage of the ARMv8-A's SHA-3 dedicated instructions to deliver 0.75 cycles/byte for long messages on a single core. -On low-end platforms, as well as for short messages, it also benefits from about a factor two speed-up compared to the fastest FIPS 202 instance SHAKE128. - -More details can be found in our [ACNS Paper][eprint]. - -# What can I find here? - -This repository contains source code that implements the extandable output (or hash) function [**KangarooTwelve**][k12] (or **K12**). -Its purpose is to offer optimized implementations of K12 and nothing else. - -The code comes from the [**eXtended Keccak Code Package**][xkcp] (or **XKCP**), after much trimming to keep only what is needed for K12. -It is still structured like the XKCP in two layers. 
The lower layer implements the permutation Keccak-_p_[1600, 12] and possibly parallel versions thereof, whereas the higher layer implements the sponge construction and the K12 tree hash mode. -Also, some sources have been merged to reduce the file count. - -* For the higher layer, we kept only the code needed for K12. -* For the lower layer, we removed all the functions that are not needed for K12. The lower layer therefore implements a subset of the SnP and PlSnP interfaces. - -For Keccak or Xoodoo-based functions other than K12 only, it is recommended to use the XKCP itself instead and not to mix both this repository and the XKCP. - - -# Is there a tool to compute the K12 hash of a file? - -Not in this repository, but Jack O'Connor's [`kangarootwelve_xkcp.rs` repository](https://github.com/oconnor663/kangarootwelve_xkcp.rs) contains Rust bindings to this code and a `k12sum` utility. -Pre-built binaries can be found [there](https://github.com/oconnor663/kangarootwelve_xkcp.rs/releases). - - -# How can I build this K12 code? - -This repository uses the same build system as that of the XKCP. -To build, the following tools are needed: - -* *GCC* -* *GNU make* -* *xsltproc* - -The different targets are defined in [`Makefile.build`](Makefile.build). This file is expanded into a regular makefile using *xsltproc*. To use it, simply type, e.g., - -``` -make generic64/K12Tests -``` - -to build K12Tests generically optimized for 64-bit platforms. The name before the slash indicates the platform, while the part after the slash is the executable to build. As another example, the static (resp. dynamic) library is built by typing `make generic64/libK12.a` (resp. `.so`) or similarly with `generic64` replaced with the appropriate platform name. An alternate C compiler can be specified via the `CC` environment variable. - -Instead of building an executable with *GCC*, one can choose to select the files needed and make a package. For this, simply append `.pack` to the target name, e.g., - -``` -make generic64/K12Tests.pack -``` - -This creates a `.tar.gz` archive with all the necessary files to build the given target. - -The list of targets can be found at the end of [`Makefile.build`](Makefile.build) or by running `make` without parameters. - -## Microsoft Visual Studio support - -KangarooTwelve can be compiled with Microsoft Visual Studio (MSVC). The XKCP build system offers support for the creation of project files. To get a project file for a given target, simply append `.vcxproj` to the target name, e.g., - -``` -make generic64noAsm/K12Tests.vcxproj -``` - -The targets `generic32` and `generic64noAsm` can be used with MSVC, but not `generic64` as it contains assembly implementations in the GCC syntax, which at this point cannot be used with MSVC. -Please refer to the documention of [XKCP][xkcp] for more details on the limitations of the support of MSVC. 
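Editor's note: to make the extendable-output interface described at the top of this README concrete, here is a minimal caller sketch. It assumes the single-call `KangarooTwelve()` prototype exposed by the library's `KangarooTwelve.h` header, which is not shown in this patch excerpt; treat the header name, signature, and return convention as assumptions.

```c
#include <stdio.h>
#include "KangarooTwelve.h"  // header name and prototype assumed, not shown in this excerpt

int main(void)
{
    const unsigned char msg[] = "hello";
    unsigned char out[32];   // any output length is allowed: K12 is an XOF

    // Hash 5 input bytes into 32 output bytes, with an empty customization string.
    if (KangarooTwelve(msg, 5, out, sizeof(out), (const unsigned char *)"", 0) != 0)
        return 1;

    for (size_t i = 0; i < sizeof(out); i++) printf("%02x", out[i]);
    printf("\n");
    return 0;
}
```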
- -[k12]: https://keccak.team/kangarootwelve.html -[xkcp]: https://github.com/XKCP/XKCP -[eprint]: https://eprint.iacr.org/2016/770.pdf - - -# Acknowledgments - -We wish to thank: - -- Andy Polyakov for his expertise with the ARMv8-A+SHA3 code, and in particular for his core routine from [CRYPTOGAMS](https://github.com/dot-asm/cryptogams) -- Duc Tri Nguyen for his benchmark on the Apple M1 -- Jack O'Connor for bug fixes and more importantly for his [Rust bindings](https://github.com/oconnor663/kangarootwelve_xkcp.rs) -- Kent Ross for his contributions to this code and its quality diff --git a/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S deleted file mode 100644 index 09aa0d2..0000000 --- a/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-ARMv8Asha3.S +++ /dev/null @@ -1,623 +0,0 @@ -# K12 based on the eXtended Keccak Code Package (XKCP) -# https://github.com/XKCP/XKCP -# -# The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. -# -# Implementation by Gilles Van Assche, hereby denoted as "the implementer". -# Core subroutine is based on one by Andy Polyakov, available -# at https://github.com/dot-asm/cryptogams. Used with permission. -# -# For more information, feedback or questions, please refer to the Keccak Team website: -# https://keccak.team/ -# -# To the extent possible under law, the implementer has waived all copyright -# and related or neighboring rights to the source code in this file. -# http://creativecommons.org/publicdomain/zero/1.0/ - -.text - -.balign 64 // strategic alignment and padding that allows to use - // address value as loop termination condition... - .quad 0,0,0,0,0,0,0,0 -.ifdef macOS -.else -.type iotas,%object -.endif -iotas: - .quad 0x0000000000000001 - .quad 0x0000000000008082 - .quad 0x800000000000808a - .quad 0x8000000080008000 - .quad 0x000000000000808b - .quad 0x0000000080000001 - .quad 0x8000000080008081 - .quad 0x8000000000008009 - .quad 0x000000000000008a - .quad 0x0000000000000088 - .quad 0x0000000080008009 - .quad 0x000000008000000a -iotas12: - .quad 0x000000008000808b - .quad 0x800000000000008b - .quad 0x8000000000008089 - .quad 0x8000000000008003 - .quad 0x8000000000008002 - .quad 0x8000000000000080 - .quad 0x000000000000800a - .quad 0x800000008000000a - .quad 0x8000000080008081 - .quad 0x8000000000008080 - .quad 0x0000000080000001 - .quad 0x8000000080008008 -.ifdef macOS -.else -.size iotas,.-iotas -.endif - -.ifdef macOS -.else -.type KeccakP1600_ARMv8Asha3_Permute_12rounds_internal,%function -.endif -KeccakP1600_ARMv8Asha3_Permute_12rounds_internal: -.balign 32 - mov x9,#12 - adr x10,iotas12 - b .Loop_ce -.balign 16 -.Loop_ce: - ////////////////////////////////////////////////// Theta - eor3 v25.16b,v20.16b,v15.16b,v10.16b - eor3 v26.16b,v21.16b,v16.16b,v11.16b - eor3 v27.16b,v22.16b,v17.16b,v12.16b - eor3 v28.16b,v23.16b,v18.16b,v13.16b - eor3 v29.16b,v24.16b,v19.16b,v14.16b - eor3 v25.16b,v25.16b, v5.16b,v0.16b - eor3 v26.16b,v26.16b, v6.16b,v1.16b - eor3 v27.16b,v27.16b, v7.16b,v2.16b - eor3 v28.16b,v28.16b, v8.16b,v3.16b - eor3 v29.16b,v29.16b, v9.16b,v4.16b - - rax1 v30.2d,v25.2d,v27.2d // D[1] - rax1 v31.2d,v26.2d,v28.2d // D[2] - rax1 v27.2d,v27.2d,v29.2d // D[3] - rax1 v28.2d,v28.2d,v25.2d // D[4] - rax1 v29.2d,v29.2d,v26.2d // D[0] - - ////////////////////////////////////////////////// Theta+Rho+Pi - xar v25.2d, v1.2d,v30.2d,#64-1 // C[0]=A[2][0] - - xar v1.2d,v6.2d,v30.2d,#64-44 - xar v6.2d,v9.2d,v28.2d,#64-20 - xar 
v9.2d,v22.2d,v31.2d,#64-61 - xar v22.2d,v14.2d,v28.2d,#64-39 - xar v14.2d,v20.2d,v29.2d,#64-18 - - xar v26.2d, v2.2d,v31.2d,#64-62 // C[1]=A[4][0] - - xar v2.2d,v12.2d,v31.2d,#64-43 - xar v12.2d,v13.2d,v27.2d,#64-25 - xar v13.2d,v19.2d,v28.2d,#64-8 - xar v19.2d,v23.2d,v27.2d,#64-56 - xar v23.2d,v15.2d,v29.2d,#64-41 - - xar v15.2d,v4.2d,v28.2d,#64-27 - - xar v28.2d, v24.2d,v28.2d,#64-14 // D[4]=A[0][4] - xar v24.2d,v21.2d,v30.2d,#64-2 - xar v8.2d,v8.2d,v27.2d,#64-55 // A[1][3]=A[4][1] - xar v4.2d,v16.2d,v30.2d,#64-45 // A[0][4]=A[1][3] - xar v16.2d,v5.2d,v29.2d,#64-36 - - xar v5.2d,v3.2d,v27.2d,#64-28 - - eor v0.16b,v0.16b,v29.16b - - xar v27.2d, v18.2d,v27.2d,#64-21 // D[3]=A[0][3] - xar v3.2d,v17.2d,v31.2d,#64-15 // A[0][3]=A[3][3] - xar v30.2d, v11.2d,v30.2d,#64-10 // D[1]=A[3][2] - xar v31.2d, v7.2d,v31.2d,#64-6 // D[2]=A[2][1] - xar v29.2d, v10.2d,v29.2d,#64-3 // D[0]=A[1][2] - - ////////////////////////////////////////////////// Chi+Iota - bcax v20.16b,v26.16b, v22.16b,v8.16b // A[1][3]=A[4][1] - bcax v21.16b,v8.16b,v23.16b,v22.16b // A[1][3]=A[4][1] - bcax v22.16b,v22.16b,v24.16b,v23.16b - bcax v23.16b,v23.16b,v26.16b, v24.16b - bcax v24.16b,v24.16b,v8.16b,v26.16b // A[1][3]=A[4][1] - - ld1r {v26.2d},[x10],#8 - - bcax v17.16b,v30.16b, v19.16b,v3.16b // A[0][3]=A[3][3] - bcax v18.16b,v3.16b,v15.16b,v19.16b // A[0][3]=A[3][3] - bcax v19.16b,v19.16b,v16.16b,v15.16b - bcax v15.16b,v15.16b,v30.16b, v16.16b - bcax v16.16b,v16.16b,v3.16b,v30.16b // A[0][3]=A[3][3] - - bcax v10.16b,v25.16b, v12.16b,v31.16b - bcax v11.16b,v31.16b, v13.16b,v12.16b - bcax v12.16b,v12.16b,v14.16b,v13.16b - bcax v13.16b,v13.16b,v25.16b, v14.16b - bcax v14.16b,v14.16b,v31.16b, v25.16b - - bcax v7.16b,v29.16b, v9.16b,v4.16b // A[0][4]=A[1][3] - bcax v8.16b,v4.16b,v5.16b,v9.16b // A[0][4]=A[1][3] - bcax v9.16b,v9.16b,v6.16b,v5.16b - bcax v5.16b,v5.16b,v29.16b, v6.16b - bcax v6.16b,v6.16b,v4.16b,v29.16b // A[0][4]=A[1][3] - - bcax v3.16b,v27.16b, v0.16b,v28.16b - bcax v4.16b,v28.16b, v1.16b,v0.16b - bcax v0.16b,v0.16b,v2.16b,v1.16b - bcax v1.16b,v1.16b,v27.16b, v2.16b - bcax v2.16b,v2.16b,v28.16b, v27.16b - - eor v0.16b,v0.16b,v26.16b - - subs x9,x9,#1 - bne .Loop_ce - - ret -.ifdef macOS -.else -.size KeccakP1600_ARMv8Asha3_Permute_12rounds_internal,.-KeccakP1600_ARMv8Asha3_Permute_12rounds_internal -.endif - -.ifdef macOS -.globl _KeccakP1600_ARMv8Asha3_Permute_12rounds -_KeccakP1600_ARMv8Asha3_Permute_12rounds: -.else -.globl KeccakP1600_ARMv8Asha3_Permute_12rounds -.type KeccakP1600_ARMv8Asha3_Permute_12rounds,%function -KeccakP1600_ARMv8Asha3_Permute_12rounds: -.endif -.balign 32 - stp x29,x30,[sp,#-80]! 
- add x29,sp,#0 - stp d8,d9,[sp,#16] // per ABI requirement - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] - ldp d0,d1,[x0,#8*0] - ldp d2,d3,[x0,#8*2] - ldp d4,d5,[x0,#8*4] - ldp d6,d7,[x0,#8*6] - ldp d8,d9,[x0,#8*8] - ldp d10,d11,[x0,#8*10] - ldp d12,d13,[x0,#8*12] - ldp d14,d15,[x0,#8*14] - ldp d16,d17,[x0,#8*16] - ldp d18,d19,[x0,#8*18] - ldp d20,d21,[x0,#8*20] - ldp d22,d23,[x0,#8*22] - ldr d24,[x0,#8*24] - bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal - ldr x30,[sp,#8] - stp d0,d1,[x0,#8*0] - stp d2,d3,[x0,#8*2] - stp d4,d5,[x0,#8*4] - stp d6,d7,[x0,#8*6] - stp d8,d9,[x0,#8*8] - stp d10,d11,[x0,#8*10] - stp d12,d13,[x0,#8*12] - stp d14,d15,[x0,#8*14] - stp d16,d17,[x0,#8*16] - stp d18,d19,[x0,#8*18] - stp d20,d21,[x0,#8*20] - stp d22,d23,[x0,#8*22] - str d24,[x0,#8*24] - - ldp d8,d9,[sp,#16] - ldp d10,d11,[sp,#32] - ldp d12,d13,[sp,#48] - ldp d14,d15,[sp,#64] - ldr x29,[sp],#80 - ret -.ifdef macOS -.else -.size KeccakP1600_ARMv8Asha3_Permute_12rounds,.-KeccakP1600_ARMv8Asha3_Permute_12rounds -.endif - -// size_t KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb( -// void *state(x0), -// unsigned int laneCount(x1) = 21, -// const unsigned char *data(x2), -// size_t dataByteLen(x3)) -.ifdef macOS -.globl _KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb -_KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb: -.else -.globl KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb -.type KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb,%function -KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb: -.endif -.balign 32 - stp x29,x30,[sp,#-80]! - add x29,sp,#0 - stp d8,d9,[sp,#16] // per ABI requirement - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] - - ldp d0,d1,[x0,#8*0] - ldp d2,d3,[x0,#8*2] - ldp d4,d5,[x0,#8*4] - ldp d6,d7,[x0,#8*6] - ldp d8,d9,[x0,#8*8] - ldp d10,d11,[x0,#8*10] - ldp d12,d13,[x0,#8*12] - ldp d14,d15,[x0,#8*14] - ldp d16,d17,[x0,#8*16] - ldp d18,d19,[x0,#8*18] - ldp d20,d21,[x0,#8*20] - ldp d22,d23,[x0,#8*22] - ldr d24,[x0,#8*24] - - // Prepare the return value - mov x11, #0 - b .KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_loop - -.balign 16 -.KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_loop: - subs x3, x3, #8*21 - b.cc .KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_end - - // Lanes 0-3 - ld1 {v27.8b-v30.8b}, [x2], #32 - eor v0.16b, v0.16b, v27.16b - eor v1.16b, v1.16b, v28.16b - eor v2.16b, v2.16b, v29.16b - eor v3.16b, v3.16b, v30.16b - - // Lanes 4-7 - ld1 {v27.8b-v30.8b}, [x2], #32 - eor v4.16b, v4.16b, v27.16b - eor v5.16b, v5.16b, v28.16b - eor v6.16b, v6.16b, v29.16b - eor v7.16b, v7.16b, v30.16b - - // Lanes 8-11 - ld1 {v27.8b-v30.8b}, [x2], #32 - eor v8.16b, v8.16b, v27.16b - eor v9.16b, v9.16b, v28.16b - eor v10.16b, v10.16b, v29.16b - eor v11.16b, v11.16b, v30.16b - - // Lanes 12-15 - ld1 {v27.8b-v30.8b}, [x2], #32 - eor v12.16b, v12.16b, v27.16b - eor v13.16b, v13.16b, v28.16b - eor v14.16b, v14.16b, v29.16b - eor v15.16b, v15.16b, v30.16b - - // Lanes 16-20 - ld1 {v27.8b-v30.8b}, [x2], #32 - eor v16.16b, v16.16b, v27.16b - eor v17.16b, v17.16b, v28.16b - eor v18.16b, v18.16b, v29.16b - eor v19.16b, v19.16b, v30.16b - ld1 {v27.8b}, [x2], #8 - eor v20.16b, v20.16b, v27.16b - - bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal - - add x11, x11, #8*21 - - b .KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_loop -.KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb_end: - - stp d0,d1,[x0,#8*0] - stp d2,d3,[x0,#8*2] - stp d4,d5,[x0,#8*4] - stp d6,d7,[x0,#8*6] - stp d8,d9,[x0,#8*8] - stp d10,d11,[x0,#8*10] - stp 
d12,d13,[x0,#8*12] - stp d14,d15,[x0,#8*14] - stp d16,d17,[x0,#8*16] - stp d18,d19,[x0,#8*18] - stp d20,d21,[x0,#8*20] - stp d22,d23,[x0,#8*22] - str d24,[x0,#8*24] - - mov x0, x11 - - ldr x30,[sp,#8] - ldp d8,d9,[sp,#16] - ldp d10,d11,[sp,#32] - ldp d12,d13,[sp,#48] - ldp d14,d15,[sp,#64] - ldr x29,[sp],#80 - - ret -.ifdef macOS -.else -.size KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb,.-KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb -.endif - -.ifdef macOS -.globl _KeccakP1600times2_ARMv8Asha3_Permute_12rounds -_KeccakP1600times2_ARMv8Asha3_Permute_12rounds: -.else -.globl KeccakP1600times2_ARMv8Asha3_Permute_12rounds -.type KeccakP1600times2_ARMv8Asha3_Permute_12rounds,%function -KeccakP1600times2_ARMv8Asha3_Permute_12rounds: -.endif -.balign 32 - stp x29,x30,[sp,#-80]! - add x29,sp,#0 - stp d8,d9,[sp,#16] // per ABI requirement - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] - - ld1 { v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64 - ld1 { v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64 - ld1 { v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64 - ld1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64 - ld1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64 - ld1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64 - ld1 {v24.2d}, [x0] - sub x0, x0, #64*6 - - bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal - - ldr x30,[sp,#8] - st1 { v0.2d, v1.2d, v2.2d, v3.2d}, [x0], #64 - st1 { v4.2d, v5.2d, v6.2d, v7.2d}, [x0], #64 - st1 { v8.2d, v9.2d, v10.2d, v11.2d}, [x0], #64 - st1 {v12.2d, v13.2d, v14.2d, v15.2d}, [x0], #64 - st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0], #64 - st1 {v20.2d, v21.2d, v22.2d, v23.2d}, [x0], #64 - st1 {v24.2d}, [x0] - - ldp d8,d9,[sp,#16] - ldp d10,d11,[sp,#32] - ldp d12,d13,[sp,#48] - ldp d14,d15,[sp,#64] - ldr x29,[sp],#80 - - ret -.ifdef macOS -.else -.size KeccakP1600times2_ARMv8Asha3_Permute_12rounds,.-KeccakP1600times2_ARMv8Asha3_Permute_12rounds -.endif - -.ifdef macOS -.globl _KangarooTwelve_ARMv8Asha3_Process2Leaves -_KangarooTwelve_ARMv8Asha3_Process2Leaves: -.else -.globl KangarooTwelve_ARMv8Asha3_Process2Leaves -.type KangarooTwelve_ARMv8Asha3_Process2Leaves,%function -KangarooTwelve_ARMv8Asha3_Process2Leaves: -.endif -.balign 32 - stp x29,x30,[sp,#-80]! 
- add x29,sp,#0 - stp d8,d9,[sp,#16] // per ABI requirement - stp d10,d11,[sp,#32] - stp d12,d13,[sp,#48] - stp d14,d15,[sp,#64] - - movi v0.2d, #0 - movi v1.2d, #0 - movi v2.2d, #0 - movi v3.2d, #0 - movi v4.2d, #0 - movi v5.2d, #0 - movi v6.2d, #0 - movi v7.2d, #0 - movi v8.2d, #0 - movi v9.2d, #0 - movi v10.2d, #0 - movi v11.2d, #0 - movi v12.2d, #0 - movi v13.2d, #0 - movi v14.2d, #0 - movi v15.2d, #0 - movi v16.2d, #0 - movi v17.2d, #0 - movi v18.2d, #0 - movi v19.2d, #0 - movi v20.2d, #0 - movi v21.2d, #0 - movi v22.2d, #0 - movi v23.2d, #0 - movi v24.2d, #0 - - // x12 is input + chunkSize - add x12, x0, #8192 - - // Loop over the first 48 blocks - mov x11, 48 - b .KangarooTwelve_ARMv8Asha3_Process2Leaves_blocks -.KangarooTwelve_ARMv8Asha3_Process2Leaves_blocks: - - // Lanes 0-3 - ld1 {v25.1d-v28.1d}, [x0], #32 - ld1 {v25.d}[1], [x12], #8 - ld1 {v26.d}[1], [x12], #8 - ld1 {v27.d}[1], [x12], #8 - ld1 {v28.d}[1], [x12], #8 -#ifdef __AARCH64EB__ - rev64 v25.16b, v25.16b - rev64 v26.16b, v26.16b - rev64 v27.16b, v27.16b - rev64 v28.16b, v28.16b -#endif - eor v0.16b, v0.16b, v25.16b - eor v1.16b, v1.16b, v26.16b - eor v2.16b, v2.16b, v27.16b - eor v3.16b, v3.16b, v28.16b - - // Lanes 4-7 - ld1 {v25.1d-v28.1d}, [x0], #32 - ld1 {v25.d}[1], [x12], #8 - ld1 {v26.d}[1], [x12], #8 - ld1 {v27.d}[1], [x12], #8 - ld1 {v28.d}[1], [x12], #8 -#ifdef __AARCH64EB__ - rev64 v25.16b, v25.16b - rev64 v26.16b, v26.16b - rev64 v27.16b, v27.16b - rev64 v28.16b, v28.16b -#endif - eor v4.16b, v4.16b, v25.16b - eor v5.16b, v5.16b, v26.16b - eor v6.16b, v6.16b, v27.16b - eor v7.16b, v7.16b, v28.16b - - // Lanes 8-11 - ld1 {v25.1d-v28.1d}, [x0], #32 - ld1 {v25.d}[1], [x12], #8 - ld1 {v26.d}[1], [x12], #8 - ld1 {v27.d}[1], [x12], #8 - ld1 {v28.d}[1], [x12], #8 -#ifdef __AARCH64EB__ - rev64 v25.16b, v25.16b - rev64 v26.16b, v26.16b - rev64 v27.16b, v27.16b - rev64 v28.16b, v28.16b -#endif - eor v8.16b, v8.16b, v25.16b - eor v9.16b, v9.16b, v26.16b - eor v10.16b, v10.16b, v27.16b - eor v11.16b, v11.16b, v28.16b - - // Lanes 12-15 - ld1 {v25.1d-v28.1d}, [x0], #32 - ld1 {v25.d}[1], [x12], #8 - ld1 {v26.d}[1], [x12], #8 - ld1 {v27.d}[1], [x12], #8 - ld1 {v28.d}[1], [x12], #8 -#ifdef __AARCH64EB__ - rev64 v25.16b, v25.16b - rev64 v26.16b, v26.16b - rev64 v27.16b, v27.16b - rev64 v28.16b, v28.16b -#endif - eor v12.16b, v12.16b, v25.16b - eor v13.16b, v13.16b, v26.16b - eor v14.16b, v14.16b, v27.16b - eor v15.16b, v15.16b, v28.16b - - // Lanes 16-20 - ld1 {v25.1d-v28.1d}, [x0], #32 - ld1 {v25.d}[1], [x12], #8 - ld1 {v26.d}[1], [x12], #8 - ld1 {v27.d}[1], [x12], #8 - ld1 {v28.d}[1], [x12], #8 - ld1 {v29.d}[0], [x0], #8 - ld1 {v29.d}[1], [x12], #8 -#ifdef __AARCH64EB__ - rev64 v25.16b, v25.16b - rev64 v26.16b, v26.16b - rev64 v27.16b, v27.16b - rev64 v28.16b, v28.16b - rev64 v29.16b, v29.16b -#endif - eor v16.16b, v16.16b, v25.16b - eor v17.16b, v17.16b, v26.16b - eor v18.16b, v18.16b, v27.16b - eor v19.16b, v19.16b, v28.16b - eor v20.16b, v20.16b, v29.16b - - bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal - - subs x11, x11, #1 - bne .KangarooTwelve_ARMv8Asha3_Process2Leaves_blocks - - // Lanes 0-3 - ld1 {v25.1d-v28.1d}, [x0], #32 - ld1 {v25.d}[1], [x12], #8 - ld1 {v26.d}[1], [x12], #8 - ld1 {v27.d}[1], [x12], #8 - ld1 {v28.d}[1], [x12], #8 -#ifdef __AARCH64EB__ - rev64 v25.16b, v25.16b - rev64 v26.16b, v26.16b - rev64 v27.16b, v27.16b - rev64 v28.16b, v28.16b -#endif - eor v0.16b, v0.16b, v25.16b - eor v1.16b, v1.16b, v26.16b - eor v2.16b, v2.16b, v27.16b - eor v3.16b, v3.16b, v28.16b - - // Lanes 4-7 - ld1 
{v25.1d-v28.1d}, [x0], #32 - ld1 {v25.d}[1], [x12], #8 - ld1 {v26.d}[1], [x12], #8 - ld1 {v27.d}[1], [x12], #8 - ld1 {v28.d}[1], [x12], #8 -#ifdef __AARCH64EB__ - rev64 v25.16b, v25.16b - rev64 v26.16b, v26.16b - rev64 v27.16b, v27.16b - rev64 v28.16b, v28.16b -#endif - eor v4.16b, v4.16b, v25.16b - eor v5.16b, v5.16b, v26.16b - eor v6.16b, v6.16b, v27.16b - eor v7.16b, v7.16b, v28.16b - - // Lanes 8-11 - ld1 {v25.1d-v28.1d}, [x0], #32 - ld1 {v25.d}[1], [x12], #8 - ld1 {v26.d}[1], [x12], #8 - ld1 {v27.d}[1], [x12], #8 - ld1 {v28.d}[1], [x12], #8 -#ifdef __AARCH64EB__ - rev64 v25.16b, v25.16b - rev64 v26.16b, v26.16b - rev64 v27.16b, v27.16b - rev64 v28.16b, v28.16b -#endif - eor v8.16b, v8.16b, v25.16b - eor v9.16b, v9.16b, v26.16b - eor v10.16b, v10.16b, v27.16b - eor v11.16b, v11.16b, v28.16b - - // Lanes 12-15 - ld1 {v25.1d-v28.1d}, [x0], #32 - ld1 {v25.d}[1], [x12], #8 - ld1 {v26.d}[1], [x12], #8 - ld1 {v27.d}[1], [x12], #8 - ld1 {v28.d}[1], [x12], #8 -#ifdef __AARCH64EB__ - rev64 v25.16b, v25.16b - rev64 v26.16b, v26.16b - rev64 v27.16b, v27.16b - rev64 v28.16b, v28.16b -#endif - eor v12.16b, v12.16b, v25.16b - eor v13.16b, v13.16b, v26.16b - eor v14.16b, v14.16b, v27.16b - eor v15.16b, v15.16b, v28.16b - - mov x13, #0x0B - dup v25.2d, x13 - mov x13, #0x8000000000000000 - dup v26.2d, x13 - eor v16.16b, v16.16b, v25.16b - eor v20.16b, v20.16b, v26.16b - - bl KeccakP1600_ARMv8Asha3_Permute_12rounds_internal - - st1 {v0.1d-v3.1d}, [x1], #32 - st1 {v0.d}[1], [x1], #8 - st1 {v1.d}[1], [x1], #8 - st1 {v2.d}[1], [x1], #8 - st1 {v3.d}[1], [x1], #8 - - ldr x30,[sp,#8] - ldp d8,d9,[sp,#16] - ldp d10,d11,[sp,#32] - ldp d12,d13,[sp,#48] - ldp d14,d15,[sp,#64] - ldr x29,[sp],#80 - - ret -.ifdef macOS -.else -.size KangarooTwelve_ARMv8Asha3_Process2Leaves,.-KangarooTwelve_ARMv8Asha3_Process2Leaves -.endif diff --git a/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-SnP.h b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-SnP.h deleted file mode 100644 index 512eca3..0000000 --- a/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-SnP.h +++ /dev/null @@ -1,65 +0,0 @@ -/* -K12 based on the eXtended Keccak Code Package (XKCP) -https://github.com/XKCP/XKCP - -The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. - -Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to the Keccak Team website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -Please refer to the XKCP for more details. 
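This header wires the generic KeccakP1600_* entry points to the ARMv8-A+SHA3
assembly for the 12-round permutation and the fast absorb loop, while the
byte-level state I/O (Initialize/AddByte/AddBytes/ExtractBytes) falls back to
the portable opt64 C helpers declared below. It also exposes the x2 parallel
permutation and the two-leaf KangarooTwelve routine.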
-*/
-
-#ifndef _KeccakP_1600_SnP_h_
-#define _KeccakP_1600_SnP_h_
-
-/* Keccak-p[1600] */
-
-#define KeccakP1600_stateSizeInBytes 200
-#define KeccakP1600_stateAlignment 8
-#define KeccakP1600_12rounds_FastLoop_supported
-
-const char * KeccakP1600_GetImplementation();
-void KeccakP1600_opt64_Initialize(void *state);
-void KeccakP1600_opt64_AddByte(void *state, unsigned char data, unsigned int offset);
-void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length);
-void KeccakP1600_ARMv8Asha3_Permute_12rounds(void *state);
-void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length);
-size_t KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen);
-
-#define KeccakP1600_Initialize KeccakP1600_opt64_Initialize
-#define KeccakP1600_AddByte KeccakP1600_opt64_AddByte
-#define KeccakP1600_AddBytes KeccakP1600_opt64_AddBytes
-#define KeccakP1600_Permute_12rounds KeccakP1600_ARMv8Asha3_Permute_12rounds
-#define KeccakP1600_ExtractBytes KeccakP1600_opt64_ExtractBytes
-#define KeccakP1600_12rounds_FastLoop_Absorb KeccakP1600_ARMv8Asha3_12rounds_FastLoop_Absorb
-
-/* Keccak-p[1600]×2 */
-
-int KeccakP1600times2_IsAvailable();
-const char * KeccakP1600times2_GetImplementation();
-void KeccakP1600times2_ARMv8Asha3_Permute_12rounds(void *state);
-void KangarooTwelve_ARMv8Asha3_Process2Leaves(const unsigned char *input, unsigned char *output);
-
-#define KeccakP1600times2_Permute_12rounds KeccakP1600times2_ARMv8Asha3_Permute_12rounds
-#define KangarooTwelve_Process2Leaves KangarooTwelve_ARMv8Asha3_Process2Leaves
-
-/* Keccak-p[1600]×4 */
-
-int KeccakP1600times4_IsAvailable();
-const char * KeccakP1600times4_GetImplementation();
-
-/* Keccak-p[1600]×8 */
-
-int KeccakP1600times8_IsAvailable();
-const char * KeccakP1600times8_GetImplementation();
-
-#endif
diff --git a/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-opt64.c b/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-opt64.c
deleted file mode 100644
index 7228d7a..0000000
--- a/ffi-deps/K12/lib/ARMv8Asha3/KeccakP-1600-opt64.c
+++ /dev/null
@@ -1,227 +0,0 @@
-/*
-K12 based on the eXtended Keccak Code Package (XKCP)
-https://github.com/XKCP/XKCP
-
-The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche.
-
-Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer".
-
-For more information, feedback or questions, please refer to the Keccak Team website:
-https://keccak.team/
-
-To the extent possible under law, the implementer has waived all copyright
-and related or neighboring rights to the source code in this file.
-http://creativecommons.org/publicdomain/zero/1.0/
-
----
-
-Please refer to the XKCP for more details.
-*/ - -#include -#include -#include -#include - -const char * KeccakP1600_GetImplementation() -{ - return "ARMv8-A+SHA3 optimized implementation"; -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_opt64_Initialize(void *state) -{ - memset(state, 0, 200); -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_opt64_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length) -{ - uint64_t lane; - - if (length == 0) - return; - if (length == 1) - lane = data[0]; - else { - lane = 0; - memcpy(&lane, data, length); - } - lane <<= offset*8; - ((uint64_t*)state)[lanePosition] ^= lane; -} - -/* ---------------------------------------------------------------- */ - -static void KeccakP1600_opt64_AddLanes(void *state, const unsigned char *data, unsigned int laneCount) -{ - unsigned int i = 0; - - for( ; (i+8)<=laneCount; i+=8) { - ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0]; - ((uint64_t*)state)[i+1] ^= ((uint64_t*)data)[i+1]; - ((uint64_t*)state)[i+2] ^= ((uint64_t*)data)[i+2]; - ((uint64_t*)state)[i+3] ^= ((uint64_t*)data)[i+3]; - ((uint64_t*)state)[i+4] ^= ((uint64_t*)data)[i+4]; - ((uint64_t*)state)[i+5] ^= ((uint64_t*)data)[i+5]; - ((uint64_t*)state)[i+6] ^= ((uint64_t*)data)[i+6]; - ((uint64_t*)state)[i+7] ^= ((uint64_t*)data)[i+7]; - } - for( ; (i+4)<=laneCount; i+=4) { - ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0]; - ((uint64_t*)state)[i+1] ^= ((uint64_t*)data)[i+1]; - ((uint64_t*)state)[i+2] ^= ((uint64_t*)data)[i+2]; - ((uint64_t*)state)[i+3] ^= ((uint64_t*)data)[i+3]; - } - for( ; (i+2)<=laneCount; i+=2) { - ((uint64_t*)state)[i+0] ^= ((uint64_t*)data)[i+0]; - ((uint64_t*)state)[i+1] ^= ((uint64_t*)data)[i+1]; - } - if (i 0) { \ - unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ - if (_bytesInLane > _sizeLeft) \ - _bytesInLane = _sizeLeft; \ - SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ - _sizeLeft -= _bytesInLane; \ - _lanePosition++; \ - _offsetInLane = 0; \ - _curData += _bytesInLane; \ - } \ - } \ - } - -void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) -{ - SnP_AddBytes(state, data, offset, length, KeccakP1600_opt64_AddLanes, KeccakP1600_opt64_AddBytesInLane, 8); -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_opt64_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length) -{ - uint64_t lane = ((uint64_t*)state)[lanePosition]; - { - uint64_t lane1[1]; - lane1[0] = lane; - memcpy(data, (uint8_t*)lane1+offset, length); - } -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_opt64_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount) -{ - memcpy(data, state, laneCount*8); -} - -/* ---------------------------------------------------------------- */ - -#define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \ - { \ - if ((offset) == 0) { \ - SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \ - SnP_ExtractBytesInLane(state, \ - (length)/SnP_laneLengthInBytes, \ - (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \ - 0, \ - (length)%SnP_laneLengthInBytes); \ - } \ - else { \ - unsigned int _sizeLeft = (length); \ - unsigned int 
_lanePosition = (offset)/SnP_laneLengthInBytes; \ - unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \ - unsigned char *_curData = (data); \ - while(_sizeLeft > 0) { \ - unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ - if (_bytesInLane > _sizeLeft) \ - _bytesInLane = _sizeLeft; \ - SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ - _sizeLeft -= _bytesInLane; \ - _lanePosition++; \ - _offsetInLane = 0; \ - _curData += _bytesInLane; \ - } \ - } \ - } - -void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) -{ - SnP_ExtractBytes(state, data, offset, length, KeccakP1600_opt64_ExtractLanes, KeccakP1600_opt64_ExtractBytesInLane, 8); -} - -/* ---------------------------------------------------------------- */ - -/* Keccak-p[1600]×2 */ - -int KeccakP1600times2_IsAvailable() -{ - return 1; -} - -const char * KeccakP1600times2_GetImplementation() -{ - return "ARMv8-A+SHA3 optimized implementation"; -} - -/* Keccak-p[1600]×4 */ - -int KeccakP1600times4_IsAvailable() -{ - return 0; -} - -const char * KeccakP1600times4_GetImplementation() -{ - return ""; -} - -void KangarooTwelve_Process4Leaves(const unsigned char *input, unsigned char *output) -{ -} - -/* Keccak-p[1600]×8 */ - -int KeccakP1600times8_IsAvailable() -{ - return 0; -} - -const char * KeccakP1600times8_GetImplementation() -{ - return ""; -} - -void KangarooTwelve_Process8Leaves(const unsigned char *input, unsigned char *output) -{ -} diff --git a/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-SnP.h b/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-SnP.h deleted file mode 100644 index ac76272..0000000 --- a/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-SnP.h +++ /dev/null @@ -1,35 +0,0 @@ -/* -K12 based on the eXtended Keccak Code Package (XKCP) -https://github.com/XKCP/XKCP - -The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. - -Implementation by Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to the Keccak Team website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -Please refer to the XKCP for more details. -*/ - -#ifndef _KeccakP_1600_SnP_h_ -#define _KeccakP_1600_SnP_h_ - -#define KeccakP1600_stateSizeInBytes 200 -#define KeccakP1600_stateAlignment 8 -#define KeccakP1600_disableParallelism - -const char * KeccakP1600_GetImplementation(); -void KeccakP1600_Initialize(void *state); -void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); -void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); -void KeccakP1600_Permute_12rounds(void *state); -void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); - -#endif diff --git a/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c b/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c deleted file mode 100644 index a72dc7c..0000000 --- a/ffi-deps/K12/lib/Inplace32BI/KeccakP-1600-inplace32BI.c +++ /dev/null @@ -1,1068 +0,0 @@ -/* -K12 based on the eXtended Keccak Code Package (XKCP) -https://github.com/XKCP/XKCP - -The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. 
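The implementation below uses factor-2 bit interleaving: each 64-bit lane is
stored as two 32-bit words, one holding the even-indexed bits and one the
odd-indexed bits, so that 64-bit rotations in the round function can be
carried out with 32-bit rotations of the two halves.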
- -Implementation by Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to the Keccak Team website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -Please refer to the XKCP for more details. -*/ - -#include -#include -#include "brg_endian.h" -#include "KeccakP-1600-SnP.h" - -const char * KeccakP1600_GetImplementation() -{ - return "in-place 32-bit implementation"; -} - - -#define ROL32(a, offset) ((((uint32_t)a) << (offset)) ^ (((uint32_t)a) >> (32-(offset)))) - -/* Credit to Henry S. Warren, Hacker's Delight, Addison-Wesley, 2002 */ -#define prepareToBitInterleaving(low, high, temp, temp0, temp1) \ - temp0 = (low); \ - temp = (temp0 ^ (temp0 >> 1)) & 0x22222222UL; temp0 = temp0 ^ temp ^ (temp << 1); \ - temp = (temp0 ^ (temp0 >> 2)) & 0x0C0C0C0CUL; temp0 = temp0 ^ temp ^ (temp << 2); \ - temp = (temp0 ^ (temp0 >> 4)) & 0x00F000F0UL; temp0 = temp0 ^ temp ^ (temp << 4); \ - temp = (temp0 ^ (temp0 >> 8)) & 0x0000FF00UL; temp0 = temp0 ^ temp ^ (temp << 8); \ - temp1 = (high); \ - temp = (temp1 ^ (temp1 >> 1)) & 0x22222222UL; temp1 = temp1 ^ temp ^ (temp << 1); \ - temp = (temp1 ^ (temp1 >> 2)) & 0x0C0C0C0CUL; temp1 = temp1 ^ temp ^ (temp << 2); \ - temp = (temp1 ^ (temp1 >> 4)) & 0x00F000F0UL; temp1 = temp1 ^ temp ^ (temp << 4); \ - temp = (temp1 ^ (temp1 >> 8)) & 0x0000FF00UL; temp1 = temp1 ^ temp ^ (temp << 8); - -#define toBitInterleavingAndXOR(low, high, even, odd, temp, temp0, temp1) \ - prepareToBitInterleaving(low, high, temp, temp0, temp1) \ - even ^= (temp0 & 0x0000FFFF) | (temp1 << 16); \ - odd ^= (temp0 >> 16) | (temp1 & 0xFFFF0000); - -#define toBitInterleavingAndAND(low, high, even, odd, temp, temp0, temp1) \ - prepareToBitInterleaving(low, high, temp, temp0, temp1) \ - even &= (temp0 & 0x0000FFFF) | (temp1 << 16); \ - odd &= (temp0 >> 16) | (temp1 & 0xFFFF0000); - -#define toBitInterleavingAndSet(low, high, even, odd, temp, temp0, temp1) \ - prepareToBitInterleaving(low, high, temp, temp0, temp1) \ - even = (temp0 & 0x0000FFFF) | (temp1 << 16); \ - odd = (temp0 >> 16) | (temp1 & 0xFFFF0000); - -/* Credit to Henry S. 
Warren, Hacker's Delight, Addison-Wesley, 2002 */ -#define prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \ - temp0 = (even); \ - temp1 = (odd); \ - temp = (temp0 & 0x0000FFFF) | (temp1 << 16); \ - temp1 = (temp0 >> 16) | (temp1 & 0xFFFF0000); \ - temp0 = temp; \ - temp = (temp0 ^ (temp0 >> 8)) & 0x0000FF00UL; temp0 = temp0 ^ temp ^ (temp << 8); \ - temp = (temp0 ^ (temp0 >> 4)) & 0x00F000F0UL; temp0 = temp0 ^ temp ^ (temp << 4); \ - temp = (temp0 ^ (temp0 >> 2)) & 0x0C0C0C0CUL; temp0 = temp0 ^ temp ^ (temp << 2); \ - temp = (temp0 ^ (temp0 >> 1)) & 0x22222222UL; temp0 = temp0 ^ temp ^ (temp << 1); \ - temp = (temp1 ^ (temp1 >> 8)) & 0x0000FF00UL; temp1 = temp1 ^ temp ^ (temp << 8); \ - temp = (temp1 ^ (temp1 >> 4)) & 0x00F000F0UL; temp1 = temp1 ^ temp ^ (temp << 4); \ - temp = (temp1 ^ (temp1 >> 2)) & 0x0C0C0C0CUL; temp1 = temp1 ^ temp ^ (temp << 2); \ - temp = (temp1 ^ (temp1 >> 1)) & 0x22222222UL; temp1 = temp1 ^ temp ^ (temp << 1); - -#define fromBitInterleaving(even, odd, low, high, temp, temp0, temp1) \ - prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \ - low = temp0; \ - high = temp1; - -#define fromBitInterleavingAndXOR(even, odd, lowIn, highIn, lowOut, highOut, temp, temp0, temp1) \ - prepareFromBitInterleaving(even, odd, temp, temp0, temp1) \ - lowOut = lowIn ^ temp0; \ - highOut = highIn ^ temp1; - -void KeccakP1600_SetBytesInLaneToZero(void *state, unsigned int lanePosition, unsigned int offset, unsigned int length) -{ - uint8_t laneAsBytes[8]; - uint32_t low, high; - uint32_t temp, temp0, temp1; - uint32_t *stateAsHalfLanes = (uint32_t*)state; - - memset(laneAsBytes, 0xFF, offset); - memset(laneAsBytes+offset, 0x00, length); - memset(laneAsBytes+offset+length, 0xFF, 8-offset-length); -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) - low = *((uint32_t*)(laneAsBytes+0)); - high = *((uint32_t*)(laneAsBytes+4)); -#else - low = laneAsBytes[0] - | ((uint32_t)(laneAsBytes[1]) << 8) - | ((uint32_t)(laneAsBytes[2]) << 16) - | ((uint32_t)(laneAsBytes[3]) << 24); - high = laneAsBytes[4] - | ((uint32_t)(laneAsBytes[5]) << 8) - | ((uint32_t)(laneAsBytes[6]) << 16) - | ((uint32_t)(laneAsBytes[7]) << 24); -#endif - toBitInterleavingAndAND(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1); -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_Initialize(void *state) -{ - memset(state, 0, 200); -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_AddByte(void *state, unsigned char byte, unsigned int offset) -{ - unsigned int lanePosition = offset/8; - unsigned int offsetInLane = offset%8; - uint32_t low, high; - uint32_t temp, temp0, temp1; - uint32_t *stateAsHalfLanes = (uint32_t*)state; - - if (offsetInLane < 4) { - low = (uint32_t)byte << (offsetInLane*8); - high = 0; - } - else { - low = 0; - high = (uint32_t)byte << ((offsetInLane-4)*8); - } - toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1); -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length) -{ - uint8_t laneAsBytes[8]; - uint32_t low, high; - uint32_t temp, temp0, temp1; - uint32_t *stateAsHalfLanes = (uint32_t*)state; - - memset(laneAsBytes, 0, 8); - memcpy(laneAsBytes+offset, data, length); -#if (PLATFORM_BYTE_ORDER == 
IS_LITTLE_ENDIAN) - low = *((uint32_t*)(laneAsBytes+0)); - high = *((uint32_t*)(laneAsBytes+4)); -#else - low = laneAsBytes[0] - | ((uint32_t)(laneAsBytes[1]) << 8) - | ((uint32_t)(laneAsBytes[2]) << 16) - | ((uint32_t)(laneAsBytes[3]) << 24); - high = laneAsBytes[4] - | ((uint32_t)(laneAsBytes[5]) << 8) - | ((uint32_t)(laneAsBytes[6]) << 16) - | ((uint32_t)(laneAsBytes[7]) << 24); -#endif - toBitInterleavingAndXOR(low, high, stateAsHalfLanes[lanePosition*2+0], stateAsHalfLanes[lanePosition*2+1], temp, temp0, temp1); -} - -/* ---------------------------------------------------------------- */ - -static void KeccakP1600_AddLanes(void *state, const unsigned char *data, unsigned int laneCount) -{ -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) - const uint32_t * pI = (const uint32_t *)data; - uint32_t * pS = (uint32_t*)state; - uint32_t t, x0, x1; - int i; - for (i = laneCount-1; i >= 0; --i) { -#ifdef NO_MISALIGNED_ACCESSES - uint32_t low; - uint32_t high; - memcpy(&low, pI++, 4); - memcpy(&high, pI++, 4); - toBitInterleavingAndXOR(low, high, *(pS++), *(pS++), t, x0, x1); -#else - toBitInterleavingAndXOR(*(pI++), *(pI++), *(pS++), *(pS++), t, x0, x1) -#endif - } -#else - unsigned int lanePosition; - for(lanePosition=0; lanePosition 0) { \ - unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ - if (_bytesInLane > _sizeLeft) \ - _bytesInLane = _sizeLeft; \ - SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ - _sizeLeft -= _bytesInLane; \ - _lanePosition++; \ - _offsetInLane = 0; \ - _curData += _bytesInLane; \ - } \ - } \ - } - -void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) -{ - SnP_AddBytes(state, data, offset, length, KeccakP1600_AddLanes, KeccakP1600_AddBytesInLane, 8); -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length) -{ - uint32_t *stateAsHalfLanes = (uint32_t*)state; - uint32_t low, high, temp, temp0, temp1; - uint8_t laneAsBytes[8]; - - fromBitInterleaving(stateAsHalfLanes[lanePosition*2], stateAsHalfLanes[lanePosition*2+1], low, high, temp, temp0, temp1); -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) - *((uint32_t*)(laneAsBytes+0)) = low; - *((uint32_t*)(laneAsBytes+4)) = high; -#else - laneAsBytes[0] = low & 0xFF; - laneAsBytes[1] = (low >> 8) & 0xFF; - laneAsBytes[2] = (low >> 16) & 0xFF; - laneAsBytes[3] = (low >> 24) & 0xFF; - laneAsBytes[4] = high & 0xFF; - laneAsBytes[5] = (high >> 8) & 0xFF; - laneAsBytes[6] = (high >> 16) & 0xFF; - laneAsBytes[7] = (high >> 24) & 0xFF; -#endif - memcpy(data, laneAsBytes+offset, length); -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount) -{ -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) - uint32_t * pI = (uint32_t *)data; - const uint32_t * pS = ( const uint32_t *)state; - uint32_t t, x0, x1; - int i; - for (i = laneCount-1; i >= 0; --i) { -#ifdef NO_MISALIGNED_ACCESSES - uint32_t low; - uint32_t high; - fromBitInterleaving(*(pS++), *(pS++), low, high, t, x0, x1); - memcpy(pI++, &low, 4); - memcpy(pI++, &high, 4); -#else - fromBitInterleaving(*(pS++), *(pS++), *(pI++), *(pI++), t, x0, x1) -#endif - } -#else - unsigned int lanePosition; - for(lanePosition=0; lanePosition> 8) & 0xFF; - laneAsBytes[2] = (low >> 16) & 0xFF; - laneAsBytes[3] = (low 
>> 24) & 0xFF; - laneAsBytes[4] = high & 0xFF; - laneAsBytes[5] = (high >> 8) & 0xFF; - laneAsBytes[6] = (high >> 16) & 0xFF; - laneAsBytes[7] = (high >> 24) & 0xFF; - memcpy(data+lanePosition*8, laneAsBytes, 8); - } -#endif -} - -/* ---------------------------------------------------------------- */ - -#define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \ - { \ - if ((offset) == 0) { \ - SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \ - SnP_ExtractBytesInLane(state, \ - (length)/SnP_laneLengthInBytes, \ - (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \ - 0, \ - (length)%SnP_laneLengthInBytes); \ - } \ - else { \ - unsigned int _sizeLeft = (length); \ - unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \ - unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \ - unsigned char *_curData = (data); \ - while(_sizeLeft > 0) { \ - unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ - if (_bytesInLane > _sizeLeft) \ - _bytesInLane = _sizeLeft; \ - SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ - _sizeLeft -= _bytesInLane; \ - _lanePosition++; \ - _offsetInLane = 0; \ - _curData += _bytesInLane; \ - } \ - } \ - } - -void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) -{ - SnP_ExtractBytes(state, data, offset, length, KeccakP1600_ExtractLanes, KeccakP1600_ExtractBytesInLane, 8); -} - -/* ---------------------------------------------------------------- */ - -static const uint32_t KeccakF1600RoundConstants_int2[2*24+1] = -{ - 0x00000001UL, 0x00000000UL, - 0x00000000UL, 0x00000089UL, - 0x00000000UL, 0x8000008bUL, - 0x00000000UL, 0x80008080UL, - 0x00000001UL, 0x0000008bUL, - 0x00000001UL, 0x00008000UL, - 0x00000001UL, 0x80008088UL, - 0x00000001UL, 0x80000082UL, - 0x00000000UL, 0x0000000bUL, - 0x00000000UL, 0x0000000aUL, - 0x00000001UL, 0x00008082UL, - 0x00000000UL, 0x00008003UL, - 0x00000001UL, 0x0000808bUL, - 0x00000001UL, 0x8000000bUL, - 0x00000001UL, 0x8000008aUL, - 0x00000001UL, 0x80000081UL, - 0x00000000UL, 0x80000081UL, - 0x00000000UL, 0x80000008UL, - 0x00000000UL, 0x00000083UL, - 0x00000000UL, 0x80008003UL, - 0x00000001UL, 0x80008088UL, - 0x00000000UL, 0x80000088UL, - 0x00000001UL, 0x00008000UL, - 0x00000000UL, 0x80008082UL, - 0x000000FFUL -}; - -#define KeccakRound0() \ - Cx = Abu0^Agu0^Aku0^Amu0^Asu0; \ - Du1 = Abe1^Age1^Ake1^Ame1^Ase1; \ - Da0 = Cx^ROL32(Du1, 1); \ - Cz = Abu1^Agu1^Aku1^Amu1^Asu1; \ - Du0 = Abe0^Age0^Ake0^Ame0^Ase0; \ - Da1 = Cz^Du0; \ - Cw = Abi0^Agi0^Aki0^Ami0^Asi0; \ - Do0 = Cw^ROL32(Cz, 1); \ - Cy = Abi1^Agi1^Aki1^Ami1^Asi1; \ - Do1 = Cy^Cx; \ - Cx = Aba0^Aga0^Aka0^Ama0^Asa0; \ - De0 = Cx^ROL32(Cy, 1); \ - Cz = Aba1^Aga1^Aka1^Ama1^Asa1; \ - De1 = Cz^Cw; \ - Cy = Abo1^Ago1^Ako1^Amo1^Aso1; \ - Di0 = Du0^ROL32(Cy, 1); \ - Cw = Abo0^Ago0^Ako0^Amo0^Aso0; \ - Di1 = Du1^Cw; \ - Du0 = Cw^ROL32(Cz, 1); \ - Du1 = Cy^Cx; \ -\ - Ba = (Aba0^Da0); \ - Be = ROL32((Age0^De0), 22); \ - Bi = ROL32((Aki1^Di1), 22); \ - Bo = ROL32((Amo1^Do1), 11); \ - Bu = ROL32((Asu0^Du0), 7); \ - Aba0 = Ba ^((~Be)& Bi ); \ - Aba0 ^= *(pRoundConstants++); \ - Age0 = Be ^((~Bi)& Bo ); \ - Aki1 = Bi ^((~Bo)& Bu ); \ - Amo1 = Bo ^((~Bu)& Ba ); \ - Asu0 = Bu ^((~Ba)& Be ); \ - Ba = (Aba1^Da1); \ - Be = ROL32((Age1^De1), 22); \ - Bi = ROL32((Aki0^Di0), 21); \ - Bo = ROL32((Amo0^Do0), 10); \ - Bu = ROL32((Asu1^Du1), 7); \ - Aba1 = Ba ^((~Be)& Bi ); \ - Aba1 ^= 
*(pRoundConstants++); \ - Age1 = Be ^((~Bi)& Bo ); \ - Aki0 = Bi ^((~Bo)& Bu ); \ - Amo0 = Bo ^((~Bu)& Ba ); \ - Asu1 = Bu ^((~Ba)& Be ); \ - Bi = ROL32((Aka1^Da1), 2); \ - Bo = ROL32((Ame1^De1), 23); \ - Bu = ROL32((Asi1^Di1), 31); \ - Ba = ROL32((Abo0^Do0), 14); \ - Be = ROL32((Agu0^Du0), 10); \ - Aka1 = Ba ^((~Be)& Bi ); \ - Ame1 = Be ^((~Bi)& Bo ); \ - Asi1 = Bi ^((~Bo)& Bu ); \ - Abo0 = Bo ^((~Bu)& Ba ); \ - Agu0 = Bu ^((~Ba)& Be ); \ - Bi = ROL32((Aka0^Da0), 1); \ - Bo = ROL32((Ame0^De0), 22); \ - Bu = ROL32((Asi0^Di0), 30); \ - Ba = ROL32((Abo1^Do1), 14); \ - Be = ROL32((Agu1^Du1), 10); \ - Aka0 = Ba ^((~Be)& Bi ); \ - Ame0 = Be ^((~Bi)& Bo ); \ - Asi0 = Bi ^((~Bo)& Bu ); \ - Abo1 = Bo ^((~Bu)& Ba ); \ - Agu1 = Bu ^((~Ba)& Be ); \ - Bu = ROL32((Asa0^Da0), 9); \ - Ba = ROL32((Abe1^De1), 1); \ - Be = ROL32((Agi0^Di0), 3); \ - Bi = ROL32((Ako1^Do1), 13); \ - Bo = ROL32((Amu0^Du0), 4); \ - Asa0 = Ba ^((~Be)& Bi ); \ - Abe1 = Be ^((~Bi)& Bo ); \ - Agi0 = Bi ^((~Bo)& Bu ); \ - Ako1 = Bo ^((~Bu)& Ba ); \ - Amu0 = Bu ^((~Ba)& Be ); \ - Bu = ROL32((Asa1^Da1), 9); \ - Ba = (Abe0^De0); \ - Be = ROL32((Agi1^Di1), 3); \ - Bi = ROL32((Ako0^Do0), 12); \ - Bo = ROL32((Amu1^Du1), 4); \ - Asa1 = Ba ^((~Be)& Bi ); \ - Abe0 = Be ^((~Bi)& Bo ); \ - Agi1 = Bi ^((~Bo)& Bu ); \ - Ako0 = Bo ^((~Bu)& Ba ); \ - Amu1 = Bu ^((~Ba)& Be ); \ - Be = ROL32((Aga0^Da0), 18); \ - Bi = ROL32((Ake0^De0), 5); \ - Bo = ROL32((Ami1^Di1), 8); \ - Bu = ROL32((Aso0^Do0), 28); \ - Ba = ROL32((Abu1^Du1), 14); \ - Aga0 = Ba ^((~Be)& Bi ); \ - Ake0 = Be ^((~Bi)& Bo ); \ - Ami1 = Bi ^((~Bo)& Bu ); \ - Aso0 = Bo ^((~Bu)& Ba ); \ - Abu1 = Bu ^((~Ba)& Be ); \ - Be = ROL32((Aga1^Da1), 18); \ - Bi = ROL32((Ake1^De1), 5); \ - Bo = ROL32((Ami0^Di0), 7); \ - Bu = ROL32((Aso1^Do1), 28); \ - Ba = ROL32((Abu0^Du0), 13); \ - Aga1 = Ba ^((~Be)& Bi ); \ - Ake1 = Be ^((~Bi)& Bo ); \ - Ami0 = Bi ^((~Bo)& Bu ); \ - Aso1 = Bo ^((~Bu)& Ba ); \ - Abu0 = Bu ^((~Ba)& Be ); \ - Bo = ROL32((Ama1^Da1), 21); \ - Bu = ROL32((Ase0^De0), 1); \ - Ba = ROL32((Abi0^Di0), 31); \ - Be = ROL32((Ago1^Do1), 28); \ - Bi = ROL32((Aku1^Du1), 20); \ - Ama1 = Ba ^((~Be)& Bi ); \ - Ase0 = Be ^((~Bi)& Bo ); \ - Abi0 = Bi ^((~Bo)& Bu ); \ - Ago1 = Bo ^((~Bu)& Ba ); \ - Aku1 = Bu ^((~Ba)& Be ); \ - Bo = ROL32((Ama0^Da0), 20); \ - Bu = ROL32((Ase1^De1), 1); \ - Ba = ROL32((Abi1^Di1), 31); \ - Be = ROL32((Ago0^Do0), 27); \ - Bi = ROL32((Aku0^Du0), 19); \ - Ama0 = Ba ^((~Be)& Bi ); \ - Ase1 = Be ^((~Bi)& Bo ); \ - Abi1 = Bi ^((~Bo)& Bu ); \ - Ago0 = Bo ^((~Bu)& Ba ); \ - Aku0 = Bu ^((~Ba)& Be ) - -#define KeccakRound1() \ - Cx = Asu0^Agu0^Amu0^Abu1^Aku1; \ - Du1 = Age1^Ame0^Abe0^Ake1^Ase1; \ - Da0 = Cx^ROL32(Du1, 1); \ - Cz = Asu1^Agu1^Amu1^Abu0^Aku0; \ - Du0 = Age0^Ame1^Abe1^Ake0^Ase0; \ - Da1 = Cz^Du0; \ - Cw = Aki1^Asi1^Agi0^Ami1^Abi0; \ - Do0 = Cw^ROL32(Cz, 1); \ - Cy = Aki0^Asi0^Agi1^Ami0^Abi1; \ - Do1 = Cy^Cx; \ - Cx = Aba0^Aka1^Asa0^Aga0^Ama1; \ - De0 = Cx^ROL32(Cy, 1); \ - Cz = Aba1^Aka0^Asa1^Aga1^Ama0; \ - De1 = Cz^Cw; \ - Cy = Amo0^Abo1^Ako0^Aso1^Ago0; \ - Di0 = Du0^ROL32(Cy, 1); \ - Cw = Amo1^Abo0^Ako1^Aso0^Ago1; \ - Di1 = Du1^Cw; \ - Du0 = Cw^ROL32(Cz, 1); \ - Du1 = Cy^Cx; \ -\ - Ba = (Aba0^Da0); \ - Be = ROL32((Ame1^De0), 22); \ - Bi = ROL32((Agi1^Di1), 22); \ - Bo = ROL32((Aso1^Do1), 11); \ - Bu = ROL32((Aku1^Du0), 7); \ - Aba0 = Ba ^((~Be)& Bi ); \ - Aba0 ^= *(pRoundConstants++); \ - Ame1 = Be ^((~Bi)& Bo ); \ - Agi1 = Bi ^((~Bo)& Bu ); \ - Aso1 = Bo ^((~Bu)& Ba ); \ - Aku1 = Bu ^((~Ba)& Be ); \ - Ba = (Aba1^Da1); \ - Be = ROL32((Ame0^De1), 22); \ - Bi = 
ROL32((Agi0^Di0), 21); \ - Bo = ROL32((Aso0^Do0), 10); \ - Bu = ROL32((Aku0^Du1), 7); \ - Aba1 = Ba ^((~Be)& Bi ); \ - Aba1 ^= *(pRoundConstants++); \ - Ame0 = Be ^((~Bi)& Bo ); \ - Agi0 = Bi ^((~Bo)& Bu ); \ - Aso0 = Bo ^((~Bu)& Ba ); \ - Aku0 = Bu ^((~Ba)& Be ); \ - Bi = ROL32((Asa1^Da1), 2); \ - Bo = ROL32((Ake1^De1), 23); \ - Bu = ROL32((Abi1^Di1), 31); \ - Ba = ROL32((Amo1^Do0), 14); \ - Be = ROL32((Agu0^Du0), 10); \ - Asa1 = Ba ^((~Be)& Bi ); \ - Ake1 = Be ^((~Bi)& Bo ); \ - Abi1 = Bi ^((~Bo)& Bu ); \ - Amo1 = Bo ^((~Bu)& Ba ); \ - Agu0 = Bu ^((~Ba)& Be ); \ - Bi = ROL32((Asa0^Da0), 1); \ - Bo = ROL32((Ake0^De0), 22); \ - Bu = ROL32((Abi0^Di0), 30); \ - Ba = ROL32((Amo0^Do1), 14); \ - Be = ROL32((Agu1^Du1), 10); \ - Asa0 = Ba ^((~Be)& Bi ); \ - Ake0 = Be ^((~Bi)& Bo ); \ - Abi0 = Bi ^((~Bo)& Bu ); \ - Amo0 = Bo ^((~Bu)& Ba ); \ - Agu1 = Bu ^((~Ba)& Be ); \ - Bu = ROL32((Ama1^Da0), 9); \ - Ba = ROL32((Age1^De1), 1); \ - Be = ROL32((Asi1^Di0), 3); \ - Bi = ROL32((Ako0^Do1), 13); \ - Bo = ROL32((Abu1^Du0), 4); \ - Ama1 = Ba ^((~Be)& Bi ); \ - Age1 = Be ^((~Bi)& Bo ); \ - Asi1 = Bi ^((~Bo)& Bu ); \ - Ako0 = Bo ^((~Bu)& Ba ); \ - Abu1 = Bu ^((~Ba)& Be ); \ - Bu = ROL32((Ama0^Da1), 9); \ - Ba = (Age0^De0); \ - Be = ROL32((Asi0^Di1), 3); \ - Bi = ROL32((Ako1^Do0), 12); \ - Bo = ROL32((Abu0^Du1), 4); \ - Ama0 = Ba ^((~Be)& Bi ); \ - Age0 = Be ^((~Bi)& Bo ); \ - Asi0 = Bi ^((~Bo)& Bu ); \ - Ako1 = Bo ^((~Bu)& Ba ); \ - Abu0 = Bu ^((~Ba)& Be ); \ - Be = ROL32((Aka1^Da0), 18); \ - Bi = ROL32((Abe1^De0), 5); \ - Bo = ROL32((Ami0^Di1), 8); \ - Bu = ROL32((Ago1^Do0), 28); \ - Ba = ROL32((Asu1^Du1), 14); \ - Aka1 = Ba ^((~Be)& Bi ); \ - Abe1 = Be ^((~Bi)& Bo ); \ - Ami0 = Bi ^((~Bo)& Bu ); \ - Ago1 = Bo ^((~Bu)& Ba ); \ - Asu1 = Bu ^((~Ba)& Be ); \ - Be = ROL32((Aka0^Da1), 18); \ - Bi = ROL32((Abe0^De1), 5); \ - Bo = ROL32((Ami1^Di0), 7); \ - Bu = ROL32((Ago0^Do1), 28); \ - Ba = ROL32((Asu0^Du0), 13); \ - Aka0 = Ba ^((~Be)& Bi ); \ - Abe0 = Be ^((~Bi)& Bo ); \ - Ami1 = Bi ^((~Bo)& Bu ); \ - Ago0 = Bo ^((~Bu)& Ba ); \ - Asu0 = Bu ^((~Ba)& Be ); \ - Bo = ROL32((Aga1^Da1), 21); \ - Bu = ROL32((Ase0^De0), 1); \ - Ba = ROL32((Aki1^Di0), 31); \ - Be = ROL32((Abo1^Do1), 28); \ - Bi = ROL32((Amu1^Du1), 20); \ - Aga1 = Ba ^((~Be)& Bi ); \ - Ase0 = Be ^((~Bi)& Bo ); \ - Aki1 = Bi ^((~Bo)& Bu ); \ - Abo1 = Bo ^((~Bu)& Ba ); \ - Amu1 = Bu ^((~Ba)& Be ); \ - Bo = ROL32((Aga0^Da0), 20); \ - Bu = ROL32((Ase1^De1), 1); \ - Ba = ROL32((Aki0^Di1), 31); \ - Be = ROL32((Abo0^Do0), 27); \ - Bi = ROL32((Amu0^Du0), 19); \ - Aga0 = Ba ^((~Be)& Bi ); \ - Ase1 = Be ^((~Bi)& Bo ); \ - Aki0 = Bi ^((~Bo)& Bu ); \ - Abo0 = Bo ^((~Bu)& Ba ); \ - Amu0 = Bu ^((~Ba)& Be ); - -#define KeccakRound2() \ - Cx = Aku1^Agu0^Abu1^Asu1^Amu1; \ - Du1 = Ame0^Ake0^Age0^Abe0^Ase1; \ - Da0 = Cx^ROL32(Du1, 1); \ - Cz = Aku0^Agu1^Abu0^Asu0^Amu0; \ - Du0 = Ame1^Ake1^Age1^Abe1^Ase0; \ - Da1 = Cz^Du0; \ - Cw = Agi1^Abi1^Asi1^Ami0^Aki1; \ - Do0 = Cw^ROL32(Cz, 1); \ - Cy = Agi0^Abi0^Asi0^Ami1^Aki0; \ - Do1 = Cy^Cx; \ - Cx = Aba0^Asa1^Ama1^Aka1^Aga1; \ - De0 = Cx^ROL32(Cy, 1); \ - Cz = Aba1^Asa0^Ama0^Aka0^Aga0; \ - De1 = Cz^Cw; \ - Cy = Aso0^Amo0^Ako1^Ago0^Abo0; \ - Di0 = Du0^ROL32(Cy, 1); \ - Cw = Aso1^Amo1^Ako0^Ago1^Abo1; \ - Di1 = Du1^Cw; \ - Du0 = Cw^ROL32(Cz, 1); \ - Du1 = Cy^Cx; \ -\ - Ba = (Aba0^Da0); \ - Be = ROL32((Ake1^De0), 22); \ - Bi = ROL32((Asi0^Di1), 22); \ - Bo = ROL32((Ago0^Do1), 11); \ - Bu = ROL32((Amu1^Du0), 7); \ - Aba0 = Ba ^((~Be)& Bi ); \ - Aba0 ^= *(pRoundConstants++); \ - Ake1 = Be ^((~Bi)& Bo ); \ - Asi0 = Bi ^((~Bo)& Bu ); \ 
- Ago0 = Bo ^((~Bu)& Ba ); \ - Amu1 = Bu ^((~Ba)& Be ); \ - Ba = (Aba1^Da1); \ - Be = ROL32((Ake0^De1), 22); \ - Bi = ROL32((Asi1^Di0), 21); \ - Bo = ROL32((Ago1^Do0), 10); \ - Bu = ROL32((Amu0^Du1), 7); \ - Aba1 = Ba ^((~Be)& Bi ); \ - Aba1 ^= *(pRoundConstants++); \ - Ake0 = Be ^((~Bi)& Bo ); \ - Asi1 = Bi ^((~Bo)& Bu ); \ - Ago1 = Bo ^((~Bu)& Ba ); \ - Amu0 = Bu ^((~Ba)& Be ); \ - Bi = ROL32((Ama0^Da1), 2); \ - Bo = ROL32((Abe0^De1), 23); \ - Bu = ROL32((Aki0^Di1), 31); \ - Ba = ROL32((Aso1^Do0), 14); \ - Be = ROL32((Agu0^Du0), 10); \ - Ama0 = Ba ^((~Be)& Bi ); \ - Abe0 = Be ^((~Bi)& Bo ); \ - Aki0 = Bi ^((~Bo)& Bu ); \ - Aso1 = Bo ^((~Bu)& Ba ); \ - Agu0 = Bu ^((~Ba)& Be ); \ - Bi = ROL32((Ama1^Da0), 1); \ - Bo = ROL32((Abe1^De0), 22); \ - Bu = ROL32((Aki1^Di0), 30); \ - Ba = ROL32((Aso0^Do1), 14); \ - Be = ROL32((Agu1^Du1), 10); \ - Ama1 = Ba ^((~Be)& Bi ); \ - Abe1 = Be ^((~Bi)& Bo ); \ - Aki1 = Bi ^((~Bo)& Bu ); \ - Aso0 = Bo ^((~Bu)& Ba ); \ - Agu1 = Bu ^((~Ba)& Be ); \ - Bu = ROL32((Aga1^Da0), 9); \ - Ba = ROL32((Ame0^De1), 1); \ - Be = ROL32((Abi1^Di0), 3); \ - Bi = ROL32((Ako1^Do1), 13); \ - Bo = ROL32((Asu1^Du0), 4); \ - Aga1 = Ba ^((~Be)& Bi ); \ - Ame0 = Be ^((~Bi)& Bo ); \ - Abi1 = Bi ^((~Bo)& Bu ); \ - Ako1 = Bo ^((~Bu)& Ba ); \ - Asu1 = Bu ^((~Ba)& Be ); \ - Bu = ROL32((Aga0^Da1), 9); \ - Ba = (Ame1^De0); \ - Be = ROL32((Abi0^Di1), 3); \ - Bi = ROL32((Ako0^Do0), 12); \ - Bo = ROL32((Asu0^Du1), 4); \ - Aga0 = Ba ^((~Be)& Bi ); \ - Ame1 = Be ^((~Bi)& Bo ); \ - Abi0 = Bi ^((~Bo)& Bu ); \ - Ako0 = Bo ^((~Bu)& Ba ); \ - Asu0 = Bu ^((~Ba)& Be ); \ - Be = ROL32((Asa1^Da0), 18); \ - Bi = ROL32((Age1^De0), 5); \ - Bo = ROL32((Ami1^Di1), 8); \ - Bu = ROL32((Abo1^Do0), 28); \ - Ba = ROL32((Aku0^Du1), 14); \ - Asa1 = Ba ^((~Be)& Bi ); \ - Age1 = Be ^((~Bi)& Bo ); \ - Ami1 = Bi ^((~Bo)& Bu ); \ - Abo1 = Bo ^((~Bu)& Ba ); \ - Aku0 = Bu ^((~Ba)& Be ); \ - Be = ROL32((Asa0^Da1), 18); \ - Bi = ROL32((Age0^De1), 5); \ - Bo = ROL32((Ami0^Di0), 7); \ - Bu = ROL32((Abo0^Do1), 28); \ - Ba = ROL32((Aku1^Du0), 13); \ - Asa0 = Ba ^((~Be)& Bi ); \ - Age0 = Be ^((~Bi)& Bo ); \ - Ami0 = Bi ^((~Bo)& Bu ); \ - Abo0 = Bo ^((~Bu)& Ba ); \ - Aku1 = Bu ^((~Ba)& Be ); \ - Bo = ROL32((Aka0^Da1), 21); \ - Bu = ROL32((Ase0^De0), 1); \ - Ba = ROL32((Agi1^Di0), 31); \ - Be = ROL32((Amo0^Do1), 28); \ - Bi = ROL32((Abu0^Du1), 20); \ - Aka0 = Ba ^((~Be)& Bi ); \ - Ase0 = Be ^((~Bi)& Bo ); \ - Agi1 = Bi ^((~Bo)& Bu ); \ - Amo0 = Bo ^((~Bu)& Ba ); \ - Abu0 = Bu ^((~Ba)& Be ); \ - Bo = ROL32((Aka1^Da0), 20); \ - Bu = ROL32((Ase1^De1), 1); \ - Ba = ROL32((Agi0^Di1), 31); \ - Be = ROL32((Amo1^Do0), 27); \ - Bi = ROL32((Abu1^Du0), 19); \ - Aka1 = Ba ^((~Be)& Bi ); \ - Ase1 = Be ^((~Bi)& Bo ); \ - Agi0 = Bi ^((~Bo)& Bu ); \ - Amo1 = Bo ^((~Bu)& Ba ); \ - Abu1 = Bu ^((~Ba)& Be ); - -#define KeccakRound3() \ - Cx = Amu1^Agu0^Asu1^Aku0^Abu0; \ - Du1 = Ake0^Abe1^Ame1^Age0^Ase1; \ - Da0 = Cx^ROL32(Du1, 1); \ - Cz = Amu0^Agu1^Asu0^Aku1^Abu1; \ - Du0 = Ake1^Abe0^Ame0^Age1^Ase0; \ - Da1 = Cz^Du0; \ - Cw = Asi0^Aki0^Abi1^Ami1^Agi1; \ - Do0 = Cw^ROL32(Cz, 1); \ - Cy = Asi1^Aki1^Abi0^Ami0^Agi0; \ - Do1 = Cy^Cx; \ - Cx = Aba0^Ama0^Aga1^Asa1^Aka0; \ - De0 = Cx^ROL32(Cy, 1); \ - Cz = Aba1^Ama1^Aga0^Asa0^Aka1; \ - De1 = Cz^Cw; \ - Cy = Ago1^Aso0^Ako0^Abo0^Amo1; \ - Di0 = Du0^ROL32(Cy, 1); \ - Cw = Ago0^Aso1^Ako1^Abo1^Amo0; \ - Di1 = Du1^Cw; \ - Du0 = Cw^ROL32(Cz, 1); \ - Du1 = Cy^Cx; \ -\ - Ba = (Aba0^Da0); \ - Be = ROL32((Abe0^De0), 22); \ - Bi = ROL32((Abi0^Di1), 22); \ - Bo = ROL32((Abo0^Do1), 11); \ - Bu = ROL32((Abu0^Du0), 7); \ - 
Aba0 = Ba ^((~Be)& Bi ); \ - Aba0 ^= *(pRoundConstants++); \ - Abe0 = Be ^((~Bi)& Bo ); \ - Abi0 = Bi ^((~Bo)& Bu ); \ - Abo0 = Bo ^((~Bu)& Ba ); \ - Abu0 = Bu ^((~Ba)& Be ); \ - Ba = (Aba1^Da1); \ - Be = ROL32((Abe1^De1), 22); \ - Bi = ROL32((Abi1^Di0), 21); \ - Bo = ROL32((Abo1^Do0), 10); \ - Bu = ROL32((Abu1^Du1), 7); \ - Aba1 = Ba ^((~Be)& Bi ); \ - Aba1 ^= *(pRoundConstants++); \ - Abe1 = Be ^((~Bi)& Bo ); \ - Abi1 = Bi ^((~Bo)& Bu ); \ - Abo1 = Bo ^((~Bu)& Ba ); \ - Abu1 = Bu ^((~Ba)& Be ); \ - Bi = ROL32((Aga0^Da1), 2); \ - Bo = ROL32((Age0^De1), 23); \ - Bu = ROL32((Agi0^Di1), 31); \ - Ba = ROL32((Ago0^Do0), 14); \ - Be = ROL32((Agu0^Du0), 10); \ - Aga0 = Ba ^((~Be)& Bi ); \ - Age0 = Be ^((~Bi)& Bo ); \ - Agi0 = Bi ^((~Bo)& Bu ); \ - Ago0 = Bo ^((~Bu)& Ba ); \ - Agu0 = Bu ^((~Ba)& Be ); \ - Bi = ROL32((Aga1^Da0), 1); \ - Bo = ROL32((Age1^De0), 22); \ - Bu = ROL32((Agi1^Di0), 30); \ - Ba = ROL32((Ago1^Do1), 14); \ - Be = ROL32((Agu1^Du1), 10); \ - Aga1 = Ba ^((~Be)& Bi ); \ - Age1 = Be ^((~Bi)& Bo ); \ - Agi1 = Bi ^((~Bo)& Bu ); \ - Ago1 = Bo ^((~Bu)& Ba ); \ - Agu1 = Bu ^((~Ba)& Be ); \ - Bu = ROL32((Aka0^Da0), 9); \ - Ba = ROL32((Ake0^De1), 1); \ - Be = ROL32((Aki0^Di0), 3); \ - Bi = ROL32((Ako0^Do1), 13); \ - Bo = ROL32((Aku0^Du0), 4); \ - Aka0 = Ba ^((~Be)& Bi ); \ - Ake0 = Be ^((~Bi)& Bo ); \ - Aki0 = Bi ^((~Bo)& Bu ); \ - Ako0 = Bo ^((~Bu)& Ba ); \ - Aku0 = Bu ^((~Ba)& Be ); \ - Bu = ROL32((Aka1^Da1), 9); \ - Ba = (Ake1^De0); \ - Be = ROL32((Aki1^Di1), 3); \ - Bi = ROL32((Ako1^Do0), 12); \ - Bo = ROL32((Aku1^Du1), 4); \ - Aka1 = Ba ^((~Be)& Bi ); \ - Ake1 = Be ^((~Bi)& Bo ); \ - Aki1 = Bi ^((~Bo)& Bu ); \ - Ako1 = Bo ^((~Bu)& Ba ); \ - Aku1 = Bu ^((~Ba)& Be ); \ - Be = ROL32((Ama0^Da0), 18); \ - Bi = ROL32((Ame0^De0), 5); \ - Bo = ROL32((Ami0^Di1), 8); \ - Bu = ROL32((Amo0^Do0), 28); \ - Ba = ROL32((Amu0^Du1), 14); \ - Ama0 = Ba ^((~Be)& Bi ); \ - Ame0 = Be ^((~Bi)& Bo ); \ - Ami0 = Bi ^((~Bo)& Bu ); \ - Amo0 = Bo ^((~Bu)& Ba ); \ - Amu0 = Bu ^((~Ba)& Be ); \ - Be = ROL32((Ama1^Da1), 18); \ - Bi = ROL32((Ame1^De1), 5); \ - Bo = ROL32((Ami1^Di0), 7); \ - Bu = ROL32((Amo1^Do1), 28); \ - Ba = ROL32((Amu1^Du0), 13); \ - Ama1 = Ba ^((~Be)& Bi ); \ - Ame1 = Be ^((~Bi)& Bo ); \ - Ami1 = Bi ^((~Bo)& Bu ); \ - Amo1 = Bo ^((~Bu)& Ba ); \ - Amu1 = Bu ^((~Ba)& Be ); \ - Bo = ROL32((Asa0^Da1), 21); \ - Bu = ROL32((Ase0^De0), 1); \ - Ba = ROL32((Asi0^Di0), 31); \ - Be = ROL32((Aso0^Do1), 28); \ - Bi = ROL32((Asu0^Du1), 20); \ - Asa0 = Ba ^((~Be)& Bi ); \ - Ase0 = Be ^((~Bi)& Bo ); \ - Asi0 = Bi ^((~Bo)& Bu ); \ - Aso0 = Bo ^((~Bu)& Ba ); \ - Asu0 = Bu ^((~Ba)& Be ); \ - Bo = ROL32((Asa1^Da0), 20); \ - Bu = ROL32((Ase1^De1), 1); \ - Ba = ROL32((Asi1^Di1), 31); \ - Be = ROL32((Aso1^Do0), 27); \ - Bi = ROL32((Asu1^Du0), 19); \ - Asa1 = Ba ^((~Be)& Bi ); \ - Ase1 = Be ^((~Bi)& Bo ); \ - Asi1 = Bi ^((~Bo)& Bu ); \ - Aso1 = Bo ^((~Bu)& Ba ); \ - Asu1 = Bu ^((~Ba)& Be ); - -void KeccakP1600_Permute_Nrounds(void *state, unsigned int nRounds) -{ - uint32_t Da0, De0, Di0, Do0, Du0; - uint32_t Da1, De1, Di1, Do1, Du1; - uint32_t Ba, Be, Bi, Bo, Bu; - uint32_t Cx, Cy, Cz, Cw; - const uint32_t *pRoundConstants = KeccakF1600RoundConstants_int2+(24-nRounds)*2; - uint32_t *stateAsHalfLanes = (uint32_t*)state; - #define Aba0 stateAsHalfLanes[ 0] - #define Aba1 stateAsHalfLanes[ 1] - #define Abe0 stateAsHalfLanes[ 2] - #define Abe1 stateAsHalfLanes[ 3] - #define Abi0 stateAsHalfLanes[ 4] - #define Abi1 stateAsHalfLanes[ 5] - #define Abo0 stateAsHalfLanes[ 6] - #define Abo1 stateAsHalfLanes[ 7] - #define Abu0 
stateAsHalfLanes[ 8] - #define Abu1 stateAsHalfLanes[ 9] - #define Aga0 stateAsHalfLanes[10] - #define Aga1 stateAsHalfLanes[11] - #define Age0 stateAsHalfLanes[12] - #define Age1 stateAsHalfLanes[13] - #define Agi0 stateAsHalfLanes[14] - #define Agi1 stateAsHalfLanes[15] - #define Ago0 stateAsHalfLanes[16] - #define Ago1 stateAsHalfLanes[17] - #define Agu0 stateAsHalfLanes[18] - #define Agu1 stateAsHalfLanes[19] - #define Aka0 stateAsHalfLanes[20] - #define Aka1 stateAsHalfLanes[21] - #define Ake0 stateAsHalfLanes[22] - #define Ake1 stateAsHalfLanes[23] - #define Aki0 stateAsHalfLanes[24] - #define Aki1 stateAsHalfLanes[25] - #define Ako0 stateAsHalfLanes[26] - #define Ako1 stateAsHalfLanes[27] - #define Aku0 stateAsHalfLanes[28] - #define Aku1 stateAsHalfLanes[29] - #define Ama0 stateAsHalfLanes[30] - #define Ama1 stateAsHalfLanes[31] - #define Ame0 stateAsHalfLanes[32] - #define Ame1 stateAsHalfLanes[33] - #define Ami0 stateAsHalfLanes[34] - #define Ami1 stateAsHalfLanes[35] - #define Amo0 stateAsHalfLanes[36] - #define Amo1 stateAsHalfLanes[37] - #define Amu0 stateAsHalfLanes[38] - #define Amu1 stateAsHalfLanes[39] - #define Asa0 stateAsHalfLanes[40] - #define Asa1 stateAsHalfLanes[41] - #define Ase0 stateAsHalfLanes[42] - #define Ase1 stateAsHalfLanes[43] - #define Asi0 stateAsHalfLanes[44] - #define Asi1 stateAsHalfLanes[45] - #define Aso0 stateAsHalfLanes[46] - #define Aso1 stateAsHalfLanes[47] - #define Asu0 stateAsHalfLanes[48] - #define Asu1 stateAsHalfLanes[49] - - nRounds &= 3; - switch ( nRounds ) - { - #define I0 Ba - #define I1 Be - #define T0 Bi - #define T1 Bo - #define SwapPI13( in0,in1,in2,in3,eo0,eo1,eo2,eo3 ) \ - I0 = (in0)[0]; I1 = (in0)[1]; \ - T0 = (in1)[0]; T1 = (in1)[1]; \ - (in0)[eo0] = T0; (in0)[eo0^1] = T1; \ - T0 = (in2)[0]; T1 = (in2)[1]; \ - (in1)[eo1] = T0; (in1)[eo1^1] = T1; \ - T0 = (in3)[0]; T1 = (in3)[1]; \ - (in2)[eo2] = T0; (in2)[eo2^1] = T1; \ - (in3)[eo3] = I0; (in3)[eo3^1] = I1 - #define SwapPI2( in0,in1,in2,in3 ) \ - I0 = (in0)[0]; I1 = (in0)[1]; \ - T0 = (in1)[0]; T1 = (in1)[1]; \ - (in0)[1] = T0; (in0)[0] = T1; \ - (in1)[1] = I0; (in1)[0] = I1; \ - I0 = (in2)[0]; I1 = (in2)[1]; \ - T0 = (in3)[0]; T1 = (in3)[1]; \ - (in2)[1] = T0; (in2)[0] = T1; \ - (in3)[1] = I0; (in3)[0] = I1 - #define SwapEO( even,odd ) T0 = even; even = odd; odd = T0 - - case 1: - SwapPI13( &Aga0, &Aka0, &Asa0, &Ama0, 1, 0, 1, 0 ); - SwapPI13( &Abe0, &Age0, &Ame0, &Ake0, 0, 1, 0, 1 ); - SwapPI13( &Abi0, &Aki0, &Agi0, &Asi0, 1, 0, 1, 0 ); - SwapEO( Ami0, Ami1 ); - SwapPI13( &Abo0, &Amo0, &Aso0, &Ago0, 1, 0, 1, 0 ); - SwapEO( Ako0, Ako1 ); - SwapPI13( &Abu0, &Asu0, &Aku0, &Amu0, 0, 1, 0, 1 ); - break; - - case 2: - SwapPI2( &Aga0, &Asa0, &Aka0, &Ama0 ); - SwapPI2( &Abe0, &Ame0, &Age0, &Ake0 ); - SwapPI2( &Abi0, &Agi0, &Aki0, &Asi0 ); - SwapPI2( &Abo0, &Aso0, &Ago0, &Amo0 ); - SwapPI2( &Abu0, &Aku0, &Amu0, &Asu0 ); - break; - - case 3: - SwapPI13( &Aga0, &Ama0, &Asa0, &Aka0, 0, 1, 0, 1 ); - SwapPI13( &Abe0, &Ake0, &Ame0, &Age0, 1, 0, 1, 0 ); - SwapPI13( &Abi0, &Asi0, &Agi0, &Aki0, 0, 1, 0, 1 ); - SwapEO( Ami0, Ami1 ); - SwapPI13( &Abo0, &Ago0, &Aso0, &Amo0, 0, 1, 0, 1 ); - SwapEO( Ako0, Ako1 ); - SwapPI13( &Abu0, &Amu0, &Aku0, &Asu0, 1, 0, 1, 0 ); - break; - #undef I0 - #undef I1 - #undef T0 - #undef T1 - #undef SwapPI13 - #undef SwapPI2 - #undef SwapEO - } - - do - { - /* Code for 4 rounds, using factor 2 interleaving, 64-bit lanes mapped to 32-bit words */ - switch ( nRounds ) - { - case 0: KeccakRound0(); /* fall through */ - case 3: KeccakRound1(); - case 2: KeccakRound2(); 
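            /* The switch falls through so that the first pass through this
               unrolled body executes only the last (nRounds mod 4) of the four
               round macros; pRoundConstants was started at index (24-nRounds)*2,
               and the SwapPI13/SwapPI2 fix-ups above pre-permute the state so
               the lane names line up with the chosen entry point. Subsequent
               passes run all four rounds until the 0xFF sentinel constant is
               reached. */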
- case 1: KeccakRound3(); - } - nRounds = 0; - } - while ( *pRoundConstants != 0xFF ); - - #undef Aba0 - #undef Aba1 - #undef Abe0 - #undef Abe1 - #undef Abi0 - #undef Abi1 - #undef Abo0 - #undef Abo1 - #undef Abu0 - #undef Abu1 - #undef Aga0 - #undef Aga1 - #undef Age0 - #undef Age1 - #undef Agi0 - #undef Agi1 - #undef Ago0 - #undef Ago1 - #undef Agu0 - #undef Agu1 - #undef Aka0 - #undef Aka1 - #undef Ake0 - #undef Ake1 - #undef Aki0 - #undef Aki1 - #undef Ako0 - #undef Ako1 - #undef Aku0 - #undef Aku1 - #undef Ama0 - #undef Ama1 - #undef Ame0 - #undef Ame1 - #undef Ami0 - #undef Ami1 - #undef Amo0 - #undef Amo1 - #undef Amu0 - #undef Amu1 - #undef Asa0 - #undef Asa1 - #undef Ase0 - #undef Ase1 - #undef Asi0 - #undef Asi1 - #undef Aso0 - #undef Aso1 - #undef Asu0 - #undef Asu1 -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_Permute_12rounds(void *state) -{ - KeccakP1600_Permute_Nrounds(state, 12); -} diff --git a/ffi-deps/K12/lib/KangarooTwelve.c b/ffi-deps/K12/lib/KangarooTwelve.c deleted file mode 100644 index ad184b1..0000000 --- a/ffi-deps/K12/lib/KangarooTwelve.c +++ /dev/null @@ -1,333 +0,0 @@ -/* -K12 based on the eXtended Keccak Code Package (XKCP) -https://github.com/XKCP/XKCP - -KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier. - -Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to the Keccak Team website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -#include -#include -#include "KangarooTwelve.h" -#include "KeccakP-1600-SnP.h" - -/* ---------------------------------------------------------------- */ - -#define K12_security 128 -#define K12_capacity (2*K12_security) -#define K12_capacityInBytes (K12_capacity/8) -#define K12_rate (1600-K12_capacity) -#define K12_rateInBytes (K12_rate/8) -#define K12_rateInLanes (K12_rate/64) - -static void TurboSHAKE128_Initialize(TurboSHAKE128_Instance *instance) -{ - KeccakP1600_Initialize(instance->state); - instance->byteIOIndex = 0; - instance->squeezing = 0; -} - -static void TurboSHAKE128_Absorb(TurboSHAKE128_Instance *instance, const unsigned char *data, size_t dataByteLen) -{ - size_t i, j; - uint8_t partialBlock; - const unsigned char *curData; - const uint8_t rateInBytes = K12_rateInBytes; - - assert(instance->squeezing == 0); - - i = 0; - curData = data; - while(i < dataByteLen) { - if ((instance->byteIOIndex == 0) && (dataByteLen-i >= rateInBytes)) { -#ifdef KeccakP1600_12rounds_FastLoop_supported - /* processing full blocks first */ - j = KeccakP1600_12rounds_FastLoop_Absorb(instance->state, K12_rateInLanes, curData, dataByteLen - i); - i += j; - curData += j; -#endif - for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) { - KeccakP1600_AddBytes(instance->state, curData, 0, rateInBytes); - KeccakP1600_Permute_12rounds(instance->state); - curData+=rateInBytes; - } - i = dataByteLen - j; - } else { - /* normal lane: using the message queue */ - if (dataByteLen - i > (size_t)rateInBytes - instance->byteIOIndex) { - partialBlock = rateInBytes-instance->byteIOIndex; - } else { - partialBlock = (uint8_t)(dataByteLen - i); - } - i += partialBlock; - - KeccakP1600_AddBytes(instance->state, curData, instance->byteIOIndex, partialBlock); 
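            /* Partial input (less than a full 168-byte rate block) is XORed into
               the sponge at byteIOIndex; the 12-round permutation below runs only
               once a complete block has accumulated. */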
- curData += partialBlock; - instance->byteIOIndex += partialBlock; - if (instance->byteIOIndex == rateInBytes) { - KeccakP1600_Permute_12rounds(instance->state); - instance->byteIOIndex = 0; - } - } - } -} - -static void TurboSHAKE128_AbsorbDomainSeparationByte(TurboSHAKE128_Instance *instance, unsigned char D) -{ - const unsigned int rateInBytes = K12_rateInBytes; - - assert(D != 0); - assert(instance->squeezing == 0); - - /* Last few bits, whose delimiter coincides with first bit of padding */ - KeccakP1600_AddByte(instance->state, D, instance->byteIOIndex); - /* If the first bit of padding is at position rate-1, we need a whole new block for the second bit of padding */ - if ((D >= 0x80) && (instance->byteIOIndex == (rateInBytes-1))) - KeccakP1600_Permute_12rounds(instance->state); - /* Second bit of padding */ - KeccakP1600_AddByte(instance->state, 0x80, rateInBytes-1); - KeccakP1600_Permute_12rounds(instance->state); - instance->byteIOIndex = 0; - instance->squeezing = 1; -} - -static void TurboSHAKE128_Squeeze(TurboSHAKE128_Instance *instance, unsigned char *data, size_t dataByteLen) -{ - size_t i, j; - unsigned int partialBlock; - const unsigned int rateInBytes = K12_rateInBytes; - unsigned char *curData; - - if (!instance->squeezing) - TurboSHAKE128_AbsorbDomainSeparationByte(instance, 0x01); - - i = 0; - curData = data; - while(i < dataByteLen) { - if ((instance->byteIOIndex == rateInBytes) && (dataByteLen-i >= rateInBytes)) { - for(j=dataByteLen-i; j>=rateInBytes; j-=rateInBytes) { - KeccakP1600_Permute_12rounds(instance->state); - KeccakP1600_ExtractBytes(instance->state, curData, 0, rateInBytes); - curData+=rateInBytes; - } - i = dataByteLen - j; - } else { - /* normal lane: using the message queue */ - if (instance->byteIOIndex == rateInBytes) { - KeccakP1600_Permute_12rounds(instance->state); - instance->byteIOIndex = 0; - } - if (dataByteLen-i > rateInBytes-instance->byteIOIndex) - partialBlock = rateInBytes-instance->byteIOIndex; - else - partialBlock = (unsigned int)(dataByteLen - i); - i += partialBlock; - - KeccakP1600_ExtractBytes(instance->state, curData, instance->byteIOIndex, partialBlock); - curData += partialBlock; - instance->byteIOIndex += partialBlock; - } - } -} - -/* ---------------------------------------------------------------- */ - -typedef enum { - NOT_INITIALIZED, - ABSORBING, - FINAL, - SQUEEZING -} KCP_Phases; -typedef KCP_Phases KangarooTwelve_Phases; - -#define K12_chunkSize 8192 -#define K12_suffixLeaf 0x0B /* '110': message hop, simple padding, inner node */ - -#ifndef KeccakP1600_disableParallelism - -void KangarooTwelve_Process2Leaves(const unsigned char *input, unsigned char *output); -void KangarooTwelve_Process4Leaves(const unsigned char *input, unsigned char *output); -void KangarooTwelve_Process8Leaves(const unsigned char *input, unsigned char *output); - -#define ProcessLeaves( Parallellism ) \ - while (inputByteLen >= Parallellism * K12_chunkSize) { \ - unsigned char intermediate[Parallellism*K12_capacityInBytes]; \ - \ - KangarooTwelve_Process##Parallellism##Leaves(input, intermediate); \ - input += Parallellism * K12_chunkSize; \ - inputByteLen -= Parallellism * K12_chunkSize; \ - ktInstance->blockNumber += Parallellism; \ - TurboSHAKE128_Absorb(&ktInstance->finalNode, intermediate, Parallellism * K12_capacityInBytes); \ - } - -#endif // KeccakP1600_disableParallelism - -static unsigned int right_encode(unsigned char * encbuf, size_t value) -{ - unsigned int n, i; - size_t v; - - for (v = value, n = 0; v && (n < sizeof(size_t)); ++n, v 
>>= 8) - ; /* empty */ - for (i = 1; i <= n; ++i) { - encbuf[i-1] = (unsigned char)(value >> (8 * (n-i))); - } - encbuf[n] = (unsigned char)n; - return n + 1; -} - -int KangarooTwelve_Initialize(KangarooTwelve_Instance *ktInstance, size_t outputByteLen) -{ - ktInstance->fixedOutputLength = outputByteLen; - ktInstance->queueAbsorbedLen = 0; - ktInstance->blockNumber = 0; - ktInstance->phase = ABSORBING; - TurboSHAKE128_Initialize(&ktInstance->finalNode); - return 0; -} - -int KangarooTwelve_Update(KangarooTwelve_Instance *ktInstance, const unsigned char *input, size_t inputByteLen) -{ - if (ktInstance->phase != ABSORBING) - return 1; - - if (ktInstance->blockNumber == 0) { - /* First block, absorb in final node */ - unsigned int len = (inputByteLen < (K12_chunkSize - ktInstance->queueAbsorbedLen)) ? (unsigned int)inputByteLen : (K12_chunkSize - ktInstance->queueAbsorbedLen); - TurboSHAKE128_Absorb(&ktInstance->finalNode, input, len); - input += len; - inputByteLen -= len; - ktInstance->queueAbsorbedLen += len; - if ((ktInstance->queueAbsorbedLen == K12_chunkSize) && (inputByteLen != 0)) { - /* First block complete and more input data available, finalize it */ - const unsigned char padding = 0x03; /* '110^6': message hop, simple padding */ - ktInstance->queueAbsorbedLen = 0; - ktInstance->blockNumber = 1; - TurboSHAKE128_Absorb(&ktInstance->finalNode, &padding, 1); - ktInstance->finalNode.byteIOIndex = (ktInstance->finalNode.byteIOIndex + 7) & ~7; /* Zero padding up to 64 bits */ - } - } else if (ktInstance->queueAbsorbedLen != 0) { - /* There is data in the queue, absorb further in queue until block complete */ - unsigned int len = (inputByteLen < (K12_chunkSize - ktInstance->queueAbsorbedLen)) ? (unsigned int)inputByteLen : (K12_chunkSize - ktInstance->queueAbsorbedLen); - TurboSHAKE128_Absorb(&ktInstance->queueNode, input, len); - input += len; - inputByteLen -= len; - ktInstance->queueAbsorbedLen += len; - if (ktInstance->queueAbsorbedLen == K12_chunkSize) { - unsigned char intermediate[K12_capacityInBytes]; - ktInstance->queueAbsorbedLen = 0; - ++ktInstance->blockNumber; - TurboSHAKE128_AbsorbDomainSeparationByte(&ktInstance->queueNode, K12_suffixLeaf); - TurboSHAKE128_Squeeze(&ktInstance->queueNode, intermediate, K12_capacityInBytes); - TurboSHAKE128_Absorb(&ktInstance->finalNode, intermediate, K12_capacityInBytes); - } - } - -#ifndef KeccakP1600_disableParallelism - if (KeccakP1600times8_IsAvailable()) { - ProcessLeaves(8); - } - - if (KeccakP1600times4_IsAvailable()) { - ProcessLeaves(4); - } - - if (KeccakP1600times2_IsAvailable()) { - ProcessLeaves(2); - } -#endif - - while (inputByteLen > 0) { - unsigned int len = (inputByteLen < K12_chunkSize) ? 
(unsigned int)inputByteLen : K12_chunkSize; - TurboSHAKE128_Initialize(&ktInstance->queueNode); - TurboSHAKE128_Absorb(&ktInstance->queueNode, input, len); - input += len; - inputByteLen -= len; - if (len == K12_chunkSize) { - unsigned char intermediate[K12_capacityInBytes]; - ++ktInstance->blockNumber; - TurboSHAKE128_AbsorbDomainSeparationByte(&ktInstance->queueNode, K12_suffixLeaf); - TurboSHAKE128_Squeeze(&ktInstance->queueNode, intermediate, K12_capacityInBytes); - TurboSHAKE128_Absorb(&ktInstance->finalNode, intermediate, K12_capacityInBytes); - } else { - ktInstance->queueAbsorbedLen = len; - } - } - - return 0; -} - -int KangarooTwelve_Final(KangarooTwelve_Instance *ktInstance, unsigned char *output, const unsigned char *customization, size_t customByteLen) -{ - unsigned char encbuf[sizeof(size_t)+1+2]; - unsigned char padding; - - if (ktInstance->phase != ABSORBING) - return 1; - - /* Absorb customization | right_encode(customByteLen) */ - if ((customByteLen != 0) && (KangarooTwelve_Update(ktInstance, customization, customByteLen) != 0)) - return 1; - if (KangarooTwelve_Update(ktInstance, encbuf, right_encode(encbuf, customByteLen)) != 0) - return 1; - - if (ktInstance->blockNumber == 0) { - /* Non complete first block in final node, pad it */ - padding = 0x07; /* '11': message hop, final node */ - } else { - unsigned int n; - - if (ktInstance->queueAbsorbedLen != 0) { - /* There is data in the queue node */ - unsigned char intermediate[K12_capacityInBytes]; - ++ktInstance->blockNumber; - TurboSHAKE128_AbsorbDomainSeparationByte(&ktInstance->queueNode, K12_suffixLeaf); - TurboSHAKE128_Squeeze(&ktInstance->queueNode, intermediate, K12_capacityInBytes); - TurboSHAKE128_Absorb(&ktInstance->finalNode, intermediate, K12_capacityInBytes); - } - --ktInstance->blockNumber; /* Absorb right_encode(number of Chaining Values) || 0xFF || 0xFF */ - n = right_encode(encbuf, ktInstance->blockNumber); - encbuf[n++] = 0xFF; - encbuf[n++] = 0xFF; - TurboSHAKE128_Absorb(&ktInstance->finalNode, encbuf, n); - padding = 0x06; /* '01': chaining hop, final node */ - } - TurboSHAKE128_AbsorbDomainSeparationByte(&ktInstance->finalNode, padding); - if (ktInstance->fixedOutputLength != 0) { - ktInstance->phase = FINAL; - TurboSHAKE128_Squeeze(&ktInstance->finalNode, output, ktInstance->fixedOutputLength); - return 0; - } - ktInstance->phase = SQUEEZING; - return 0; -} - -int KangarooTwelve_Squeeze(KangarooTwelve_Instance *ktInstance, unsigned char *output, size_t outputByteLen) -{ - if (ktInstance->phase != SQUEEZING) - return 1; - TurboSHAKE128_Squeeze(&ktInstance->finalNode, output, outputByteLen); - return 0; -} - -int KangarooTwelve(const unsigned char *input, size_t inputByteLen, - unsigned char *output, size_t outputByteLen, - const unsigned char *customization, size_t customByteLen) -{ - KangarooTwelve_Instance ktInstance; - - if (outputByteLen == 0) - return 1; - KangarooTwelve_Initialize(&ktInstance, outputByteLen); - if (KangarooTwelve_Update(&ktInstance, input, inputByteLen) != 0) - return 1; - return KangarooTwelve_Final(&ktInstance, output, customization, customByteLen); -} diff --git a/ffi-deps/K12/lib/KangarooTwelve.h b/ffi-deps/K12/lib/KangarooTwelve.h deleted file mode 100644 index f7b3e33..0000000 --- a/ffi-deps/K12/lib/KangarooTwelve.h +++ /dev/null @@ -1,134 +0,0 @@ -/* -K12 based on the eXtended Keccak Code Package (XKCP) -https://github.com/XKCP/XKCP - -KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier. 
- -Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to the Keccak Team website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -#ifndef _KangarooTwelve_h_ -#define _KangarooTwelve_h_ - -#include -#include -#include "align.h" -#include "KeccakP-1600-SnP.h" - -typedef struct TurboSHAKE128_InstanceStruct { - uint8_t state[KeccakP1600_stateSizeInBytes]; - uint8_t byteIOIndex; - uint8_t squeezing; -} TurboSHAKE128_Instance; - -typedef struct KangarooTwelve_InstanceStruct { - ALIGN(KeccakP1600_stateAlignment) TurboSHAKE128_Instance queueNode; - ALIGN(KeccakP1600_stateAlignment) TurboSHAKE128_Instance finalNode; - size_t fixedOutputLength; - size_t blockNumber; - unsigned int queueAbsorbedLen; - int phase; -} KangarooTwelve_Instance; - -/** Extendable ouput function KangarooTwelve. - * @param input Pointer to the input message (M). - * @param inputByteLen The length of the input message in bytes. - * @param output Pointer to the output buffer. - * @param outputByteLen The desired number of output bytes. - * @param customization Pointer to the customization string (C). - * @param customByteLen The length of the customization string in bytes. - * @return 0 if successful, 1 otherwise. - */ -int KangarooTwelve(const unsigned char *input, size_t inputByteLen, unsigned char *output, size_t outputByteLen, const unsigned char *customization, size_t customByteLen); - -/** - * Function to initialize a KangarooTwelve instance. - * @param ktInstance Pointer to the instance to be initialized. - * @param outputByteLen The desired number of output bytes, - * or 0 for an arbitrarily-long output. - * @return 0 if successful, 1 otherwise. - */ -int KangarooTwelve_Initialize(KangarooTwelve_Instance *ktInstance, size_t outputByteLen); - -/** - * Function to give input data to be absorbed. - * @param ktInstance Pointer to the instance initialized by KangarooTwelve_Initialize(). - * @param input Pointer to the input message data (M). - * @param inputByteLen The number of bytes provided in the input message data. - * @return 0 if successful, 1 otherwise. - */ -int KangarooTwelve_Update(KangarooTwelve_Instance *ktInstance, const unsigned char *input, size_t inputByteLen); - -/** - * Function to call after all the input message has been input, and to get - * output bytes if the length was specified when calling KangarooTwelve_Initialize(). - * @param ktInstance Pointer to the hash instance initialized by KangarooTwelve_Initialize(). - * If @a outputByteLen was not 0 in the call to KangarooTwelve_Initialize(), the number of - * output bytes is equal to @a outputByteLen. - * If @a outputByteLen was 0 in the call to KangarooTwelve_Initialize(), the output bytes - * must be extracted using the KangarooTwelve_Squeeze() function. - * @param output Pointer to the buffer where to store the output data. - * @param customization Pointer to the customization string (C). - * @param customByteLen The length of the customization string in bytes. - * @return 0 if successful, 1 otherwise. - */ -int KangarooTwelve_Final(KangarooTwelve_Instance *ktInstance, unsigned char *output, const unsigned char *customization, size_t customByteLen); - -/** - * Function to squeeze output data. 
- * @param ktInstance Pointer to the hash instance initialized by KangarooTwelve_Initialize(). - * @param data Pointer to the buffer where to store the output data. - * @param outputByteLen The number of output bytes desired. - * @pre KangarooTwelve_Final() must have been already called. - * @return 0 if successful, 1 otherwise. - */ -int KangarooTwelve_Squeeze(KangarooTwelve_Instance *ktInstance, unsigned char *output, size_t outputByteLen); - -#if !defined(KeccakP1600_disableParallelism) && defined(KeccakP1600_enable_simd_options) -/** - * Functions to selectively disable the use of CPU features. Should be rarely - * needed; if you're not sure this is what you want, don't worry about it. - * - * /!\ WARNING /!\: Calling these functions REQUIRES that there are no - * KangarooTwelve instances in use. The effects are global and affect the code - * paths taken by every call, as well as the details of the represented states. - * Calling these functions in the middle of your program (as opposed to during - * setup) is PROBABLY WRONG. - * - * These functions are at present only used to increase test suite coverage, - * and demonstrate comparative performance between implementations in different - * instruction sets. To enable them, the macro KeccakP1600_enable_simd_options - * must be defined at compile time. - * - * They can potentially also be useful in an environment where it is - * detrimental to online large vector units on the CPU, since doing so can lead - * to downclocking, performance hits in other threads sharing the same CPU - * core, and short delays while the CPU's power license is increased to online - * the vector unit. - * - * In the majority of situations, however, this should rarely matter and it is - * usually the case that the performance difference will be a wash or even an - * overall improvement despite the downsides. - * - * @return 1 if the feature was enabled and available and has been turned off, - * 0 if it was already disabled or unavailable. - */ -int KangarooTwelve_DisableAVX512(void); -int KangarooTwelve_DisableAVX2(void); -int KangarooTwelve_DisableSSSE3(void); - -/** - * Function to reset all CPU features to enabled-if-available. Calling this - * always has no effect if no CPU features have been explicitly disabled. - */ -void KangarooTwelve_EnableAllCpuFeatures(void); -#endif // !KeccakP1600_disableParallelism && KeccakP1600_enable_simd_options - -#endif diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s deleted file mode 100644 index d7ae46b..0000000 --- a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX2.s +++ /dev/null @@ -1,664 +0,0 @@ -# Copyright (c) 2006-2017, CRYPTOGAMS by -# Copyright (c) 2017 Ronny Van Keer -# All rights reserved. -# -# The source code in this file is licensed under the CRYPTOGAMS license. -# For further details see http://www.openssl.org/~appro/cryptogams/. -# -# Notes: -# The code for the permutation (__KeccakF1600) was generated with -# Andy Polyakov's keccak1600-avx2.pl from the CRYPTOGAMS project -# (https://github.com/dot-asm/cryptogams/blob/master/x86_64/keccak1600-avx2.pl). -# The rest of the code was written by Ronny Van Keer. -# Adaptations for macOS by Stéphane Léon. 
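[Reviewer note] The header text just above documents the C library's runtime SIMD selection: KeccakP1600times{2,4,8}_IsAvailable() picks the widest usable parallel permutation, and the optional Disable*/EnableAllCpuFeatures switches exist mainly for testing. On the Rust side the same concern is normally covered by std's runtime feature detection. The sketch below only illustrates that pattern; permute_portable, permute_avx2 and permute_12rounds are placeholder names, not functions introduced by this PR.

```rust
/// Portable fallback; a real implementation would perform the 12-round
/// Keccak-p[1600] permutation here.
fn permute_portable(_state: &mut [u64; 25]) { /* ... */ }

/// AVX2 path; compiled for x86_64 only and gated on runtime detection below.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn permute_avx2(state: &mut [u64; 25]) {
    // Placeholder body: a real path would use core::arch::x86_64 intrinsics.
    permute_portable(state)
}

/// Pick the best available implementation at run time, falling back to the
/// portable path, mirroring the IsAvailable() dispatch in the C code.
pub fn permute_12rounds(state: &mut [u64; 25]) {
    #[cfg(target_arch = "x86_64")]
    {
        if is_x86_feature_detected!("avx2") {
            return unsafe { permute_avx2(state) };
        }
    }
    permute_portable(state)
}
```

A real integration would likely detect once (for example via std::sync::OnceLock) instead of probing on every call, and would add further branches for AVX-512 or the x2/x4/x8 parallel leaf processing the C code offers.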
- -.text - -# ----------------------------------------------------------------------------- -# -# void KeccakP1600_AVX2_Initialize(void *state); -# -.ifdef macOS -.globl _KeccakP1600_AVX2_Initialize -_KeccakP1600_AVX2_Initialize: -.else -.globl KeccakP1600_AVX2_Initialize -.type KeccakP1600_AVX2_Initialize,@function -KeccakP1600_AVX2_Initialize: -.endif -.balign 32 - vpxor %ymm0,%ymm0,%ymm0 - vmovdqu %ymm0,0*32(%rdi) - vmovdqu %ymm0,1*32(%rdi) - vmovdqu %ymm0,2*32(%rdi) - vmovdqu %ymm0,3*32(%rdi) - vmovdqu %ymm0,4*32(%rdi) - vmovdqu %ymm0,5*32(%rdi) - movq $0,6*32(%rdi) - ret -.ifdef macOS -.else -.size KeccakP1600_AVX2_Initialize,.-KeccakP1600_AVX2_Initialize -.endif - -# ----------------------------------------------------------------------------- -# -# void KeccakP1600_AVX2_AddByte(void *state, unsigned char data, unsigned int offset); -# %rdi %rsi %rdx -# -.ifdef macOS -.globl _KeccakP1600_AVX2_AddByte -_KeccakP1600_AVX2_AddByte: -.else -.globl KeccakP1600_AVX2_AddByte -.type KeccakP1600_AVX2_AddByte,@function -KeccakP1600_AVX2_AddByte: -.endif -.balign 32 - mov %rdx, %rax - and $7, %rax - and $0xFFFFFFF8, %edx - lea mapState(%rip), %r9 - mov (%r9, %rdx), %rdx - add %rdx, %rdi - add %rax, %rdi - xorb %sil, (%rdi) - ret -.ifdef macOS -.else -.size KeccakP1600_AVX2_AddByte,.-KeccakP1600_AVX2_AddByte -.endif - -# ----------------------------------------------------------------------------- -# -# void KeccakP1600_AVX2_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); -# %rdi %rsi %rdx %rcx -# -.ifdef macOS -.globl _KeccakP1600_AVX2_AddBytes -_KeccakP1600_AVX2_AddBytes: -.else -.globl KeccakP1600_AVX2_AddBytes -.type KeccakP1600_AVX2_AddBytes,@function -KeccakP1600_AVX2_AddBytes: -.endif -.balign 32 - cmp $0, %rcx - jz KeccakP1600_AVX2_AddBytes_Exit - mov %rdx, %rax # rax offset in lane - and $0xFFFFFFF8, %edx # rdx pointer into state index mapper - lea mapState(%rip), %r9 - add %r9, %rdx - and $7, %rax - jz KeccakP1600_AVX2_AddBytes_LaneAlignedCheck - mov $8, %r9 # r9 is (max) length of incomplete lane - sub %rax, %r9 - cmp %rcx, %r9 - cmovae %rcx, %r9 - sub %r9, %rcx # length -= length of incomplete lane - add (%rdx), %rax # rax = pointer to state lane - add $8, %rdx - add %rdi, %rax -KeccakP1600_AVX2_AddBytes_NotAlignedLoop: - mov (%rsi), %r8b - inc %rsi - xorb %r8b, (%rax) - inc %rax - dec %r9 - jnz KeccakP1600_AVX2_AddBytes_NotAlignedLoop - jmp KeccakP1600_AVX2_AddBytes_LaneAlignedCheck -KeccakP1600_AVX2_AddBytes_LaneAlignedLoop: - mov (%rsi), %r8 - add $8, %rsi - mov (%rdx), %rax - add $8, %rdx - add %rdi, %rax - xor %r8, (%rax) -KeccakP1600_AVX2_AddBytes_LaneAlignedCheck: - sub $8, %rcx - jnc KeccakP1600_AVX2_AddBytes_LaneAlignedLoop -KeccakP1600_AVX2_AddBytes_LastIncompleteLane: - add $8, %rcx - jz KeccakP1600_AVX2_AddBytes_Exit - mov (%rdx), %rax - add %rdi, %rax -KeccakP1600_AVX2_AddBytes_LastIncompleteLaneLoop: - mov (%rsi), %r8b - inc %rsi - xor %r8b, (%rax) - inc %rax - dec %rcx - jnz KeccakP1600_AVX2_AddBytes_LastIncompleteLaneLoop -KeccakP1600_AVX2_AddBytes_Exit: - ret -.ifdef macOS -.else -.size KeccakP1600_AVX2_AddBytes,.-KeccakP1600_AVX2_AddBytes -.endif - -# ----------------------------------------------------------------------------- -# -# void KeccakP1600_AVX2_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); -# %rdi %rsi %rdx %rcx -# -.ifdef macOS -.globl _KeccakP1600_AVX2_ExtractBytes -_KeccakP1600_AVX2_ExtractBytes: -.else -.globl KeccakP1600_AVX2_ExtractBytes -.type 
KeccakP1600_AVX2_ExtractBytes,@function -KeccakP1600_AVX2_ExtractBytes: -.endif -.balign 32 - push %rbx - cmp $0, %rcx - jz KeccakP1600_AVX2_ExtractBytes_Exit - mov %rdx, %rax # rax offset in lane - and $0xFFFFFFF8, %edx # rdx pointer into state index mapper - lea mapState(%rip), %r9 - add %r9, %rdx - and $7, %rax - jz KeccakP1600_AVX2_ExtractBytes_LaneAlignedCheck - mov $8, %rbx # rbx is (max) length of incomplete lane - sub %rax, %rbx - cmp %rcx, %rbx - cmovae %rcx, %rbx - sub %rbx, %rcx # length -= length of incomplete lane - mov (%rdx), %r9 - add $8, %rdx - add %rdi, %r9 - add %rax, %r9 -KeccakP1600_AVX2_ExtractBytes_NotAlignedLoop: - mov (%r9), %r8b - inc %r9 - mov %r8b, (%rsi) - inc %rsi - dec %rbx - jnz KeccakP1600_AVX2_ExtractBytes_NotAlignedLoop - jmp KeccakP1600_AVX2_ExtractBytes_LaneAlignedCheck -KeccakP1600_AVX2_ExtractBytes_LaneAlignedLoop: - mov (%rdx), %rax - add $8, %rdx - add %rdi, %rax - mov (%rax), %r8 - mov %r8, (%rsi) - add $8, %rsi -KeccakP1600_AVX2_ExtractBytes_LaneAlignedCheck: - sub $8, %rcx - jnc KeccakP1600_AVX2_ExtractBytes_LaneAlignedLoop -KeccakP1600_AVX2_ExtractBytes_LastIncompleteLane: - add $8, %rcx - jz KeccakP1600_AVX2_ExtractBytes_Exit - mov (%rdx), %rax - add %rdi, %rax - mov (%rax), %r8 -KeccakP1600_AVX2_ExtractBytes_LastIncompleteLaneLoop: - mov %r8b, (%rsi) - shr $8, %r8 - inc %rsi - dec %rcx - jnz KeccakP1600_AVX2_ExtractBytes_LastIncompleteLaneLoop -KeccakP1600_AVX2_ExtractBytes_Exit: - pop %rbx - ret -.ifdef macOS -.else -.size KeccakP1600_AVX2_ExtractBytes,.-KeccakP1600_AVX2_ExtractBytes -.endif - -# ----------------------------------------------------------------------------- -# -# internal -# -.ifdef macOS -.else -.type __KeccakF1600,@function -.endif -.balign 32 -__KeccakF1600: -.Loop_avx2: - ######################################### Theta - vpshufd $0b01001110,%ymm2,%ymm13 - vpxor %ymm3,%ymm5,%ymm12 - vpxor %ymm6,%ymm4,%ymm9 - vpxor %ymm1,%ymm12,%ymm12 - vpxor %ymm9,%ymm12,%ymm12 # C[1..4] - - vpermq $0b10010011,%ymm12,%ymm11 - vpxor %ymm2,%ymm13,%ymm13 - vpermq $0b01001110,%ymm13,%ymm7 - - vpsrlq $63,%ymm12,%ymm8 - vpaddq %ymm12,%ymm12,%ymm9 - vpor %ymm9,%ymm8,%ymm8 # ROL64(C[1..4],1) - - vpermq $0b00111001,%ymm8,%ymm15 - vpxor %ymm11,%ymm8,%ymm14 - vpermq $0b00000000,%ymm14,%ymm14 # D[0..0] = ROL64(C[1],1) ^ C[4] - - vpxor %ymm0,%ymm13,%ymm13 - vpxor %ymm7,%ymm13,%ymm13 # C[0..0] - - vpsrlq $63,%ymm13,%ymm7 - vpaddq %ymm13,%ymm13,%ymm8 - vpor %ymm7,%ymm8,%ymm8 # ROL64(C[0..0],1) - - vpxor %ymm14,%ymm2,%ymm2 # ^= D[0..0] - vpxor %ymm14,%ymm0,%ymm0 # ^= D[0..0] - - vpblendd $0b11000000,%ymm8,%ymm15,%ymm15 - vpblendd $0b00000011,%ymm13,%ymm11,%ymm11 - vpxor %ymm11,%ymm15,%ymm15 # D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3] - - ######################################### Rho + Pi + pre-Chi shuffle - vpsllvq 0*32-96(%r8),%ymm2,%ymm10 - vpsrlvq 0*32-96(%r9),%ymm2,%ymm2 - vpor %ymm10,%ymm2,%ymm2 - - vpxor %ymm15,%ymm3,%ymm3 # ^= D[1..4] from Theta - vpsllvq 2*32-96(%r8),%ymm3,%ymm11 - vpsrlvq 2*32-96(%r9),%ymm3,%ymm3 - vpor %ymm11,%ymm3,%ymm3 - - vpxor %ymm15,%ymm4,%ymm4 # ^= D[1..4] from Theta - vpsllvq 3*32-96(%r8),%ymm4,%ymm12 - vpsrlvq 3*32-96(%r9),%ymm4,%ymm4 - vpor %ymm12,%ymm4,%ymm4 - - vpxor %ymm15,%ymm5,%ymm5 # ^= D[1..4] from Theta - vpsllvq 4*32-96(%r8),%ymm5,%ymm13 - vpsrlvq 4*32-96(%r9),%ymm5,%ymm5 - vpor %ymm13,%ymm5,%ymm5 - - vpxor %ymm15,%ymm6,%ymm6 # ^= D[1..4] from Theta - vpermq $0b10001101,%ymm2,%ymm10 # %ymm2 -> future %ymm3 - vpermq $0b10001101,%ymm3,%ymm11 # %ymm3 -> future %ymm4 - vpsllvq 5*32-96(%r8),%ymm6,%ymm14 - vpsrlvq 
5*32-96(%r9),%ymm6,%ymm8 - vpor %ymm14,%ymm8,%ymm8 # %ymm6 -> future %ymm1 - - vpxor %ymm15,%ymm1,%ymm1 # ^= D[1..4] from Theta - vpermq $0b00011011,%ymm4,%ymm12 # %ymm4 -> future %ymm5 - vpermq $0b01110010,%ymm5,%ymm13 # %ymm5 -> future %ymm6 - vpsllvq 1*32-96(%r8),%ymm1,%ymm15 - vpsrlvq 1*32-96(%r9),%ymm1,%ymm9 - vpor %ymm15,%ymm9,%ymm9 # %ymm1 -> future %ymm2 - - ######################################### Chi - vpsrldq $8,%ymm8,%ymm14 - vpandn %ymm14,%ymm8,%ymm7 # tgting [0][0] [0][0] [0][0] [0][0] - - vpblendd $0b00001100,%ymm13,%ymm9,%ymm3 # [4][4] [2][0] - vpblendd $0b00001100,%ymm9,%ymm11,%ymm15 # [4][0] [2][1] - vpblendd $0b00001100,%ymm11,%ymm10,%ymm5 # [4][2] [2][4] - vpblendd $0b00001100,%ymm10,%ymm9,%ymm14 # [4][3] [2][0] - vpblendd $0b00110000,%ymm11,%ymm3,%ymm3 # [1][3] [4][4] [2][0] - vpblendd $0b00110000,%ymm12,%ymm15,%ymm15 # [1][4] [4][0] [2][1] - vpblendd $0b00110000,%ymm9,%ymm5,%ymm5 # [1][0] [4][2] [2][4] - vpblendd $0b00110000,%ymm13,%ymm14,%ymm14 # [1][1] [4][3] [2][0] - vpblendd $0b11000000,%ymm12,%ymm3,%ymm3 # [3][2] [1][3] [4][4] [2][0] - vpblendd $0b11000000,%ymm13,%ymm15,%ymm15 # [3][3] [1][4] [4][0] [2][1] - vpblendd $0b11000000,%ymm13,%ymm5,%ymm5 # [3][3] [1][0] [4][2] [2][4] - vpblendd $0b11000000,%ymm11,%ymm14,%ymm14 # [3][4] [1][1] [4][3] [2][0] - vpandn %ymm15,%ymm3,%ymm3 # tgting [3][1] [1][2] [4][3] [2][4] - vpandn %ymm14,%ymm5,%ymm5 # tgting [3][2] [1][4] [4][1] [2][3] - - vpblendd $0b00001100,%ymm9,%ymm12,%ymm6 # [4][0] [2][3] - vpblendd $0b00001100,%ymm12,%ymm10,%ymm15 # [4][1] [2][4] - vpxor %ymm10,%ymm3,%ymm3 - vpblendd $0b00110000,%ymm10,%ymm6,%ymm6 # [1][2] [4][0] [2][3] - vpblendd $0b00110000,%ymm11,%ymm15,%ymm15 # [1][3] [4][1] [2][4] - vpxor %ymm12,%ymm5,%ymm5 - vpblendd $0b11000000,%ymm11,%ymm6,%ymm6 # [3][4] [1][2] [4][0] [2][3] - vpblendd $0b11000000,%ymm9,%ymm15,%ymm15 # [3][0] [1][3] [4][1] [2][4] - vpandn %ymm15,%ymm6,%ymm6 # tgting [3][3] [1][1] [4][4] [2][2] - vpxor %ymm13,%ymm6,%ymm6 - - vpermq $0b00011110,%ymm8,%ymm4 # [0][1] [0][2] [0][4] [0][3] - vpblendd $0b00110000,%ymm0,%ymm4,%ymm15 # [0][1] [0][0] [0][4] [0][3] - vpermq $0b00111001,%ymm8,%ymm1 # [0][1] [0][4] [0][3] [0][2] - vpblendd $0b11000000,%ymm0,%ymm1,%ymm1 # [0][0] [0][4] [0][3] [0][2] - vpandn %ymm15,%ymm1,%ymm1 # tgting [0][4] [0][3] [0][2] [0][1] - - vpblendd $0b00001100,%ymm12,%ymm11,%ymm2 # [4][1] [2][1] - vpblendd $0b00001100,%ymm11,%ymm13,%ymm14 # [4][2] [2][2] - vpblendd $0b00110000,%ymm13,%ymm2,%ymm2 # [1][1] [4][1] [2][1] - vpblendd $0b00110000,%ymm10,%ymm14,%ymm14 # [1][2] [4][2] [2][2] - vpblendd $0b11000000,%ymm10,%ymm2,%ymm2 # [3][1] [1][1] [4][1] [2][1] - vpblendd $0b11000000,%ymm12,%ymm14,%ymm14 # [3][2] [1][2] [4][2] [2][2] - vpandn %ymm14,%ymm2,%ymm2 # tgting [3][0] [1][0] [4][0] [2][0] - vpxor %ymm9,%ymm2,%ymm2 - - vpermq $0b00000000,%ymm7,%ymm7 # [0][0] [0][0] [0][0] [0][0] - vpermq $0b00011011,%ymm3,%ymm3 # post-Chi shuffle - vpermq $0b10001101,%ymm5,%ymm5 - vpermq $0b01110010,%ymm6,%ymm6 - - vpblendd $0b00001100,%ymm10,%ymm13,%ymm4 # [4][3] [2][2] - vpblendd $0b00001100,%ymm13,%ymm12,%ymm14 # [4][4] [2][3] - vpblendd $0b00110000,%ymm12,%ymm4,%ymm4 # [1][4] [4][3] [2][2] - vpblendd $0b00110000,%ymm9,%ymm14,%ymm14 # [1][0] [4][4] [2][3] - vpblendd $0b11000000,%ymm9,%ymm4,%ymm4 # [3][0] [1][4] [4][3] [2][2] - vpblendd $0b11000000,%ymm10,%ymm14,%ymm14 # [3][1] [1][0] [4][4] [2][3] - vpandn %ymm14,%ymm4,%ymm4 # tgting [3][4] [1][3] [4][2] [2][1] - - vpxor %ymm7,%ymm0,%ymm0 - vpxor %ymm8,%ymm1,%ymm1 - vpxor %ymm11,%ymm4,%ymm4 - - 
######################################### Iota - vpxor (%r10),%ymm0,%ymm0 - lea 32(%r10),%r10 - - dec %eax - jnz .Loop_avx2 - ret -.ifdef macOS -.else -.size __KeccakF1600,.-__KeccakF1600 -.endif - - - -.ifdef macOS -.globl _KeccakP1600_AVX2_Permute_12rounds -_KeccakP1600_AVX2_Permute_12rounds: -.else -.globl KeccakP1600_AVX2_Permute_12rounds -.type KeccakP1600_AVX2_Permute_12rounds,@function -KeccakP1600_AVX2_Permute_12rounds: -.endif -.balign 32 - lea rhotates_left+96(%rip),%r8 - lea rhotates_right+96(%rip),%r9 - lea iotas+12*4*8(%rip),%r10 - mov $12,%eax - lea 96(%rdi),%rdi - vzeroupper - vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] - vmovdqu 8+32*0-96(%rdi),%ymm1 - vmovdqu 8+32*1-96(%rdi),%ymm2 - vmovdqu 8+32*2-96(%rdi),%ymm3 - vmovdqu 8+32*3-96(%rdi),%ymm4 - vmovdqu 8+32*4-96(%rdi),%ymm5 - vmovdqu 8+32*5-96(%rdi),%ymm6 - call __KeccakF1600 - vmovq %xmm0,-96(%rdi) - vmovdqu %ymm1,8+32*0-96(%rdi) - vmovdqu %ymm2,8+32*1-96(%rdi) - vmovdqu %ymm3,8+32*2-96(%rdi) - vmovdqu %ymm4,8+32*3-96(%rdi) - vmovdqu %ymm5,8+32*4-96(%rdi) - vmovdqu %ymm6,8+32*5-96(%rdi) - vzeroupper - ret -.ifdef macOS -.else -.size KeccakP1600_AVX2_Permute_12rounds,.-KeccakP1600_AVX2_Permute_12rounds -.endif - -# ----------------------------------------------------------------------------- -# -# size_t KeccakP1600_AVX2_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); -# %rdi %rsi %rdx %rcx -# -.ifdef macOS -.globl _KeccakP1600_AVX2_12rounds_FastLoop_Absorb -_KeccakP1600_AVX2_12rounds_FastLoop_Absorb: -.else -.globl KeccakP1600_AVX2_12rounds_FastLoop_Absorb -.type KeccakP1600_AVX2_12rounds_FastLoop_Absorb,@function -KeccakP1600_AVX2_12rounds_FastLoop_Absorb: -.endif -.balign 32 - push %rbx - push %r10 - shr $3, %rcx # rcx = data length in lanes - mov %rdx, %rbx # rbx = initial data pointer - cmp %rsi, %rcx - jb KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Exit - vzeroupper - cmp $21, %rsi - jnz KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not21Lanes - sub $21, %rcx - lea rhotates_left+96(%rip),%r8 - lea rhotates_right+96(%rip),%r9 - lea 96(%rdi),%rdi - vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] - vmovdqu 8+32*0-96(%rdi),%ymm1 - vmovdqu 8+32*1-96(%rdi),%ymm2 - vmovdqu 8+32*2-96(%rdi),%ymm3 - vmovdqu 8+32*3-96(%rdi),%ymm4 - vmovdqu 8+32*4-96(%rdi),%ymm5 - vmovdqu 8+32*5-96(%rdi),%ymm6 -KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop21Lanes: - vpbroadcastq (%rdx),%ymm7 - vmovdqu 8(%rdx),%ymm8 - - vmovdqa map2(%rip), %xmm15 - vpcmpeqd %ymm14, %ymm14, %ymm14 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm9 - - vmovdqa mask3_21(%rip), %ymm14 - vpxor %ymm10, %ymm10, %ymm10 - vmovdqa map3(%rip), %xmm15 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm10 - - vmovdqa mask4_21(%rip), %ymm14 - vpxor %ymm11, %ymm11, %ymm11 - vmovdqa map4(%rip), %xmm15 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm11 - - vmovdqa mask5_21(%rip), %ymm14 - vpxor %ymm12, %ymm12, %ymm12 - vmovdqa map5(%rip), %xmm15 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm12 - - vmovdqa mask6_21(%rip), %ymm14 - vpxor %ymm13, %ymm13, %ymm13 - vmovdqa map6(%rip), %xmm15 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm13 - - vpxor %ymm7,%ymm0,%ymm0 - vpxor %ymm8,%ymm1,%ymm1 - vpxor %ymm9,%ymm2,%ymm2 - vpxor %ymm10,%ymm3,%ymm3 - vpxor %ymm11,%ymm4,%ymm4 - vpxor %ymm12,%ymm5,%ymm5 - vpxor %ymm13,%ymm6,%ymm6 - add $21*8, %rdx - lea iotas+12*4*8(%rip),%r10 - mov $12,%eax - call __KeccakF1600 - sub $21, %rcx - jnc KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop21Lanes -KeccakP1600_AVX2_12rounds_FastLoop_Absorb_SaveAndExit: - vmovq 
%xmm0,-96(%rdi) - vmovdqu %ymm1,8+32*0-96(%rdi) - vmovdqu %ymm2,8+32*1-96(%rdi) - vmovdqu %ymm3,8+32*2-96(%rdi) - vmovdqu %ymm4,8+32*3-96(%rdi) - vmovdqu %ymm5,8+32*4-96(%rdi) - vmovdqu %ymm6,8+32*5-96(%rdi) -KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Exit: - vzeroupper - mov %rdx, %rax # return number of bytes processed - sub %rbx, %rax - pop %r10 - pop %rbx - ret -KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not21Lanes: - cmp $17, %rsi - jnz KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not17Lanes - sub $17, %rcx - lea rhotates_left+96(%rip),%r8 - lea rhotates_right+96(%rip),%r9 - lea 96(%rdi),%rdi - vpbroadcastq -96(%rdi),%ymm0 # load A[5][5] - vmovdqu 8+32*0-96(%rdi),%ymm1 - vmovdqu 8+32*1-96(%rdi),%ymm2 - vmovdqu 8+32*2-96(%rdi),%ymm3 - vmovdqu 8+32*3-96(%rdi),%ymm4 - vmovdqu 8+32*4-96(%rdi),%ymm5 - vmovdqu 8+32*5-96(%rdi),%ymm6 -KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop17Lanes: - vpbroadcastq (%rdx),%ymm7 - vmovdqu 8(%rdx),%ymm8 - - vmovdqa mask2_17(%rip), %ymm14 - vpxor %ymm9, %ymm9, %ymm9 - vmovdqa map2(%rip), %xmm15 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm9 - - vmovdqa mask3_17(%rip), %ymm14 - vpxor %ymm10, %ymm10, %ymm10 - vmovdqa map3(%rip), %xmm15 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm10 - - vmovdqa mask4_17(%rip), %ymm14 - vpxor %ymm11, %ymm11, %ymm11 - vmovdqa map4(%rip), %xmm15 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm11 - - vmovdqa mask5_17(%rip), %ymm14 - vpxor %ymm12, %ymm12, %ymm12 - vmovdqa map5(%rip), %xmm15 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm12 - - vmovdqa mask6_17(%rip), %ymm14 - vpxor %ymm13, %ymm13, %ymm13 - vmovdqa map6(%rip), %xmm15 - vpgatherdq %ymm14, (%rdx, %xmm15, 1), %ymm13 - - vpxor %ymm7,%ymm0,%ymm0 - vpxor %ymm8,%ymm1,%ymm1 - vpxor %ymm9,%ymm2,%ymm2 - vpxor %ymm10,%ymm3,%ymm3 - vpxor %ymm11,%ymm4,%ymm4 - vpxor %ymm12,%ymm5,%ymm5 - vpxor %ymm13,%ymm6,%ymm6 - add $17*8, %rdx - lea iotas+12*4*8(%rip),%r10 - mov $12,%eax - call __KeccakF1600 - sub $17, %rcx - jnc KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Loop17Lanes - jmp KeccakP1600_AVX2_12rounds_FastLoop_Absorb_SaveAndExit -KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not17Lanes: - lea mapState(%rip), %r9 - mov %rsi, %rax -KeccakP1600_AVX2_12rounds_FastLoop_Absorb_LanesAddLoop: - mov (%rdx), %r8 - add $8, %rdx - mov (%r9), %r10 - add $8, %r9 - add %rdi, %r10 - xor %r8, (%r10) - sub $1, %rax - jnz KeccakP1600_AVX2_12rounds_FastLoop_Absorb_LanesAddLoop - sub %rsi, %rcx - push %rdi - push %rsi - push %rdx - push %rcx -.ifdef macOS - call _KeccakP1600_AVX2_Permute_12rounds -.else - call KeccakP1600_AVX2_Permute_12rounds@PLT -.endif - pop %rcx - pop %rdx - pop %rsi - pop %rdi - cmp %rsi, %rcx - jae KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Not17Lanes - jmp KeccakP1600_AVX2_12rounds_FastLoop_Absorb_Exit -.ifdef macOS -.else -.size KeccakP1600_AVX2_12rounds_FastLoop_Absorb,.-KeccakP1600_AVX2_12rounds_FastLoop_Absorb -.endif - -.equ ALLON, 0xFFFFFFFFFFFFFFFF - -.balign 64 -rhotates_left: - .quad 3, 18, 36, 41 # [2][0] [4][0] [1][0] [3][0] - .quad 1, 62, 28, 27 # [0][1] [0][2] [0][3] [0][4] - .quad 45, 6, 56, 39 # [3][1] [1][2] [4][3] [2][4] - .quad 10, 61, 55, 8 # [2][1] [4][2] [1][3] [3][4] - .quad 2, 15, 25, 20 # [4][1] [3][2] [2][3] [1][4] - .quad 44, 43, 21, 14 # [1][1] [2][2] [3][3] [4][4] -rhotates_right: - .quad 64-3, 64-18, 64-36, 64-41 - .quad 64-1, 64-62, 64-28, 64-27 - .quad 64-45, 64-6, 64-56, 64-39 - .quad 64-10, 64-61, 64-55, 64-8 - .quad 64-2, 64-15, 64-25, 64-20 - .quad 64-44, 64-43, 64-21, 64-14 -iotas: - .quad 0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 
0x0000000000000001 - .quad 0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082 - .quad 0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a - .quad 0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000 - .quad 0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b - .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001 - .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081 - .quad 0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009 - .quad 0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a - .quad 0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088 - .quad 0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009 - .quad 0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a - .quad 0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b - .quad 0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b - .quad 0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089 - .quad 0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003 - .quad 0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002 - .quad 0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080 - .quad 0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a - .quad 0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a - .quad 0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081 - .quad 0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080 - .quad 0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001 - .quad 0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008 - -mapState: - .quad 0*8, 1*8, 2*8, 3*8, 4*8 - .quad 7*8, 21*8, 10*8, 15*8, 20*8 - .quad 5*8, 13*8, 22*8, 19*8, 12*8 - .quad 8*8, 9*8, 18*8, 23*8, 16*8 - .quad 6*8, 17*8, 14*8, 11*8, 24*8 - - .balign 16 -map2: - .long 10*8, 20*8, 5*8, 15*8 -map3: - .long 16*8, 7*8, 23*8, 14*8 -map4: - .long 11*8, 22*8, 8*8, 19*8 -map5: - .long 21*8, 17*8, 13*8, 9*8 -map6: - .long 6*8, 12*8, 18*8, 24*8 - - .balign 32 -mask3_21: - .quad ALLON, ALLON, 0, ALLON -mask4_21: - .quad ALLON, 0, ALLON, ALLON -mask5_21: - .quad 0, ALLON, ALLON, ALLON -mask6_21: - .quad ALLON, ALLON, ALLON, 0 - -mask2_17: - .quad ALLON, 0, ALLON, ALLON -mask3_17: - .quad ALLON, ALLON, 0, ALLON -mask4_17: - .quad ALLON, 0, ALLON, 0 -mask5_17: - .quad 0, 0, ALLON, ALLON -mask6_17: - .quad ALLON, ALLON, 0, 0 - -.asciz "Keccak-1600 for AVX2, CRYPTOGAMS by " diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c deleted file mode 100644 index b426421..0000000 --- a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512-plainC.c +++ /dev/null @@ -1,241 +0,0 @@ -/* -K12 based on the eXtended Keccak Code Package (XKCP) -https://github.com/XKCP/XKCP - -The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. - -Implementation by Ronny Van Keer, hereby denoted as "the implementer". 
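[Reviewer note] One detail worth keeping in mind while reviewing the Rust replacement: the iotas table in the assembly removed above stores all 24 Keccak-f[1600] round constants (each broadcast four times for the AVX2 registers), but the 12-round entry points index it at iotas+12*4*8, i.e. they use only the last twelve constants. If the Rust code keeps its own table for Keccak-p[1600, 12 rounds], those are the only values it needs; the constant name RC12 below is mine, not the crate's.

```rust
/// The 12 round constants used by Keccak-p[1600, 12 rounds]: the last 12 of
/// the 24 Keccak-f[1600] constants listed in the deleted iotas table.
pub const RC12: [u64; 12] = [
    0x0000_0000_8000_808b,
    0x8000_0000_0000_008b,
    0x8000_0000_0000_8089,
    0x8000_0000_0000_8003,
    0x8000_0000_0000_8002,
    0x8000_0000_0000_0080,
    0x0000_0000_0000_800a,
    0x8000_0000_8000_000a,
    0x8000_0000_8000_8081,
    0x8000_0000_0000_8080,
    0x0000_0000_8000_0001,
    0x8000_0000_8000_8008,
];
```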
- -For more information, feedback or questions, please refer to the Keccak Team website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -We would like to thank Vladimir Sedach, we have used parts of his Keccak AVX-512 C++ code. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "align.h" - -typedef __m512i V512; - -#define XOR(a,b) _mm512_xor_si512(a,b) -#define XOR3(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0x96) -#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) -#define ROL(a,offset) _mm512_rol_epi64(a,offset) -#define Chi(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0xD2) - -#define LOAD_Lanes(m,a) _mm512_maskz_loadu_epi64(m,a) -#define LOAD_Lane(a) LOAD_Lanes(0x01,a) -#define LOAD_Plane(a) LOAD_Lanes(0x1F,a) -#define LOAD_8Lanes(a) LOAD_Lanes(0xFF,a) -#define STORE_Lanes(a,m,v) _mm512_mask_storeu_epi64(a,m,v) -#define STORE_Lane(a,v) STORE_Lanes(a,0x01,v) -#define STORE_Plane(a,v) STORE_Lanes(a,0x1F,v) -#define STORE_8Lanes(a,v) STORE_Lanes(a,0xFF,v) - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_AVX512_Initialize(void *state) -{ - memset(state, 0, 1600/8); -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_AVX512_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) -{ - uint8_t *stateAsBytes; - uint64_t *stateAsLanes; - - for( stateAsBytes = (uint8_t*)state; ((offset % 8) != 0) && (length != 0); ++offset, --length) - stateAsBytes[offset] ^= *(data++); - for (stateAsLanes = (uint64_t*)(stateAsBytes + offset); length >= 8*8; stateAsLanes += 8, data += 8*8, length -= 8*8) - STORE_8Lanes( stateAsLanes, XOR(LOAD_8Lanes(stateAsLanes), LOAD_8Lanes((const uint64_t*)data))); - for (/* empty */; length >= 8; ++stateAsLanes, data += 8, length -= 8) - STORE_Lane( stateAsLanes, XOR(LOAD_Lane(stateAsLanes), LOAD_Lane((const uint64_t*)data))); - for ( stateAsBytes = (uint8_t*)stateAsLanes; length != 0; --length) - *(stateAsBytes++) ^= *(data++); -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_AVX512_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) -{ - memcpy(data, (unsigned char*)state+offset, length); -} - -/* ---------------------------------------------------------------- */ - -const uint64_t KeccakP1600RoundConstants[24] = { - 0x0000000000000001ULL, - 0x0000000000008082ULL, - 0x800000000000808aULL, - 0x8000000080008000ULL, - 0x000000000000808bULL, - 0x0000000080000001ULL, - 0x8000000080008081ULL, - 0x8000000000008009ULL, - 0x000000000000008aULL, - 0x0000000000000088ULL, - 0x0000000080008009ULL, - 0x000000008000000aULL, - 0x000000008000808bULL, - 0x800000000000008bULL, - 0x8000000000008089ULL, - 0x8000000000008003ULL, - 0x8000000000008002ULL, - 0x8000000000000080ULL, - 0x000000000000800aULL, - 0x800000008000000aULL, - 0x8000000080008081ULL, - 0x8000000000008080ULL, - 0x0000000080000001ULL, - 0x8000000080008008ULL }; - -#define KeccakP_DeclareVars \ - V512 b0, b1, b2, b3, b4; \ - V512 Baeiou, Gaeiou, Kaeiou, Maeiou, Saeiou; \ - V512 moveThetaPrev = _mm512_setr_epi64(4, 0, 1, 2, 3, 5, 6, 7); \ - V512 moveThetaNext = _mm512_setr_epi64(1, 2, 3, 4, 0, 5, 6, 7); \ - V512 rhoB = _mm512_setr_epi64( 0, 1, 62, 28, 27, 0, 0, 0); \ - V512 rhoG = _mm512_setr_epi64(36, 44, 6, 
55, 20, 0, 0, 0); \ - V512 rhoK = _mm512_setr_epi64( 3, 10, 43, 25, 39, 0, 0, 0); \ - V512 rhoM = _mm512_setr_epi64(41, 45, 15, 21, 8, 0, 0, 0); \ - V512 rhoS = _mm512_setr_epi64(18, 2, 61, 56, 14, 0, 0, 0); \ - V512 pi1B = _mm512_setr_epi64(0, 3, 1, 4, 2, 5, 6, 7); \ - V512 pi1G = _mm512_setr_epi64(1, 4, 2, 0, 3, 5, 6, 7); \ - V512 pi1K = _mm512_setr_epi64(2, 0, 3, 1, 4, 5, 6, 7); \ - V512 pi1M = _mm512_setr_epi64(3, 1, 4, 2, 0, 5, 6, 7); \ - V512 pi1S = _mm512_setr_epi64(4, 2, 0, 3, 1, 5, 6, 7); \ - V512 pi2S1 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 0+8, 2+8); \ - V512 pi2S2 = _mm512_setr_epi64(0, 1, 2, 3, 4, 5, 1+8, 3+8); \ - V512 pi2BG = _mm512_setr_epi64(0, 1, 0+8, 1+8, 6, 5, 6, 7); \ - V512 pi2KM = _mm512_setr_epi64(2, 3, 2+8, 3+8, 7, 5, 6, 7); \ - V512 pi2S3 = _mm512_setr_epi64(4, 5, 4+8, 5+8, 4, 5, 6, 7); - -#define copyFromState(pState) \ - Baeiou = LOAD_Plane(pState+ 0); \ - Gaeiou = LOAD_Plane(pState+ 5); \ - Kaeiou = LOAD_Plane(pState+10); \ - Maeiou = LOAD_Plane(pState+15); \ - Saeiou = LOAD_Plane(pState+20); - -#define copyToState(pState) \ - STORE_Plane(pState+ 0, Baeiou); \ - STORE_Plane(pState+ 5, Gaeiou); \ - STORE_Plane(pState+10, Kaeiou); \ - STORE_Plane(pState+15, Maeiou); \ - STORE_Plane(pState+20, Saeiou); - -#define KeccakP_Round(i) \ - /* Theta */ \ - b0 = XOR5( Baeiou, Gaeiou, Kaeiou, Maeiou, Saeiou ); \ - b1 = _mm512_permutexvar_epi64(moveThetaPrev, b0); \ - b0 = _mm512_permutexvar_epi64(moveThetaNext, b0); \ - b0 = _mm512_rol_epi64(b0, 1); \ - Baeiou = XOR3( Baeiou, b0, b1 ); \ - Gaeiou = XOR3( Gaeiou, b0, b1 ); \ - Kaeiou = XOR3( Kaeiou, b0, b1 ); \ - Maeiou = XOR3( Maeiou, b0, b1 ); \ - Saeiou = XOR3( Saeiou, b0, b1 ); \ - /* Rho */ \ - Baeiou = _mm512_rolv_epi64(Baeiou, rhoB); \ - Gaeiou = _mm512_rolv_epi64(Gaeiou, rhoG); \ - Kaeiou = _mm512_rolv_epi64(Kaeiou, rhoK); \ - Maeiou = _mm512_rolv_epi64(Maeiou, rhoM); \ - Saeiou = _mm512_rolv_epi64(Saeiou, rhoS); \ - /* Pi 1 */ \ - b0 = _mm512_permutexvar_epi64(pi1B, Baeiou); \ - b1 = _mm512_permutexvar_epi64(pi1G, Gaeiou); \ - b2 = _mm512_permutexvar_epi64(pi1K, Kaeiou); \ - b3 = _mm512_permutexvar_epi64(pi1M, Maeiou); \ - b4 = _mm512_permutexvar_epi64(pi1S, Saeiou); \ - /* Chi */ \ - Baeiou = Chi(b0, b1, b2); \ - Gaeiou = Chi(b1, b2, b3); \ - Kaeiou = Chi(b2, b3, b4); \ - Maeiou = Chi(b3, b4, b0); \ - Saeiou = Chi(b4, b0, b1); \ - /* Iota */ \ - Baeiou = XOR(Baeiou, LOAD_Lane(KeccakP1600RoundConstants+i)); \ - /* Pi 2 */ \ - b0 = _mm512_unpacklo_epi64(Baeiou, Gaeiou); \ - b1 = _mm512_unpacklo_epi64(Kaeiou, Maeiou); \ - b0 = _mm512_permutex2var_epi64(b0, pi2S1, Saeiou); \ - b2 = _mm512_unpackhi_epi64(Baeiou, Gaeiou); \ - b3 = _mm512_unpackhi_epi64(Kaeiou, Maeiou); \ - b2 = _mm512_permutex2var_epi64(b2, pi2S2, Saeiou); \ - Baeiou = _mm512_permutex2var_epi64(b0, pi2BG, b1); \ - Gaeiou = _mm512_permutex2var_epi64(b2, pi2BG, b3); \ - Kaeiou = _mm512_permutex2var_epi64(b0, pi2KM, b1); \ - Maeiou = _mm512_permutex2var_epi64(b2, pi2KM, b3); \ - b0 = _mm512_permutex2var_epi64(b0, pi2S3, b1); \ - Saeiou = _mm512_mask_blend_epi64(0x10, b0, Saeiou) - -#define rounds12 \ - KeccakP_Round( 12 ); \ - KeccakP_Round( 13 ); \ - KeccakP_Round( 14 ); \ - KeccakP_Round( 15 ); \ - KeccakP_Round( 16 ); \ - KeccakP_Round( 17 ); \ - KeccakP_Round( 18 ); \ - KeccakP_Round( 19 ); \ - KeccakP_Round( 20 ); \ - KeccakP_Round( 21 ); \ - KeccakP_Round( 22 ); \ - KeccakP_Round( 23 ) - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_AVX512_Permute_12rounds(void *state) -{ - KeccakP_DeclareVars - uint64_t 
*stateAsLanes = (uint64_t*)state; - - copyFromState(stateAsLanes); - rounds12; - copyToState(stateAsLanes); -} - -/* ---------------------------------------------------------------- */ - -#include - -size_t KeccakP1600_AVX512_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen) -{ - size_t originalDataByteLen = dataByteLen; - - assert(laneCount == 21); - - KeccakP_DeclareVars; - uint64_t *stateAsLanes = (uint64_t*)state; - uint64_t *inDataAsLanes = (uint64_t*)data; - - copyFromState(stateAsLanes); - while(dataByteLen >= 21*8) { - Baeiou = XOR(Baeiou, LOAD_Plane(inDataAsLanes+ 0)); - Gaeiou = XOR(Gaeiou, LOAD_Plane(inDataAsLanes+ 5)); - Kaeiou = XOR(Kaeiou, LOAD_Plane(inDataAsLanes+10)); - Maeiou = XOR(Maeiou, LOAD_Plane(inDataAsLanes+15)); - Saeiou = XOR(Saeiou, LOAD_Lane(inDataAsLanes+20)); - rounds12; - inDataAsLanes += 21; - dataByteLen -= 21*8; - } - copyToState(stateAsLanes); - - return originalDataByteLen - dataByteLen; -} diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s deleted file mode 100644 index 383ca43..0000000 --- a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-AVX512.s +++ /dev/null @@ -1,551 +0,0 @@ -# Copyright (c) 2006-2017, CRYPTOGAMS by -# Copyright (c) 2018 Ronny Van Keer -# All rights reserved. -# -# The source code in this file is licensed under the CRYPTOGAMS license. -# For further details see http://www.openssl.org/~appro/cryptogams/. -# -# Notes: -# The code for the permutation (__KeccakF1600) was generated with -# Andy Polyakov's keccak1600-avx512.pl from the CRYPTOGAMS project -# (https://github.com/dot-asm/cryptogams/blob/master/x86_64/keccak1600-avx512.pl). -# The rest of the code was written by Ronny Van Keer. -# Adaptations for macOS by Stéphane Léon. - -.text - -# ----------------------------------------------------------------------------- -# -# void KeccakP1600_AVX512_Initialize(void *state); -# -.ifdef macOS -.globl _KeccakP1600_AVX512_Initialize -_KeccakP1600_AVX512_Initialize: -.else -.globl KeccakP1600_AVX512_Initialize -.type KeccakP1600_AVX512_Initialize,@function -KeccakP1600_AVX512_Initialize: -.endif -.balign 32 - vpxorq %zmm0,%zmm0,%zmm0 - vmovdqu64 %zmm0,0*64(%rdi) - vmovdqu64 %zmm0,1*64(%rdi) - vmovdqu64 %zmm0,2*64(%rdi) - movq $0,3*64(%rdi) - ret -.ifdef macOS -.else -.size KeccakP1600_AVX512_Initialize,.-KeccakP1600_AVX512_Initialize -.endif - -# ----------------------------------------------------------------------------- -# -# void KeccakP1600_AVX512_AddByte(void *state, unsigned char data, unsigned int offset); -# %rdi %rsi %rdx -#!! 
-#.globl KeccakP1600_AVX512_AddByte -#.type KeccakP1600_AVX512_AddByte,@function -#.balign 32 -#KeccakP1600_AVX512_AddByte: -# mov %rdx, %rax -# and $7, %rax -# and $0xFFFFFFF8, %edx -# mov mapState(%rdx), %rdx -# add %rdx, %rdi -# add %rax, %rdi -# xorb %sil, (%rdi) -# ret -#.size KeccakP1600_AVX512_AddByte,.-KeccakP1600_AVX512_AddByte - -# ----------------------------------------------------------------------------- -# -# void KeccakP1600_AVX512_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); -# %rdi %rsi %rdx %rcx -# -.ifdef macOS -.globl _KeccakP1600_AVX512_AddBytes -_KeccakP1600_AVX512_AddBytes: -.else -.globl KeccakP1600_AVX512_AddBytes -.type KeccakP1600_AVX512_AddBytes,@function -KeccakP1600_AVX512_AddBytes: -.endif -.balign 32 - cmp $0, %rcx - jz KeccakP1600_AVX512_AddBytes_Exit - add %rdx, %rdi # state += offset - and $7, %rdx - jz KeccakP1600_AVX512_AddBytes_LaneAlignedCheck - mov $8, %r9 # r9 is (max) length of incomplete lane - sub %rdx, %r9 - cmp %rcx, %r9 - cmovae %rcx, %r9 - sub %r9, %rcx # length -= length of incomplete lane -KeccakP1600_AVX512_AddBytes_NotAlignedLoop: - mov (%rsi), %r8b - inc %rsi - xorb %r8b, (%rdi) - inc %rdi - dec %r9 - jnz KeccakP1600_AVX512_AddBytes_NotAlignedLoop - jmp KeccakP1600_AVX512_AddBytes_LaneAlignedCheck -KeccakP1600_AVX512_AddBytes_LaneAlignedLoop: - mov (%rsi), %r8 - add $8, %rsi - xor %r8, (%rdi) - add $8, %rdi -KeccakP1600_AVX512_AddBytes_LaneAlignedCheck: - sub $8, %rcx - jnc KeccakP1600_AVX512_AddBytes_LaneAlignedLoop -KeccakP1600_AVX512_AddBytes_LastIncompleteLane: - add $8, %rcx - jz KeccakP1600_AVX512_AddBytes_Exit -KeccakP1600_AVX512_AddBytes_LastIncompleteLaneLoop: - mov (%rsi), %r8b - inc %rsi - xor %r8b, (%rdi) - inc %rdi - dec %rcx - jnz KeccakP1600_AVX512_AddBytes_LastIncompleteLaneLoop -KeccakP1600_AVX512_AddBytes_Exit: - ret -.ifdef macOS -.else -.size KeccakP1600_AVX512_AddBytes,.-KeccakP1600_AVX512_AddBytes -.endif - -# ----------------------------------------------------------------------------- -# -# void KeccakP1600_AVX512_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); -# %rdi %rsi %rdx %rcx -# -.ifdef macOS -.globl _KeccakP1600_AVX512_ExtractBytes -_KeccakP1600_AVX512_ExtractBytes: -.else -.globl KeccakP1600_AVX512_ExtractBytes -.type KeccakP1600_AVX512_ExtractBytes,@function -KeccakP1600_AVX512_ExtractBytes: -.endif -.balign 32 - cmp $0, %rcx - jz KeccakP1600_AVX512_ExtractBytes_Exit - add %rdx, %rdi # state += offset - and $7, %rdx - jz KeccakP1600_AVX512_ExtractBytes_LaneAlignedCheck - mov $8, %rax # rax is (max) length of incomplete lane - sub %rdx, %rax - cmp %rcx, %rax - cmovae %rcx, %rax - sub %rax, %rcx # length -= length of incomplete lane -KeccakP1600_AVX512_ExtractBytes_NotAlignedLoop: - mov (%rdi), %r8b - inc %rdi - mov %r8b, (%rsi) - inc %rsi - dec %rax - jnz KeccakP1600_AVX512_ExtractBytes_NotAlignedLoop - jmp KeccakP1600_AVX512_ExtractBytes_LaneAlignedCheck -KeccakP1600_AVX512_ExtractBytes_LaneAlignedLoop: - mov (%rdi), %r8 - add $8, %rdi - mov %r8, (%rsi) - add $8, %rsi -KeccakP1600_AVX512_ExtractBytes_LaneAlignedCheck: - sub $8, %rcx - jnc KeccakP1600_AVX512_ExtractBytes_LaneAlignedLoop -KeccakP1600_AVX512_ExtractBytes_LastIncompleteLane: - add $8, %rcx - jz KeccakP1600_AVX512_ExtractBytes_Exit - mov (%rdi), %r8 -KeccakP1600_AVX512_ExtractBytes_LastIncompleteLaneLoop: - mov %r8b, (%rsi) - shr $8, %r8 - inc %rsi - dec %rcx - jnz KeccakP1600_AVX512_ExtractBytes_LastIncompleteLaneLoop 
-KeccakP1600_AVX512_ExtractBytes_Exit: - ret -.ifdef macOS -.else -.size KeccakP1600_AVX512_ExtractBytes,.-KeccakP1600_AVX512_ExtractBytes -.endif - -# ----------------------------------------------------------------------------- -# -# internal -# -.text -.ifdef macOS -.else -.type __KeccakF1600,@function -.endif -.balign 32 -__KeccakF1600: -.Loop_avx512: - ######################################### Theta, even round - vmovdqa64 %zmm0,%zmm5 # put aside original A00 - vpternlogq $0x96,%zmm2,%zmm1,%zmm0 # and use it as "C00" - vpternlogq $0x96,%zmm4,%zmm3,%zmm0 - vprolq $1,%zmm0,%zmm6 - vpermq %zmm0,%zmm13,%zmm0 - vpermq %zmm6,%zmm16,%zmm6 - vpternlogq $0x96,%zmm0,%zmm6,%zmm5 # T[0] is original A00 - vpternlogq $0x96,%zmm0,%zmm6,%zmm1 - vpternlogq $0x96,%zmm0,%zmm6,%zmm2 - vpternlogq $0x96,%zmm0,%zmm6,%zmm3 - vpternlogq $0x96,%zmm0,%zmm6,%zmm4 - ######################################### Rho - vprolvq %zmm22,%zmm5,%zmm0 # T[0] is original A00 - vprolvq %zmm23,%zmm1,%zmm1 - vprolvq %zmm24,%zmm2,%zmm2 - vprolvq %zmm25,%zmm3,%zmm3 - vprolvq %zmm26,%zmm4,%zmm4 - ######################################### Pi - vpermq %zmm0,%zmm17,%zmm0 - vpermq %zmm1,%zmm18,%zmm1 - vpermq %zmm2,%zmm19,%zmm2 - vpermq %zmm3,%zmm20,%zmm3 - vpermq %zmm4,%zmm21,%zmm4 - ######################################### Chi - vmovdqa64 %zmm0,%zmm5 - vmovdqa64 %zmm1,%zmm6 - vpternlogq $0xD2,%zmm2,%zmm1,%zmm0 - vpternlogq $0xD2,%zmm3,%zmm2,%zmm1 - vpternlogq $0xD2,%zmm4,%zmm3,%zmm2 - vpternlogq $0xD2,%zmm5,%zmm4,%zmm3 - vpternlogq $0xD2,%zmm6,%zmm5,%zmm4 - ######################################### Iota - vpxorq (%r10),%zmm0,%zmm0{%k1} - lea 16(%r10),%r10 - ######################################### Harmonize rounds - vpblendmq %zmm2,%zmm1,%zmm6{%k2} - vpblendmq %zmm3,%zmm2,%zmm7{%k2} - vpblendmq %zmm4,%zmm3,%zmm8{%k2} - vpblendmq %zmm1,%zmm0,%zmm5{%k2} - vpblendmq %zmm0,%zmm4,%zmm9{%k2} - vpblendmq %zmm3,%zmm6,%zmm6{%k3} - vpblendmq %zmm4,%zmm7,%zmm7{%k3} - vpblendmq %zmm2,%zmm5,%zmm5{%k3} - vpblendmq %zmm0,%zmm8,%zmm8{%k3} - vpblendmq %zmm1,%zmm9,%zmm9{%k3} - vpblendmq %zmm4,%zmm6,%zmm6{%k4} - vpblendmq %zmm3,%zmm5,%zmm5{%k4} - vpblendmq %zmm0,%zmm7,%zmm7{%k4} - vpblendmq %zmm1,%zmm8,%zmm8{%k4} - vpblendmq %zmm2,%zmm9,%zmm9{%k4} - vpblendmq %zmm4,%zmm5,%zmm5{%k5} - vpblendmq %zmm0,%zmm6,%zmm6{%k5} - vpblendmq %zmm1,%zmm7,%zmm7{%k5} - vpblendmq %zmm2,%zmm8,%zmm8{%k5} - vpblendmq %zmm3,%zmm9,%zmm9{%k5} - #vpermq %zmm5,%zmm33,%zmm0 # doesn't actually change order - vpermq %zmm6,%zmm13,%zmm1 - vpermq %zmm7,%zmm14,%zmm2 - vpermq %zmm8,%zmm15,%zmm3 - vpermq %zmm9,%zmm16,%zmm4 - ######################################### Theta, odd round - vmovdqa64 %zmm5,%zmm0 # real A00 - vpternlogq $0x96,%zmm2,%zmm1,%zmm5 # C00 is %zmm5's alias - vpternlogq $0x96,%zmm4,%zmm3,%zmm5 - vprolq $1,%zmm5,%zmm6 - vpermq %zmm5,%zmm13,%zmm5 - vpermq %zmm6,%zmm16,%zmm6 - vpternlogq $0x96,%zmm5,%zmm6,%zmm0 - vpternlogq $0x96,%zmm5,%zmm6,%zmm3 - vpternlogq $0x96,%zmm5,%zmm6,%zmm1 - vpternlogq $0x96,%zmm5,%zmm6,%zmm4 - vpternlogq $0x96,%zmm5,%zmm6,%zmm2 - ######################################### Rho - vprolvq %zmm27,%zmm0,%zmm0 - vprolvq %zmm30,%zmm3,%zmm6 - vprolvq %zmm28,%zmm1,%zmm7 - vprolvq %zmm31,%zmm4,%zmm8 - vprolvq %zmm29,%zmm2,%zmm9 - vpermq %zmm0,%zmm16,%zmm10 - vpermq %zmm0,%zmm15,%zmm11 - ######################################### Iota - vpxorq -8(%r10),%zmm0,%zmm0{%k1} - ######################################### Pi - vpermq %zmm6,%zmm14,%zmm1 - vpermq %zmm7,%zmm16,%zmm2 - vpermq %zmm8,%zmm13,%zmm3 - vpermq %zmm9,%zmm15,%zmm4 - 
######################################### Chi - vpternlogq $0xD2,%zmm11,%zmm10,%zmm0 - vpermq %zmm6,%zmm13,%zmm12 - #vpermq %zmm6,%zmm33,%zmm6 - vpternlogq $0xD2,%zmm6,%zmm12,%zmm1 - vpermq %zmm7,%zmm15,%zmm5 - vpermq %zmm7,%zmm14,%zmm7 - vpternlogq $0xD2,%zmm7,%zmm5,%zmm2 - #vpermq %zmm8,%zmm33,%zmm8 - vpermq %zmm8,%zmm16,%zmm6 - vpternlogq $0xD2,%zmm6,%zmm8,%zmm3 - vpermq %zmm9,%zmm14,%zmm5 - vpermq %zmm9,%zmm13,%zmm9 - vpternlogq $0xD2,%zmm9,%zmm5,%zmm4 - dec %eax - jnz .Loop_avx512 - ret -.ifdef macOS -.else -.size __KeccakF1600,.-__KeccakF1600 -.endif - -# ----------------------------------------------------------------------------- -# -# void KeccakP1600_AVX512_Permute_12rounds(void *state); -# %rdi -# -.ifdef macOS -.globl _KeccakP1600_AVX512_Permute_12rounds -_KeccakP1600_AVX512_Permute_12rounds: -.else -.globl KeccakP1600_AVX512_Permute_12rounds -.type KeccakP1600_AVX512_Permute_12rounds,@function -KeccakP1600_AVX512_Permute_12rounds: -.endif -.balign 32 - lea 96(%rdi),%rdi - lea theta_perm(%rip),%r8 - kxnorw %k6,%k6,%k6 - kshiftrw $15,%k6,%k1 - kshiftrw $11,%k6,%k6 - kshiftlw $1,%k1,%k2 - kshiftlw $2,%k1,%k3 - kshiftlw $3,%k1,%k4 - kshiftlw $4,%k1,%k5 - #vmovdqa64 64*0(%r8),%zmm33 - vmovdqa64 64*1(%r8),%zmm13 - vmovdqa64 64*2(%r8),%zmm14 - vmovdqa64 64*3(%r8),%zmm15 - vmovdqa64 64*4(%r8),%zmm16 - vmovdqa64 64*5(%r8),%zmm27 - vmovdqa64 64*6(%r8),%zmm28 - vmovdqa64 64*7(%r8),%zmm29 - vmovdqa64 64*8(%r8),%zmm30 - vmovdqa64 64*9(%r8),%zmm31 - vmovdqa64 64*10(%r8),%zmm22 - vmovdqa64 64*11(%r8),%zmm23 - vmovdqa64 64*12(%r8),%zmm24 - vmovdqa64 64*13(%r8),%zmm25 - vmovdqa64 64*14(%r8),%zmm26 - vmovdqa64 64*15(%r8),%zmm17 - vmovdqa64 64*16(%r8),%zmm18 - vmovdqa64 64*17(%r8),%zmm19 - vmovdqa64 64*18(%r8),%zmm20 - vmovdqa64 64*19(%r8),%zmm21 - vmovdqu64 40*0-96(%rdi),%zmm0{%k6}{z} -# vpxorq %zmm5,%zmm5,%zmm5 - vmovdqu64 40*1-96(%rdi),%zmm1{%k6}{z} - vmovdqu64 40*2-96(%rdi),%zmm2{%k6}{z} - vmovdqu64 40*3-96(%rdi),%zmm3{%k6}{z} - vmovdqu64 40*4-96(%rdi),%zmm4{%k6}{z} - lea iotas+12*8(%rip), %r10 - mov $12/2, %eax - call __KeccakF1600 - vmovdqu64 %zmm0,40*0-96(%rdi){%k6} - vmovdqu64 %zmm1,40*1-96(%rdi){%k6} - vmovdqu64 %zmm2,40*2-96(%rdi){%k6} - vmovdqu64 %zmm3,40*3-96(%rdi){%k6} - vmovdqu64 %zmm4,40*4-96(%rdi){%k6} - vzeroupper - ret -.ifdef macOS -.else -.size KeccakP1600_AVX512_Permute_12rounds,.-KeccakP1600_AVX512_Permute_12rounds -.endif - -# ----------------------------------------------------------------------------- -# -# size_t KeccakP1600_AVX512_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); -# %rdi %rsi %rdx %rcx -# -.ifdef macOS -.globl _KeccakP1600_AVX512_12rounds_FastLoop_Absorb -_KeccakP1600_AVX512_12rounds_FastLoop_Absorb: -.else -.globl KeccakP1600_AVX512_12rounds_FastLoop_Absorb -.type KeccakP1600_AVX512_12rounds_FastLoop_Absorb,@function -KeccakP1600_AVX512_12rounds_FastLoop_Absorb: -.endif -.balign 32 - push %rbx - push %r10 - shr $3, %rcx # rcx = data length in lanes - mov %rdx, %rbx # rbx = initial data pointer - cmp %rsi, %rcx - jb KeccakP1600_AVX512_FastLoop_Absorb_Exit - lea 96(%rdi),%rdi - lea theta_perm(%rip),%r8 - kxnorw %k6,%k6,%k6 - kshiftrw $15,%k6,%k1 - kshiftrw $11,%k6,%k6 - kshiftlw $1,%k1,%k2 - kshiftlw $2,%k1,%k3 - kshiftlw $3,%k1,%k4 - kshiftlw $4,%k1,%k5 - vmovdqa64 64*1(%r8),%zmm13 - vmovdqa64 64*2(%r8),%zmm14 - vmovdqa64 64*3(%r8),%zmm15 - vmovdqa64 64*4(%r8),%zmm16 - vmovdqa64 64*5(%r8),%zmm27 - vmovdqa64 64*6(%r8),%zmm28 - vmovdqa64 64*7(%r8),%zmm29 - vmovdqa64 64*8(%r8),%zmm30 - vmovdqa64 
64*9(%r8),%zmm31 - vmovdqa64 64*10(%r8),%zmm22 - vmovdqa64 64*11(%r8),%zmm23 - vmovdqa64 64*12(%r8),%zmm24 - vmovdqa64 64*13(%r8),%zmm25 - vmovdqa64 64*14(%r8),%zmm26 - vmovdqa64 64*15(%r8),%zmm17 - vmovdqa64 64*16(%r8),%zmm18 - vmovdqa64 64*17(%r8),%zmm19 - vmovdqa64 64*18(%r8),%zmm20 - vmovdqa64 64*19(%r8),%zmm21 - vmovdqu64 40*0-96(%rdi),%zmm0{%k6}{z} - vmovdqu64 40*1-96(%rdi),%zmm1{%k6}{z} - vmovdqu64 40*2-96(%rdi),%zmm2{%k6}{z} - vmovdqu64 40*3-96(%rdi),%zmm3{%k6}{z} - vmovdqu64 40*4-96(%rdi),%zmm4{%k6}{z} - cmp $21, %rsi - jnz KeccakP1600_AVX512_FastLoop_Absorb_Not21Lanes - sub $21, %rcx -KeccakP1600_AVX512_FastLoop_Absorb_Loop21Lanes: - vmovdqu64 8*0(%rdx),%zmm5{%k6}{z} - vmovdqu64 8*5(%rdx),%zmm6{%k6}{z} - vmovdqu64 8*10(%rdx),%zmm7{%k6}{z} - vmovdqu64 8*15(%rdx),%zmm8{%k6}{z} - vmovdqu64 8*20(%rdx),%zmm9{%k1}{z} - vpxorq %zmm5,%zmm0,%zmm0 - vpxorq %zmm6,%zmm1,%zmm1 - vpxorq %zmm7,%zmm2,%zmm2 - vpxorq %zmm8,%zmm3,%zmm3 - vpxorq %zmm9,%zmm4,%zmm4 - add $21*8, %rdx - lea iotas+12*8(%rip), %r10 - mov $12/2, %eax - call __KeccakF1600 - sub $21, %rcx - jnc KeccakP1600_AVX512_FastLoop_Absorb_Loop21Lanes -KeccakP1600_AVX512_FastLoop_Absorb_SaveAndExit: - vmovdqu64 %zmm0,40*0-96(%rdi){%k6} - vmovdqu64 %zmm1,40*1-96(%rdi){%k6} - vmovdqu64 %zmm2,40*2-96(%rdi){%k6} - vmovdqu64 %zmm3,40*3-96(%rdi){%k6} - vmovdqu64 %zmm4,40*4-96(%rdi){%k6} -KeccakP1600_AVX512_FastLoop_Absorb_Exit: - vzeroupper - mov %rdx, %rax # return number of bytes processed - sub %rbx, %rax - pop %r10 - pop %rbx - ret -KeccakP1600_AVX512_FastLoop_Absorb_Not21Lanes: - cmp $17, %rsi - jnz KeccakP1600_AVX512_FastLoop_Absorb_Not17Lanes - sub $17, %rcx -KeccakP1600_AVX512_FastLoop_Absorb_Loop17Lanes: - vmovdqu64 8*0(%rdx),%zmm5{%k6}{z} - vmovdqu64 8*5(%rdx),%zmm6{%k6}{z} - vmovdqu64 8*10(%rdx),%zmm7{%k6}{z} - vmovdqu64 8*15(%rdx),%zmm8{%k1}{z} - vmovdqu64 8*15(%rdx),%zmm8{%k2} - vpxorq %zmm5,%zmm0,%zmm0 - vpxorq %zmm6,%zmm1,%zmm1 - vpxorq %zmm7,%zmm2,%zmm2 - vpxorq %zmm8,%zmm3,%zmm3 - add $17*8, %rdx - lea iotas+12*8(%rip), %r10 - mov $12/2, %eax - call __KeccakF1600 - sub $17, %rcx - jnc KeccakP1600_AVX512_FastLoop_Absorb_Loop17Lanes - jmp KeccakP1600_AVX512_FastLoop_Absorb_SaveAndExit -KeccakP1600_AVX512_FastLoop_Absorb_Not17Lanes: - lea -96(%rdi), %rdi -KeccakP1600_AVX512_FastLoop_Absorb_LanesLoop: - mov %rsi, %rax - mov %rdi, %r10 -KeccakP1600_AVX512_FastLoop_Absorb_LanesAddLoop: - mov (%rdx), %r8 - add $8, %rdx - xor %r8, (%r10) - add $8, %r10 - sub $1, %rax - jnz KeccakP1600_AVX512_FastLoop_Absorb_LanesAddLoop - sub %rsi, %rcx - push %rdi - push %rsi - push %rdx - push %rcx -.ifdef macOS - call _KeccakP1600_AVX512_Permute_12rounds -.else - call KeccakP1600_AVX512_Permute_12rounds@PLT -.endif - pop %rcx - pop %rdx - pop %rsi - pop %rdi - cmp %rsi, %rcx - jae KeccakP1600_AVX512_FastLoop_Absorb_LanesLoop - jmp KeccakP1600_AVX512_FastLoop_Absorb_Exit -.ifdef macOS -.else -.size KeccakP1600_AVX512_12rounds_FastLoop_Absorb,.-KeccakP1600_AVX512_12rounds_FastLoop_Absorb -.endif -.balign 64 -theta_perm: - .quad 0, 1, 2, 3, 4, 5, 6, 7 # [not used] - .quad 4, 0, 1, 2, 3, 5, 6, 7 - .quad 3, 4, 0, 1, 2, 5, 6, 7 - .quad 2, 3, 4, 0, 1, 5, 6, 7 - .quad 1, 2, 3, 4, 0, 5, 6, 7 -rhotates1: - .quad 0, 44, 43, 21, 14, 0, 0, 0 # [0][0] [1][1] [2][2] [3][3] [4][4] - .quad 18, 1, 6, 25, 8, 0, 0, 0 # [4][0] [0][1] [1][2] [2][3] [3][4] - .quad 41, 2, 62, 55, 39, 0, 0, 0 # [3][0] [4][1] [0][2] [1][3] [2][4] - .quad 3, 45, 61, 28, 20, 0, 0, 0 # [2][0] [3][1] [4][2] [0][3] [1][4] - .quad 36, 10, 15, 56, 27, 0, 0, 0 # [1][0] [2][1] [3][2] [4][3] [0][4] 
-rhotates0: - .quad 0, 1, 62, 28, 27, 0, 0, 0 - .quad 36, 44, 6, 55, 20, 0, 0, 0 - .quad 3, 10, 43, 25, 39, 0, 0, 0 - .quad 41, 45, 15, 21, 8, 0, 0, 0 - .quad 18, 2, 61, 56, 14, 0, 0, 0 -pi0_perm: - .quad 0, 3, 1, 4, 2, 5, 6, 7 - .quad 1, 4, 2, 0, 3, 5, 6, 7 - .quad 2, 0, 3, 1, 4, 5, 6, 7 - .quad 3, 1, 4, 2, 0, 5, 6, 7 - .quad 4, 2, 0, 3, 1, 5, 6, 7 -iotas: - .quad 0x0000000000000001 - .quad 0x0000000000008082 - .quad 0x800000000000808a - .quad 0x8000000080008000 - .quad 0x000000000000808b - .quad 0x0000000080000001 - .quad 0x8000000080008081 - .quad 0x8000000000008009 - .quad 0x000000000000008a - .quad 0x0000000000000088 - .quad 0x0000000080008009 - .quad 0x000000008000000a - .quad 0x000000008000808b - .quad 0x800000000000008b - .quad 0x8000000000008089 - .quad 0x8000000000008003 - .quad 0x8000000000008002 - .quad 0x8000000000000080 - .quad 0x000000000000800a - .quad 0x800000008000000a - .quad 0x8000000080008081 - .quad 0x8000000000008080 - .quad 0x0000000080000001 - .quad 0x8000000080008008 -iotas_end: -.asciz "Keccak-1600 for AVX-512F, CRYPTOGAMS by " diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-SnP.h b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-SnP.h deleted file mode 100644 index 709469c..0000000 --- a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-SnP.h +++ /dev/null @@ -1,74 +0,0 @@ -/* -K12 based on the eXtended Keccak Code Package (XKCP) -https://github.com/XKCP/XKCP - -The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. - -Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to the Keccak Team website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -Please refer to the XKCP for more details. 
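[Reviewer note] The FastLoop_Absorb routines removed in this patch (the plain-C AVX-512 version earlier and the AVX2/AVX-512 assembly versions above) all follow the same contract: XOR one full block of input lanes into the state, run the 12-round permutation, repeat while a whole block remains, and return the number of bytes consumed so the caller can queue the tail. Below is a hedged Rust sketch of the 21-lane case (the 168-byte rate used by KangarooTwelve; the C routines also special-case 17 lanes). fast_loop_absorb and permute_12rounds are illustrative names, and a real back end would replace the permutation stub.

```rust
const RATE_LANES: usize = 21; // 21 lanes * 8 bytes = 168-byte rate

/// Stub for the 12-round Keccak-p[1600] permutation (e.g. the dispatch
/// sketched in the earlier note).
fn permute_12rounds(_state: &mut [u64; 25]) { /* ... */ }

/// Absorb as many whole 168-byte blocks as possible and report progress,
/// mirroring the return-bytes-processed contract of the C FastLoop_Absorb.
fn fast_loop_absorb(state: &mut [u64; 25], data: &[u8]) -> usize {
    let mut consumed = 0;
    while data.len() - consumed >= RATE_LANES * 8 {
        let block = &data[consumed..consumed + RATE_LANES * 8];
        // XOR the 21 rate lanes in little-endian order, as the x86 paths do;
        // zip() stops before the 4 capacity lanes.
        for (lane, chunk) in state.iter_mut().zip(block.chunks_exact(8)) {
            *lane ^= u64::from_le_bytes(chunk.try_into().unwrap());
        }
        permute_12rounds(state);
        consumed += RATE_LANES * 8;
    }
    consumed
}
```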
-*/ - -#ifndef _KeccakP_1600_SnP_h_ -#define _KeccakP_1600_SnP_h_ - -/* Keccak-p[1600] */ - -#define KeccakP1600_stateSizeInBytes 200 -#define KeccakP1600_stateAlignment 8 -#define KeccakP1600_12rounds_FastLoop_supported - -const char * KeccakP1600_GetImplementation(); -void KeccakP1600_Initialize(void *state); -void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); -void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); -void KeccakP1600_Permute_12rounds(void *state); -void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); -size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); - -void KeccakP1600_AVX512_Initialize(void *state); -void KeccakP1600_AVX512_AddByte(void *state, unsigned char data, unsigned int offset); -void KeccakP1600_AVX512_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); -void KeccakP1600_AVX512_Permute_12rounds(void *state); -void KeccakP1600_AVX512_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); -size_t KeccakP1600_AVX512_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); - -void KeccakP1600_AVX2_Initialize(void *state); -void KeccakP1600_AVX2_AddByte(void *state, unsigned char data, unsigned int offset); -void KeccakP1600_AVX2_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); -void KeccakP1600_AVX2_Permute_12rounds(void *state); -void KeccakP1600_AVX2_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); -size_t KeccakP1600_AVX2_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); - -void KeccakP1600_opt64_Initialize(void *state); -void KeccakP1600_opt64_AddByte(void *state, unsigned char data, unsigned int offset); -void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); -void KeccakP1600_opt64_Permute_12rounds(void *state); -void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); -size_t KeccakP1600_opt64_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); - -/* Keccak-p[1600]×2 */ - -int KeccakP1600times2_IsAvailable(); -const char * KeccakP1600times2_GetImplementation(); - -/* Keccak-p[1600]×4 */ - -int KeccakP1600times4_IsAvailable(); -const char * KeccakP1600times4_GetImplementation(); - -/* Keccak-p[1600]×8 */ - -int KeccakP1600times8_IsAvailable(); -const char * KeccakP1600times8_GetImplementation(); - -#endif diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c deleted file mode 100644 index e98056d..0000000 --- a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-opt64.c +++ /dev/null @@ -1,1026 +0,0 @@ -/* -K12 based on the eXtended Keccak Code Package (XKCP) -https://github.com/XKCP/XKCP - -The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. - -Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 
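[Reviewer note] The KeccakP-1600-SnP.h removed just above exposes a small fixed set of operations (Initialize, AddByte, AddBytes, Permute_12rounds, ExtractBytes, FastLoop_Absorb) repeated once per back end: generic, AVX-512, AVX2 and opt64. A natural Rust shape for that interface is a single trait with one impl per back end; the trait and method names below are illustrative only and are not claimed to match the crate added by this PR.

```rust
/// Rough Rust analogue of the SnP interface declared in the removed header:
/// one trait, one impl per back end, instead of four parallel prototype sets.
trait KeccakP1600 {
    fn initialize(state: &mut [u8; 200]);
    fn add_bytes(state: &mut [u8; 200], data: &[u8], offset: usize);
    fn permute_12rounds(state: &mut [u8; 200]);
    fn extract_bytes(state: &[u8; 200], out: &mut [u8], offset: usize);
}

/// Portable back end, standing in for the KeccakP1600_opt64_* prototypes.
struct Opt64;

impl KeccakP1600 for Opt64 {
    fn initialize(state: &mut [u8; 200]) {
        state.fill(0); // memset(state, 0, 200) in the C version
    }
    fn add_bytes(state: &mut [u8; 200], data: &[u8], offset: usize) {
        // XOR the input into the state at `offset`, as AddBytes does.
        for (s, d) in state[offset..].iter_mut().zip(data) {
            *s ^= *d;
        }
    }
    fn permute_12rounds(_state: &mut [u8; 200]) {
        // Placeholder: the 12-round Keccak-p[1600] permutation goes here.
    }
    fn extract_bytes(state: &[u8; 200], out: &mut [u8], offset: usize) {
        // Straight copy, matching the memcpy-based ExtractBytes above.
        out.copy_from_slice(&state[offset..offset + out.len()]);
    }
}
```

Generics over such a trait would let the runtime dispatch from the earlier note select a back end without the four parallel declaration blocks the C header needs.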
- -For more information, feedback or questions, please refer to the Keccak Team website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -Please refer to the XKCP for more details. -*/ - -#include -#include -#include -#include "brg_endian.h" -#include - -#define KeccakP1600_opt64_implementation_config "all rounds unrolled" -#define KeccakP1600_opt64_fullUnrolling -/* Or */ -/* -#define KeccakP1600_opt64_implementation_config "6 rounds unrolled" -#define KeccakP1600_opt64_unrolling 6 -*/ -/* Or */ -/* -#define KeccakP1600_opt64_implementation_config "lane complementing, 6 rounds unrolled" -#define KeccakP1600_opt64_unrolling 6 -#define KeccakP1600_opt64_useLaneComplementing -*/ -/* Or */ -/* -#define KeccakP1600_opt64_implementation_config "lane complementing, all rounds unrolled" -#define KeccakP1600_opt64_fullUnrolling -#define KeccakP1600_opt64_useLaneComplementing -*/ -/* Or */ -/* -#define KeccakP1600_opt64_implementation_config "lane complementing, all rounds unrolled, using SHLD for rotations" -#define KeccakP1600_opt64_fullUnrolling -#define KeccakP1600_opt64_useLaneComplementing -#define KeccakP1600_opt64_useSHLD -*/ - -#if defined(KeccakP1600_opt64_useLaneComplementing) -#define UseBebigokimisa -#endif - -#if defined(_MSC_VER) -#define ROL64(a, offset) _rotl64(a, offset) -#elif defined(KeccakP1600_opt64_useSHLD) - #define ROL64(x,N) ({ \ - register uint64_t __out; \ - register uint64_t __in = x; \ - __asm__ ("shld %2,%0,%0" : "=r"(__out) : "0"(__in), "i"(N)); \ - __out; \ - }) -#else -#define ROL64(a, offset) ((((uint64_t)a) << offset) ^ (((uint64_t)a) >> (64-offset))) -#endif - -#ifdef KeccakP1600_opt64_fullUnrolling -#define FullUnrolling -#else -#define Unrolling KeccakP1600_opt64_unrolling -#endif - -static const uint64_t KeccakF1600RoundConstants[24] = { - 0x0000000000000001ULL, - 0x0000000000008082ULL, - 0x800000000000808aULL, - 0x8000000080008000ULL, - 0x000000000000808bULL, - 0x0000000080000001ULL, - 0x8000000080008081ULL, - 0x8000000000008009ULL, - 0x000000000000008aULL, - 0x0000000000000088ULL, - 0x0000000080008009ULL, - 0x000000008000000aULL, - 0x000000008000808bULL, - 0x800000000000008bULL, - 0x8000000000008089ULL, - 0x8000000000008003ULL, - 0x8000000000008002ULL, - 0x8000000000000080ULL, - 0x000000000000800aULL, - 0x800000008000000aULL, - 0x8000000080008081ULL, - 0x8000000000008080ULL, - 0x0000000080000001ULL, - 0x8000000080008008ULL }; - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_opt64_Initialize(void *state) -{ - memset(state, 0, 200); -#ifdef KeccakP1600_opt64_useLaneComplementing - ((uint64_t*)state)[ 1] = ~(uint64_t)0; - ((uint64_t*)state)[ 2] = ~(uint64_t)0; - ((uint64_t*)state)[ 8] = ~(uint64_t)0; - ((uint64_t*)state)[12] = ~(uint64_t)0; - ((uint64_t*)state)[17] = ~(uint64_t)0; - ((uint64_t*)state)[20] = ~(uint64_t)0; -#endif -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_opt64_AddBytesInLane(void *state, unsigned int lanePosition, const unsigned char *data, unsigned int offset, unsigned int length) -{ -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) - uint64_t lane; - if (length == 0) - return; - if (length == 1) - lane = data[0]; - else { - lane = 0; - memcpy(&lane, data, length); - } - lane <<= offset*8; -#else - uint64_t lane = 0; - unsigned int i; - for(i=0; i 0) { \ - unsigned int 
_bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ - if (_bytesInLane > _sizeLeft) \ - _bytesInLane = _sizeLeft; \ - SnP_AddBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ - _sizeLeft -= _bytesInLane; \ - _lanePosition++; \ - _offsetInLane = 0; \ - _curData += _bytesInLane; \ - } \ - } \ - } - -void KeccakP1600_opt64_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) -{ - SnP_AddBytes(state, data, offset, length, KeccakP1600_opt64_AddLanes, KeccakP1600_opt64_AddBytesInLane, 8); -} - -/* ---------------------------------------------------------------- */ - -#define declareABCDE \ - uint64_t Aba, Abe, Abi, Abo, Abu; \ - uint64_t Aga, Age, Agi, Ago, Agu; \ - uint64_t Aka, Ake, Aki, Ako, Aku; \ - uint64_t Ama, Ame, Ami, Amo, Amu; \ - uint64_t Asa, Ase, Asi, Aso, Asu; \ - uint64_t Bba, Bbe, Bbi, Bbo, Bbu; \ - uint64_t Bga, Bge, Bgi, Bgo, Bgu; \ - uint64_t Bka, Bke, Bki, Bko, Bku; \ - uint64_t Bma, Bme, Bmi, Bmo, Bmu; \ - uint64_t Bsa, Bse, Bsi, Bso, Bsu; \ - uint64_t Ca, Ce, Ci, Co, Cu; \ - uint64_t Da, De, Di, Do, Du; \ - uint64_t Eba, Ebe, Ebi, Ebo, Ebu; \ - uint64_t Ega, Ege, Egi, Ego, Egu; \ - uint64_t Eka, Eke, Eki, Eko, Eku; \ - uint64_t Ema, Eme, Emi, Emo, Emu; \ - uint64_t Esa, Ese, Esi, Eso, Esu; \ - -#define prepareTheta \ - Ca = Aba^Aga^Aka^Ama^Asa; \ - Ce = Abe^Age^Ake^Ame^Ase; \ - Ci = Abi^Agi^Aki^Ami^Asi; \ - Co = Abo^Ago^Ako^Amo^Aso; \ - Cu = Abu^Agu^Aku^Amu^Asu; \ - -#ifdef UseBebigokimisa -/* --- Code for round, with prepare-theta (lane complementing pattern 'bebigokimisa') */ -/* --- 64-bit lanes mapped to 64-bit words */ -#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ - Da = Cu^ROL64(Ce, 1); \ - De = Ca^ROL64(Ci, 1); \ - Di = Ce^ROL64(Co, 1); \ - Do = Ci^ROL64(Cu, 1); \ - Du = Co^ROL64(Ca, 1); \ -\ - A##ba ^= Da; \ - Bba = A##ba; \ - A##ge ^= De; \ - Bbe = ROL64(A##ge, 44); \ - A##ki ^= Di; \ - Bbi = ROL64(A##ki, 43); \ - A##mo ^= Do; \ - Bbo = ROL64(A##mo, 21); \ - A##su ^= Du; \ - Bbu = ROL64(A##su, 14); \ - E##ba = Bba ^( Bbe | Bbi ); \ - E##ba ^= KeccakF1600RoundConstants[i]; \ - Ca = E##ba; \ - E##be = Bbe ^((~Bbi)| Bbo ); \ - Ce = E##be; \ - E##bi = Bbi ^( Bbo & Bbu ); \ - Ci = E##bi; \ - E##bo = Bbo ^( Bbu | Bba ); \ - Co = E##bo; \ - E##bu = Bbu ^( Bba & Bbe ); \ - Cu = E##bu; \ -\ - A##bo ^= Do; \ - Bga = ROL64(A##bo, 28); \ - A##gu ^= Du; \ - Bge = ROL64(A##gu, 20); \ - A##ka ^= Da; \ - Bgi = ROL64(A##ka, 3); \ - A##me ^= De; \ - Bgo = ROL64(A##me, 45); \ - A##si ^= Di; \ - Bgu = ROL64(A##si, 61); \ - E##ga = Bga ^( Bge | Bgi ); \ - Ca ^= E##ga; \ - E##ge = Bge ^( Bgi & Bgo ); \ - Ce ^= E##ge; \ - E##gi = Bgi ^( Bgo |(~Bgu)); \ - Ci ^= E##gi; \ - E##go = Bgo ^( Bgu | Bga ); \ - Co ^= E##go; \ - E##gu = Bgu ^( Bga & Bge ); \ - Cu ^= E##gu; \ -\ - A##be ^= De; \ - Bka = ROL64(A##be, 1); \ - A##gi ^= Di; \ - Bke = ROL64(A##gi, 6); \ - A##ko ^= Do; \ - Bki = ROL64(A##ko, 25); \ - A##mu ^= Du; \ - Bko = ROL64(A##mu, 8); \ - A##sa ^= Da; \ - Bku = ROL64(A##sa, 18); \ - E##ka = Bka ^( Bke | Bki ); \ - Ca ^= E##ka; \ - E##ke = Bke ^( Bki & Bko ); \ - Ce ^= E##ke; \ - E##ki = Bki ^((~Bko)& Bku ); \ - Ci ^= E##ki; \ - E##ko = (~Bko)^( Bku | Bka ); \ - Co ^= E##ko; \ - E##ku = Bku ^( Bka & Bke ); \ - Cu ^= E##ku; \ -\ - A##bu ^= Du; \ - Bma = ROL64(A##bu, 27); \ - A##ga ^= Da; \ - Bme = ROL64(A##ga, 36); \ - A##ke ^= De; \ - Bmi = ROL64(A##ke, 10); \ - A##mi ^= Di; \ - Bmo = ROL64(A##mi, 15); \ - A##so ^= Do; \ - Bmu = ROL64(A##so, 56); \ - E##ma = Bma ^( Bme & Bmi ); \ - Ca ^= E##ma; \ - E##me = Bme ^( 
Bmi | Bmo ); \ - Ce ^= E##me; \ - E##mi = Bmi ^((~Bmo)| Bmu ); \ - Ci ^= E##mi; \ - E##mo = (~Bmo)^( Bmu & Bma ); \ - Co ^= E##mo; \ - E##mu = Bmu ^( Bma | Bme ); \ - Cu ^= E##mu; \ -\ - A##bi ^= Di; \ - Bsa = ROL64(A##bi, 62); \ - A##go ^= Do; \ - Bse = ROL64(A##go, 55); \ - A##ku ^= Du; \ - Bsi = ROL64(A##ku, 39); \ - A##ma ^= Da; \ - Bso = ROL64(A##ma, 41); \ - A##se ^= De; \ - Bsu = ROL64(A##se, 2); \ - E##sa = Bsa ^((~Bse)& Bsi ); \ - Ca ^= E##sa; \ - E##se = (~Bse)^( Bsi | Bso ); \ - Ce ^= E##se; \ - E##si = Bsi ^( Bso & Bsu ); \ - Ci ^= E##si; \ - E##so = Bso ^( Bsu | Bsa ); \ - Co ^= E##so; \ - E##su = Bsu ^( Bsa & Bse ); \ - Cu ^= E##su; \ -\ - -/* --- Code for round (lane complementing pattern 'bebigokimisa') */ -/* --- 64-bit lanes mapped to 64-bit words */ -#define thetaRhoPiChiIota(i, A, E) \ - Da = Cu^ROL64(Ce, 1); \ - De = Ca^ROL64(Ci, 1); \ - Di = Ce^ROL64(Co, 1); \ - Do = Ci^ROL64(Cu, 1); \ - Du = Co^ROL64(Ca, 1); \ -\ - A##ba ^= Da; \ - Bba = A##ba; \ - A##ge ^= De; \ - Bbe = ROL64(A##ge, 44); \ - A##ki ^= Di; \ - Bbi = ROL64(A##ki, 43); \ - A##mo ^= Do; \ - Bbo = ROL64(A##mo, 21); \ - A##su ^= Du; \ - Bbu = ROL64(A##su, 14); \ - E##ba = Bba ^( Bbe | Bbi ); \ - E##ba ^= KeccakF1600RoundConstants[i]; \ - E##be = Bbe ^((~Bbi)| Bbo ); \ - E##bi = Bbi ^( Bbo & Bbu ); \ - E##bo = Bbo ^( Bbu | Bba ); \ - E##bu = Bbu ^( Bba & Bbe ); \ -\ - A##bo ^= Do; \ - Bga = ROL64(A##bo, 28); \ - A##gu ^= Du; \ - Bge = ROL64(A##gu, 20); \ - A##ka ^= Da; \ - Bgi = ROL64(A##ka, 3); \ - A##me ^= De; \ - Bgo = ROL64(A##me, 45); \ - A##si ^= Di; \ - Bgu = ROL64(A##si, 61); \ - E##ga = Bga ^( Bge | Bgi ); \ - E##ge = Bge ^( Bgi & Bgo ); \ - E##gi = Bgi ^( Bgo |(~Bgu)); \ - E##go = Bgo ^( Bgu | Bga ); \ - E##gu = Bgu ^( Bga & Bge ); \ -\ - A##be ^= De; \ - Bka = ROL64(A##be, 1); \ - A##gi ^= Di; \ - Bke = ROL64(A##gi, 6); \ - A##ko ^= Do; \ - Bki = ROL64(A##ko, 25); \ - A##mu ^= Du; \ - Bko = ROL64(A##mu, 8); \ - A##sa ^= Da; \ - Bku = ROL64(A##sa, 18); \ - E##ka = Bka ^( Bke | Bki ); \ - E##ke = Bke ^( Bki & Bko ); \ - E##ki = Bki ^((~Bko)& Bku ); \ - E##ko = (~Bko)^( Bku | Bka ); \ - E##ku = Bku ^( Bka & Bke ); \ -\ - A##bu ^= Du; \ - Bma = ROL64(A##bu, 27); \ - A##ga ^= Da; \ - Bme = ROL64(A##ga, 36); \ - A##ke ^= De; \ - Bmi = ROL64(A##ke, 10); \ - A##mi ^= Di; \ - Bmo = ROL64(A##mi, 15); \ - A##so ^= Do; \ - Bmu = ROL64(A##so, 56); \ - E##ma = Bma ^( Bme & Bmi ); \ - E##me = Bme ^( Bmi | Bmo ); \ - E##mi = Bmi ^((~Bmo)| Bmu ); \ - E##mo = (~Bmo)^( Bmu & Bma ); \ - E##mu = Bmu ^( Bma | Bme ); \ -\ - A##bi ^= Di; \ - Bsa = ROL64(A##bi, 62); \ - A##go ^= Do; \ - Bse = ROL64(A##go, 55); \ - A##ku ^= Du; \ - Bsi = ROL64(A##ku, 39); \ - A##ma ^= Da; \ - Bso = ROL64(A##ma, 41); \ - A##se ^= De; \ - Bsu = ROL64(A##se, 2); \ - E##sa = Bsa ^((~Bse)& Bsi ); \ - E##se = (~Bse)^( Bsi | Bso ); \ - E##si = Bsi ^( Bso & Bsu ); \ - E##so = Bso ^( Bsu | Bsa ); \ - E##su = Bsu ^( Bsa & Bse ); \ -\ - -#else /* UseBebigokimisa */ -/* --- Code for round, with prepare-theta */ -/* --- 64-bit lanes mapped to 64-bit words */ -#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ - Da = Cu^ROL64(Ce, 1); \ - De = Ca^ROL64(Ci, 1); \ - Di = Ce^ROL64(Co, 1); \ - Do = Ci^ROL64(Cu, 1); \ - Du = Co^ROL64(Ca, 1); \ -\ - A##ba ^= Da; \ - Bba = A##ba; \ - A##ge ^= De; \ - Bbe = ROL64(A##ge, 44); \ - A##ki ^= Di; \ - Bbi = ROL64(A##ki, 43); \ - A##mo ^= Do; \ - Bbo = ROL64(A##mo, 21); \ - A##su ^= Du; \ - Bbu = ROL64(A##su, 14); \ - E##ba = Bba ^((~Bbe)& Bbi ); \ - E##ba ^= KeccakF1600RoundConstants[i]; \ - Ca = E##ba; \ - E##be = Bbe 
^((~Bbi)& Bbo ); \ - Ce = E##be; \ - E##bi = Bbi ^((~Bbo)& Bbu ); \ - Ci = E##bi; \ - E##bo = Bbo ^((~Bbu)& Bba ); \ - Co = E##bo; \ - E##bu = Bbu ^((~Bba)& Bbe ); \ - Cu = E##bu; \ -\ - A##bo ^= Do; \ - Bga = ROL64(A##bo, 28); \ - A##gu ^= Du; \ - Bge = ROL64(A##gu, 20); \ - A##ka ^= Da; \ - Bgi = ROL64(A##ka, 3); \ - A##me ^= De; \ - Bgo = ROL64(A##me, 45); \ - A##si ^= Di; \ - Bgu = ROL64(A##si, 61); \ - E##ga = Bga ^((~Bge)& Bgi ); \ - Ca ^= E##ga; \ - E##ge = Bge ^((~Bgi)& Bgo ); \ - Ce ^= E##ge; \ - E##gi = Bgi ^((~Bgo)& Bgu ); \ - Ci ^= E##gi; \ - E##go = Bgo ^((~Bgu)& Bga ); \ - Co ^= E##go; \ - E##gu = Bgu ^((~Bga)& Bge ); \ - Cu ^= E##gu; \ -\ - A##be ^= De; \ - Bka = ROL64(A##be, 1); \ - A##gi ^= Di; \ - Bke = ROL64(A##gi, 6); \ - A##ko ^= Do; \ - Bki = ROL64(A##ko, 25); \ - A##mu ^= Du; \ - Bko = ROL64(A##mu, 8); \ - A##sa ^= Da; \ - Bku = ROL64(A##sa, 18); \ - E##ka = Bka ^((~Bke)& Bki ); \ - Ca ^= E##ka; \ - E##ke = Bke ^((~Bki)& Bko ); \ - Ce ^= E##ke; \ - E##ki = Bki ^((~Bko)& Bku ); \ - Ci ^= E##ki; \ - E##ko = Bko ^((~Bku)& Bka ); \ - Co ^= E##ko; \ - E##ku = Bku ^((~Bka)& Bke ); \ - Cu ^= E##ku; \ -\ - A##bu ^= Du; \ - Bma = ROL64(A##bu, 27); \ - A##ga ^= Da; \ - Bme = ROL64(A##ga, 36); \ - A##ke ^= De; \ - Bmi = ROL64(A##ke, 10); \ - A##mi ^= Di; \ - Bmo = ROL64(A##mi, 15); \ - A##so ^= Do; \ - Bmu = ROL64(A##so, 56); \ - E##ma = Bma ^((~Bme)& Bmi ); \ - Ca ^= E##ma; \ - E##me = Bme ^((~Bmi)& Bmo ); \ - Ce ^= E##me; \ - E##mi = Bmi ^((~Bmo)& Bmu ); \ - Ci ^= E##mi; \ - E##mo = Bmo ^((~Bmu)& Bma ); \ - Co ^= E##mo; \ - E##mu = Bmu ^((~Bma)& Bme ); \ - Cu ^= E##mu; \ -\ - A##bi ^= Di; \ - Bsa = ROL64(A##bi, 62); \ - A##go ^= Do; \ - Bse = ROL64(A##go, 55); \ - A##ku ^= Du; \ - Bsi = ROL64(A##ku, 39); \ - A##ma ^= Da; \ - Bso = ROL64(A##ma, 41); \ - A##se ^= De; \ - Bsu = ROL64(A##se, 2); \ - E##sa = Bsa ^((~Bse)& Bsi ); \ - Ca ^= E##sa; \ - E##se = Bse ^((~Bsi)& Bso ); \ - Ce ^= E##se; \ - E##si = Bsi ^((~Bso)& Bsu ); \ - Ci ^= E##si; \ - E##so = Bso ^((~Bsu)& Bsa ); \ - Co ^= E##so; \ - E##su = Bsu ^((~Bsa)& Bse ); \ - Cu ^= E##su; \ -\ - -/* --- Code for round */ -/* --- 64-bit lanes mapped to 64-bit words */ -#define thetaRhoPiChiIota(i, A, E) \ - Da = Cu^ROL64(Ce, 1); \ - De = Ca^ROL64(Ci, 1); \ - Di = Ce^ROL64(Co, 1); \ - Do = Ci^ROL64(Cu, 1); \ - Du = Co^ROL64(Ca, 1); \ -\ - A##ba ^= Da; \ - Bba = A##ba; \ - A##ge ^= De; \ - Bbe = ROL64(A##ge, 44); \ - A##ki ^= Di; \ - Bbi = ROL64(A##ki, 43); \ - A##mo ^= Do; \ - Bbo = ROL64(A##mo, 21); \ - A##su ^= Du; \ - Bbu = ROL64(A##su, 14); \ - E##ba = Bba ^((~Bbe)& Bbi ); \ - E##ba ^= KeccakF1600RoundConstants[i]; \ - E##be = Bbe ^((~Bbi)& Bbo ); \ - E##bi = Bbi ^((~Bbo)& Bbu ); \ - E##bo = Bbo ^((~Bbu)& Bba ); \ - E##bu = Bbu ^((~Bba)& Bbe ); \ -\ - A##bo ^= Do; \ - Bga = ROL64(A##bo, 28); \ - A##gu ^= Du; \ - Bge = ROL64(A##gu, 20); \ - A##ka ^= Da; \ - Bgi = ROL64(A##ka, 3); \ - A##me ^= De; \ - Bgo = ROL64(A##me, 45); \ - A##si ^= Di; \ - Bgu = ROL64(A##si, 61); \ - E##ga = Bga ^((~Bge)& Bgi ); \ - E##ge = Bge ^((~Bgi)& Bgo ); \ - E##gi = Bgi ^((~Bgo)& Bgu ); \ - E##go = Bgo ^((~Bgu)& Bga ); \ - E##gu = Bgu ^((~Bga)& Bge ); \ -\ - A##be ^= De; \ - Bka = ROL64(A##be, 1); \ - A##gi ^= Di; \ - Bke = ROL64(A##gi, 6); \ - A##ko ^= Do; \ - Bki = ROL64(A##ko, 25); \ - A##mu ^= Du; \ - Bko = ROL64(A##mu, 8); \ - A##sa ^= Da; \ - Bku = ROL64(A##sa, 18); \ - E##ka = Bka ^((~Bke)& Bki ); \ - E##ke = Bke ^((~Bki)& Bko ); \ - E##ki = Bki ^((~Bko)& Bku ); \ - E##ko = Bko ^((~Bku)& Bka ); \ - E##ku = Bku ^((~Bka)& Bke ); \ -\ - A##bu 
^= Du; \ - Bma = ROL64(A##bu, 27); \ - A##ga ^= Da; \ - Bme = ROL64(A##ga, 36); \ - A##ke ^= De; \ - Bmi = ROL64(A##ke, 10); \ - A##mi ^= Di; \ - Bmo = ROL64(A##mi, 15); \ - A##so ^= Do; \ - Bmu = ROL64(A##so, 56); \ - E##ma = Bma ^((~Bme)& Bmi ); \ - E##me = Bme ^((~Bmi)& Bmo ); \ - E##mi = Bmi ^((~Bmo)& Bmu ); \ - E##mo = Bmo ^((~Bmu)& Bma ); \ - E##mu = Bmu ^((~Bma)& Bme ); \ -\ - A##bi ^= Di; \ - Bsa = ROL64(A##bi, 62); \ - A##go ^= Do; \ - Bse = ROL64(A##go, 55); \ - A##ku ^= Du; \ - Bsi = ROL64(A##ku, 39); \ - A##ma ^= Da; \ - Bso = ROL64(A##ma, 41); \ - A##se ^= De; \ - Bsu = ROL64(A##se, 2); \ - E##sa = Bsa ^((~Bse)& Bsi ); \ - E##se = Bse ^((~Bsi)& Bso ); \ - E##si = Bsi ^((~Bso)& Bsu ); \ - E##so = Bso ^((~Bsu)& Bsa ); \ - E##su = Bsu ^((~Bsa)& Bse ); \ -\ - -#endif /* UseBebigokimisa */ - -#define copyFromState(X, state) \ - X##ba = state[ 0]; \ - X##be = state[ 1]; \ - X##bi = state[ 2]; \ - X##bo = state[ 3]; \ - X##bu = state[ 4]; \ - X##ga = state[ 5]; \ - X##ge = state[ 6]; \ - X##gi = state[ 7]; \ - X##go = state[ 8]; \ - X##gu = state[ 9]; \ - X##ka = state[10]; \ - X##ke = state[11]; \ - X##ki = state[12]; \ - X##ko = state[13]; \ - X##ku = state[14]; \ - X##ma = state[15]; \ - X##me = state[16]; \ - X##mi = state[17]; \ - X##mo = state[18]; \ - X##mu = state[19]; \ - X##sa = state[20]; \ - X##se = state[21]; \ - X##si = state[22]; \ - X##so = state[23]; \ - X##su = state[24]; \ - -#define copyToState(state, X) \ - state[ 0] = X##ba; \ - state[ 1] = X##be; \ - state[ 2] = X##bi; \ - state[ 3] = X##bo; \ - state[ 4] = X##bu; \ - state[ 5] = X##ga; \ - state[ 6] = X##ge; \ - state[ 7] = X##gi; \ - state[ 8] = X##go; \ - state[ 9] = X##gu; \ - state[10] = X##ka; \ - state[11] = X##ke; \ - state[12] = X##ki; \ - state[13] = X##ko; \ - state[14] = X##ku; \ - state[15] = X##ma; \ - state[16] = X##me; \ - state[17] = X##mi; \ - state[18] = X##mo; \ - state[19] = X##mu; \ - state[20] = X##sa; \ - state[21] = X##se; \ - state[22] = X##si; \ - state[23] = X##so; \ - state[24] = X##su; \ - -#define copyStateVariables(X, Y) \ - X##ba = Y##ba; \ - X##be = Y##be; \ - X##bi = Y##bi; \ - X##bo = Y##bo; \ - X##bu = Y##bu; \ - X##ga = Y##ga; \ - X##ge = Y##ge; \ - X##gi = Y##gi; \ - X##go = Y##go; \ - X##gu = Y##gu; \ - X##ka = Y##ka; \ - X##ke = Y##ke; \ - X##ki = Y##ki; \ - X##ko = Y##ko; \ - X##ku = Y##ku; \ - X##ma = Y##ma; \ - X##me = Y##me; \ - X##mi = Y##mi; \ - X##mo = Y##mo; \ - X##mu = Y##mu; \ - X##sa = Y##sa; \ - X##se = Y##se; \ - X##si = Y##si; \ - X##so = Y##so; \ - X##su = Y##su; \ - -#if ((defined(FullUnrolling)) || (Unrolling == 12)) -#define rounds12 \ - prepareTheta \ - thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - thetaRhoPiChiIota(23, E, A) \ - -#elif (Unrolling == 6) -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=6) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ - } \ - -#elif (Unrolling == 4) -#define 
rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=4) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - } \ - -#elif (Unrolling == 3) -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=3) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - copyStateVariables(A, E) \ - } \ - -#elif (Unrolling == 2) -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=2) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - } \ - -#elif (Unrolling == 1) -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i++) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - copyStateVariables(A, E) \ - } \ - -#else -#error "Unrolling is not correctly specified!" -#endif - -void KeccakP1600_opt64_Permute_12rounds(void *state) -{ - declareABCDE - #ifndef KeccakP1600_opt64_fullUnrolling - unsigned int i; - #endif - uint64_t *stateAsLanes = (uint64_t*)state; - - copyFromState(A, stateAsLanes) - rounds12 - copyToState(stateAsLanes, A) -} - -/* ---------------------------------------------------------------- */ - -void KeccakP1600_opt64_ExtractBytesInLane(const void *state, unsigned int lanePosition, unsigned char *data, unsigned int offset, unsigned int length) -{ - uint64_t lane = ((uint64_t*)state)[lanePosition]; -#ifdef KeccakP1600_opt64_useLaneComplementing - if ((lanePosition == 1) || (lanePosition == 2) || (lanePosition == 8) || (lanePosition == 12) || (lanePosition == 17) || (lanePosition == 20)) - lane = ~lane; -#endif -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) - { - uint64_t lane1[1]; - lane1[0] = lane; - memcpy(data, (uint8_t*)lane1+offset, length); - } -#else - unsigned int i; - lane >>= offset*8; - for(i=0; i>= 8; - } -#endif -} - -/* ---------------------------------------------------------------- */ - -#if (PLATFORM_BYTE_ORDER != IS_LITTLE_ENDIAN) -static void fromWordToBytes(uint8_t *bytes, const uint64_t word) -{ - unsigned int i; - - for(i=0; i<(64/8); i++) - bytes[i] = (word >> (8*i)) & 0xFF; -} -#endif - -void KeccakP1600_opt64_ExtractLanes(const void *state, unsigned char *data, unsigned int laneCount) -{ -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) - memcpy(data, state, laneCount*8); -#else - unsigned int i; - - for(i=0; i 1) { - ((uint64_t*)data)[ 1] = ~((uint64_t*)data)[ 1]; - if (laneCount > 2) { - ((uint64_t*)data)[ 2] = ~((uint64_t*)data)[ 2]; - if (laneCount > 8) { - ((uint64_t*)data)[ 8] = ~((uint64_t*)data)[ 8]; - if (laneCount > 12) { - ((uint64_t*)data)[12] = ~((uint64_t*)data)[12]; - if (laneCount > 17) { - ((uint64_t*)data)[17] = ~((uint64_t*)data)[17]; - if (laneCount > 20) { - ((uint64_t*)data)[20] = ~((uint64_t*)data)[20]; - } - } - } - } - } - } -#endif -} - -/* ---------------------------------------------------------------- */ - -#define SnP_ExtractBytes(state, data, offset, length, SnP_ExtractLanes, SnP_ExtractBytesInLane, SnP_laneLengthInBytes) \ - { \ - if ((offset) == 0) { \ - SnP_ExtractLanes(state, data, (length)/SnP_laneLengthInBytes); \ - SnP_ExtractBytesInLane(state, \ - (length)/SnP_laneLengthInBytes, \ - (data)+((length)/SnP_laneLengthInBytes)*SnP_laneLengthInBytes, \ - 0, \ - (length)%SnP_laneLengthInBytes); \ - } \ - else { \ - unsigned int _sizeLeft = (length); \ - unsigned int _lanePosition = (offset)/SnP_laneLengthInBytes; \ - unsigned int _offsetInLane = (offset)%SnP_laneLengthInBytes; \ - 
unsigned char *_curData = (data); \ - while(_sizeLeft > 0) { \ - unsigned int _bytesInLane = SnP_laneLengthInBytes - _offsetInLane; \ - if (_bytesInLane > _sizeLeft) \ - _bytesInLane = _sizeLeft; \ - SnP_ExtractBytesInLane(state, _lanePosition, _curData, _offsetInLane, _bytesInLane); \ - _sizeLeft -= _bytesInLane; \ - _lanePosition++; \ - _offsetInLane = 0; \ - _curData += _bytesInLane; \ - } \ - } \ - } - -void KeccakP1600_opt64_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) -{ - SnP_ExtractBytes(state, data, offset, length, KeccakP1600_opt64_ExtractLanes, KeccakP1600_opt64_ExtractBytesInLane, 8); -} - -/* ---------------------------------------------------------------- */ - -#if (PLATFORM_BYTE_ORDER == IS_LITTLE_ENDIAN) -#define HTOLE64(x) (x) -#else -#define HTOLE64(x) (\ - ((x & 0xff00000000000000ull) >> 56) | \ - ((x & 0x00ff000000000000ull) >> 40) | \ - ((x & 0x0000ff0000000000ull) >> 24) | \ - ((x & 0x000000ff00000000ull) >> 8) | \ - ((x & 0x00000000ff000000ull) << 8) | \ - ((x & 0x0000000000ff0000ull) << 24) | \ - ((x & 0x000000000000ff00ull) << 40) | \ - ((x & 0x00000000000000ffull) << 56)) -#endif - -#define addInput(X, input, laneCount) \ - if (laneCount == 21) { \ - X##ba ^= HTOLE64(input[ 0]); \ - X##be ^= HTOLE64(input[ 1]); \ - X##bi ^= HTOLE64(input[ 2]); \ - X##bo ^= HTOLE64(input[ 3]); \ - X##bu ^= HTOLE64(input[ 4]); \ - X##ga ^= HTOLE64(input[ 5]); \ - X##ge ^= HTOLE64(input[ 6]); \ - X##gi ^= HTOLE64(input[ 7]); \ - X##go ^= HTOLE64(input[ 8]); \ - X##gu ^= HTOLE64(input[ 9]); \ - X##ka ^= HTOLE64(input[10]); \ - X##ke ^= HTOLE64(input[11]); \ - X##ki ^= HTOLE64(input[12]); \ - X##ko ^= HTOLE64(input[13]); \ - X##ku ^= HTOLE64(input[14]); \ - X##ma ^= HTOLE64(input[15]); \ - X##me ^= HTOLE64(input[16]); \ - X##mi ^= HTOLE64(input[17]); \ - X##mo ^= HTOLE64(input[18]); \ - X##mu ^= HTOLE64(input[19]); \ - X##sa ^= HTOLE64(input[20]); \ - } \ - -#include - -size_t KeccakP1600_opt64_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen) -{ - size_t originalDataByteLen = dataByteLen; - declareABCDE - #ifndef KeccakP1600_opt64_fullUnrolling - unsigned int i; - #endif - uint64_t *stateAsLanes = (uint64_t*)state; - uint64_t *inDataAsLanes = (uint64_t*)data; - - assert(laneCount == 21); - - #define laneCount 21 - copyFromState(A, stateAsLanes) - while(dataByteLen >= laneCount*8) { - addInput(A, inDataAsLanes, laneCount) - rounds12 - inDataAsLanes += laneCount; - dataByteLen -= laneCount*8; - } - #undef laneCount - copyToState(stateAsLanes, A) - return originalDataByteLen - dataByteLen; -} diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c deleted file mode 100644 index 22a0901..0000000 --- a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-runtimeDispatch.c +++ /dev/null @@ -1,406 +0,0 @@ -/* -K12 based on the eXtended Keccak Code Package (XKCP) -https://github.com/XKCP/XKCP - -The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. - -Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to the Keccak Team website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. 
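[Editor's note] The file removed above (KeccakP-1600-opt64.c) is the portable C backend: the theta/rho/pi/chi/iota round macros, with and without lane complementing, plus the 21-lane FastLoop absorb. For reference, a compact portable Rust sketch of the same 12-round permutation, i.e. rounds 12 through 23 of Keccak-f[1600]; the rotation offsets match the rhotates table in the deleted assembly and the ROL64 calls in the macros above. This is an illustrative sketch, not necessarily the exact code this patch adds under crypto/src.

```rust
/// Round constants for rounds 12..=23 of Keccak-f[1600] (the 12 rounds
/// KangarooTwelve uses), i.e. the tail of KeccakF1600RoundConstants above.
const RC: [u64; 12] = [
    0x0000_0000_8000_808b, 0x8000_0000_0000_008b, 0x8000_0000_0000_8089,
    0x8000_0000_0000_8003, 0x8000_0000_0000_8002, 0x8000_0000_0000_0080,
    0x0000_0000_0000_800a, 0x8000_0000_8000_000a, 0x8000_0000_8000_8081,
    0x8000_0000_0000_8080, 0x0000_0000_8000_0001, 0x8000_0000_8000_8008,
];

/// Rho rotation offsets in x + 5*y lane order (the same values as the
/// rhotates table in the deleted AVX-512 assembly).
const RHO: [u32; 25] = [
     0,  1, 62, 28, 27,
    36, 44,  6, 55, 20,
     3, 10, 43, 25, 39,
    41, 45, 15, 21,  8,
    18,  2, 61, 56, 14,
];

/// Portable Keccak-p[1600, 12 rounds] over 25 little-endian lanes.
pub fn keccak_p1600_12(a: &mut [u64; 25]) {
    for &rc in RC.iter() {
        // Theta: column parities, then XOR D[x] into every lane of column x.
        let mut c = [0u64; 5];
        for x in 0..5 {
            c[x] = a[x] ^ a[x + 5] ^ a[x + 10] ^ a[x + 15] ^ a[x + 20];
        }
        for x in 0..5 {
            let d = c[(x + 4) % 5] ^ c[(x + 1) % 5].rotate_left(1);
            for y in 0..5 {
                a[x + 5 * y] ^= d;
            }
        }
        // Rho + Pi: rotate each lane and move it to its pi position.
        let mut b = [0u64; 25];
        for x in 0..5 {
            for y in 0..5 {
                b[y + 5 * ((2 * x + 3 * y) % 5)] = a[x + 5 * y].rotate_left(RHO[x + 5 * y]);
            }
        }
        // Chi: non-linear row mixing.
        for y in 0..5 {
            for x in 0..5 {
                a[x + 5 * y] = b[x + 5 * y] ^ (!b[(x + 1) % 5 + 5 * y] & b[(x + 2) % 5 + 5 * y]);
            }
        }
        // Iota: fold in the round constant.
        a[0] ^= rc;
    }
}
```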
-http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -Please refer to the XKCP for more details. -*/ - -#include -#include -#include -#include "brg_endian.h" -#include "KeccakP-1600-SnP.h" - -#ifdef KeccakP1600_disableParallelism -#undef KeccakP1600_enable_simd_options -#else - -// Forward declaration -void KangarooTwelve_SetProcessorCapabilities(); -#ifdef KeccakP1600_enable_simd_options -int K12_SSSE3_requested_disabled = 0; -int K12_AVX2_requested_disabled = 0; -int K12_AVX512_requested_disabled = 0; -#endif // KeccakP1600_enable_simd_options -int K12_enableSSSE3 = 0; -int K12_enableAVX2 = 0; -int K12_enableAVX512 = 0; - -/* ---------------------------------------------------------------- */ - -void KangarooTwelve_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output); -void KangarooTwelve_AVX512_Process2Leaves(const unsigned char *input, unsigned char *output); - -int KeccakP1600times2_IsAvailable() -{ - int result = 0; - result |= K12_enableAVX512; - result |= K12_enableSSSE3; - return result; -} - -const char * KeccakP1600times2_GetImplementation() -{ - if (K12_enableAVX512) { - return "AVX-512 implementation"; - } else if (K12_enableSSSE3) { - return "SSSE3 implementation"; - } else { - return ""; - } -} - -void KangarooTwelve_Process2Leaves(const unsigned char *input, unsigned char *output) -{ - if (K12_enableAVX512) { - KangarooTwelve_AVX512_Process2Leaves(input, output); - } else if (K12_enableSSSE3) { - KangarooTwelve_SSSE3_Process2Leaves(input, output); - } -} - - -void KangarooTwelve_AVX2_Process4Leaves(const unsigned char *input, unsigned char *output); -void KangarooTwelve_AVX512_Process4Leaves(const unsigned char *input, unsigned char *output); - -int KeccakP1600times4_IsAvailable() -{ - int result = 0; - result |= K12_enableAVX512; - result |= K12_enableAVX2; - return result; -} - -const char * KeccakP1600times4_GetImplementation() -{ - if (K12_enableAVX512) { - return "AVX-512 implementation"; - } else if (K12_enableAVX2) { - return "AVX2 implementation"; - } else { - return ""; - } -} - -void KangarooTwelve_Process4Leaves(const unsigned char *input, unsigned char *output) -{ - if (K12_enableAVX512) { - KangarooTwelve_AVX512_Process4Leaves(input, output); - } else if (K12_enableAVX2) { - KangarooTwelve_AVX2_Process4Leaves(input, output); - } -} - - -void KangarooTwelve_AVX512_Process8Leaves(const unsigned char *input, unsigned char *output); - -int KeccakP1600times8_IsAvailable() -{ - int result = 0; - result |= K12_enableAVX512; - return result; -} - -const char * KeccakP1600times8_GetImplementation() -{ - if (K12_enableAVX512) { - return "AVX-512 implementation"; - } else { - return ""; - } -} - -void KangarooTwelve_Process8Leaves(const unsigned char *input, unsigned char *output) -{ - if (K12_enableAVX512) - KangarooTwelve_AVX512_Process8Leaves(input, output); -} - -#endif // KeccakP1600_disableParallelism - -const char * KeccakP1600_GetImplementation() -{ - if (K12_enableAVX512) - return "AVX-512 implementation"; - else -#ifndef KeccakP1600_noAssembly - if (K12_enableAVX2) - return "AVX2 implementation"; - else -#endif - return "generic 64-bit implementation"; -} - -void KeccakP1600_Initialize(void *state) -{ - KangarooTwelve_SetProcessorCapabilities(); - if (K12_enableAVX512) - KeccakP1600_AVX512_Initialize(state); - else -#ifndef KeccakP1600_noAssembly - if (K12_enableAVX2) - KeccakP1600_AVX2_Initialize(state); - else -#endif - KeccakP1600_opt64_Initialize(state); -} - -void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int 
offset) -{ - if (K12_enableAVX512) - ((unsigned char*)(state))[offset] ^= data; - else -#ifndef KeccakP1600_noAssembly - if (K12_enableAVX2) - KeccakP1600_AVX2_AddByte(state, data, offset); - else -#endif - KeccakP1600_opt64_AddByte(state, data, offset); -} - -void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length) -{ - if (K12_enableAVX512) - KeccakP1600_AVX512_AddBytes(state, data, offset, length); - else -#ifndef KeccakP1600_noAssembly - if (K12_enableAVX2) - KeccakP1600_AVX2_AddBytes(state, data, offset, length); - else -#endif - KeccakP1600_opt64_AddBytes(state, data, offset, length); -} - -void KeccakP1600_Permute_12rounds(void *state) -{ - if (K12_enableAVX512) - KeccakP1600_AVX512_Permute_12rounds(state); - else -#ifndef KeccakP1600_noAssembly - if (K12_enableAVX2) - KeccakP1600_AVX2_Permute_12rounds(state); - else -#endif - KeccakP1600_opt64_Permute_12rounds(state); -} - -void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length) -{ - if (K12_enableAVX512) - KeccakP1600_AVX512_ExtractBytes(state, data, offset, length); - else -#ifndef KeccakP1600_noAssembly - if (K12_enableAVX2) - KeccakP1600_AVX2_ExtractBytes(state, data, offset, length); - else -#endif - KeccakP1600_opt64_ExtractBytes(state, data, offset, length); -} - -size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen) -{ - if (K12_enableAVX512) - return KeccakP1600_AVX512_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); - else -#ifndef KeccakP1600_noAssembly - if (K12_enableAVX2) - return KeccakP1600_AVX2_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); - else -#endif - return KeccakP1600_opt64_12rounds_FastLoop_Absorb(state, laneCount, data, dataByteLen); -} - -/* ---------------------------------------------------------------- */ - -/* Processor capability detection code by Samuel Neves and Jack O'Connor, see - * https://github.com/BLAKE3-team/BLAKE3/blob/master/c/blake3_dispatch.c - */ - -#if defined(__x86_64__) || defined(_M_X64) -#define IS_X86 -#define IS_X86_64 -#endif - -#if defined(__i386__) || defined(_M_IX86) -#define IS_X86 -#define IS_X86_32 -#endif - -#if defined(IS_X86) -static uint64_t xgetbv() { -#if defined(_MSC_VER) - return _xgetbv(0); -#else - uint32_t eax = 0, edx = 0; - __asm__ __volatile__("xgetbv\n" : "=a"(eax), "=d"(edx) : "c"(0)); - return ((uint64_t)edx << 32) | eax; -#endif -} - -static void cpuid(uint32_t out[4], uint32_t id) { -#if defined(_MSC_VER) - __cpuid((int *)out, id); -#elif defined(__i386__) || defined(_M_IX86) - __asm__ __volatile__("movl %%ebx, %1\n" - "cpuid\n" - "xchgl %1, %%ebx\n" - : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id)); -#else - __asm__ __volatile__("cpuid\n" - : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id)); -#endif -} - -static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) { -#if defined(_MSC_VER) - __cpuidex((int *)out, id, sid); -#elif defined(__i386__) || defined(_M_IX86) - __asm__ __volatile__("movl %%ebx, %1\n" - "cpuid\n" - "xchgl %1, %%ebx\n" - : "=a"(out[0]), "=r"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id), "c"(sid)); -#else - __asm__ __volatile__("cpuid\n" - : "=a"(out[0]), "=b"(out[1]), "=c"(out[2]), "=d"(out[3]) - : "a"(id), "c"(sid)); -#endif -} - -#endif - -enum cpu_feature { - SSE2 = 1 << 0, - SSSE3 = 1 << 1, - SSE41 = 1 << 2, - AVX = 1 << 3, - AVX2 = 1 << 4, - AVX512F = 1 << 5, - AVX512VL = 
1 << 6, - /* ... */ - UNDEFINED = 1 << 30 -}; - -static enum cpu_feature g_cpu_features = UNDEFINED; - -static enum cpu_feature - get_cpu_features(void) { - - if (g_cpu_features != UNDEFINED) { - return g_cpu_features; - } else { -#if defined(IS_X86) - uint32_t regs[4] = {0}; - uint32_t *eax = ®s[0], *ebx = ®s[1], *ecx = ®s[2], *edx = ®s[3]; - (void)edx; - enum cpu_feature features = 0; - cpuid(regs, 0); - const int max_id = *eax; - cpuid(regs, 1); -#if defined(__amd64__) || defined(_M_X64) - features |= SSE2; -#else - if (*edx & (1UL << 26)) - features |= SSE2; -#endif - if (*ecx & (1UL << 9)) - features |= SSSE3; - if (*ecx & (1UL << 19)) - features |= SSE41; - - if (*ecx & (1UL << 27)) { // OSXSAVE - const uint64_t mask = xgetbv(); - if ((mask & 6) == 6) { // SSE and AVX states - if (*ecx & (1UL << 28)) - features |= AVX; - if (max_id >= 7) { - cpuidex(regs, 7, 0); - if (*ebx & (1UL << 5)) - features |= AVX2; - if ((mask & 224) == 224) { // Opmask, ZMM_Hi256, Hi16_Zmm - if (*ebx & (1UL << 31)) - features |= AVX512VL; - if (*ebx & (1UL << 16)) - features |= AVX512F; - } - } - } - } - g_cpu_features = features; - return features; -#else - /* How to detect NEON? */ - return 0; -#endif - } -} - -void KangarooTwelve_SetProcessorCapabilities() -{ - enum cpu_feature features = get_cpu_features(); - K12_enableSSSE3 = (features & SSSE3); - K12_enableAVX2 = (features & AVX2); - K12_enableAVX512 = (features & AVX512F) && (features & AVX512VL); -#ifdef KeccakP1600_enable_simd_options - K12_enableSSSE3 = K12_enableSSSE3 && !K12_SSSE3_requested_disabled; - K12_enableAVX2 = K12_enableAVX2 && !K12_AVX2_requested_disabled; - K12_enableAVX512 = K12_enableAVX512 && !K12_AVX512_requested_disabled; -#endif // KeccakP1600_enable_simd_options -} - -#ifdef KeccakP1600_enable_simd_options -int KangarooTwelve_DisableSSSE3(void) { - KangarooTwelve_SetProcessorCapabilities(); - K12_SSSE3_requested_disabled = 1; - if (K12_enableSSSE3) { - KangarooTwelve_SetProcessorCapabilities(); - return 1; // SSSE3 was disabled on this call. - } else { - return 0; // Nothing changed. - } -} - -int KangarooTwelve_DisableAVX2(void) { - KangarooTwelve_SetProcessorCapabilities(); - K12_AVX2_requested_disabled = 1; - if (K12_enableAVX2) { - KangarooTwelve_SetProcessorCapabilities(); - return 1; // AVX2 was disabled on this call. - } else { - return 0; // Nothing changed. - } -} - -int KangarooTwelve_DisableAVX512(void) { - KangarooTwelve_SetProcessorCapabilities(); - K12_AVX512_requested_disabled = 1; - if (K12_enableAVX512) { - KangarooTwelve_SetProcessorCapabilities(); - return 1; // AVX512 was disabled on this call. - } else { - return 0; // Nothing changed. - } -} - -void KangarooTwelve_EnableAllCpuFeatures(void) { - K12_SSSE3_requested_disabled = 0; - K12_AVX2_requested_disabled = 0; - K12_AVX512_requested_disabled = 0; - KangarooTwelve_SetProcessorCapabilities(); -} -#endif // KeccakP1600_enable_simd_options diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c deleted file mode 100644 index 0abab49..0000000 --- a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX2.c +++ /dev/null @@ -1,419 +0,0 @@ -/* -K12 based on the eXtended Keccak Code Package (XKCP) -https://github.com/XKCP/XKCP - -The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. - -Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". 
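[Editor's note] The dispatcher removed above probes cpuid/xgetbv by hand to choose between the AVX-512, AVX2, SSSE3 and generic paths. On the Rust side that plumbing does not need to be ported: std already offers runtime feature detection on x86_64. A sketch (assumed function name, not part of this patch) mirroring KeccakP1600_GetImplementation:

```rust
/// Sketch of a Rust counterpart to KeccakP1600_GetImplementation.
/// std's is_x86_feature_detected! replaces the hand-rolled cpuid/xgetbv
/// probing of the deleted dispatcher; only the reporting is shown here,
/// the SIMD backends themselves are not part of this sketch.
#[cfg(target_arch = "x86_64")]
pub fn implementation_name() -> &'static str {
    if is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl") {
        "AVX-512 implementation"
    } else if is_x86_feature_detected!("avx2") {
        "AVX2 implementation"
    } else {
        "generic 64-bit implementation"
    }
}

#[cfg(not(target_arch = "x86_64"))]
pub fn implementation_name() -> &'static str {
    "generic 64-bit implementation"
}
```

Picking a backend then becomes an ordinary if/else at the call site (or a function chosen once and cached), which is all the deleted KeccakP1600_* wrapper functions do in C.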
- -For more information, feedback or questions, please refer to the Keccak Team website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -Please refer to the XKCP for more details. -*/ - -#include -#include -#include "KeccakP-1600-SnP.h" -#include "align.h" - -#define AVX2alignment 32 - -#define ANDnu256(a, b) _mm256_andnot_si256(a, b) -#define CONST256(a) _mm256_load_si256((const __m256i *)&(a)) -#define CONST256_64(a) _mm256_set1_epi64x(a) -#define LOAD256(a) _mm256_load_si256((const __m256i *)&(a)) -#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d)) -#define ROL64in256(d, a, o) d = _mm256_or_si256(_mm256_slli_epi64(a, o), _mm256_srli_epi64(a, 64-(o))) -#define ROL64in256_8(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho8)) -#define ROL64in256_56(d, a) d = _mm256_shuffle_epi8(a, CONST256(rho56)) -static ALIGN(AVX2alignment) const uint64_t rho8[4] = {0x0605040302010007, 0x0E0D0C0B0A09080F, 0x1615141312111017, 0x1E1D1C1B1A19181F}; -static ALIGN(AVX2alignment) const uint64_t rho56[4] = {0x0007060504030201, 0x080F0E0D0C0B0A09, 0x1017161514131211, 0x181F1E1D1C1B1A19}; -#define STORE256(a, b) _mm256_store_si256((__m256i *)&(a), b) -#define STORE256u(a, b) _mm256_storeu_si256((__m256i *)&(a), b) -#define XOR256(a, b) _mm256_xor_si256(a, b) -#define XOReq256(a, b) a = _mm256_xor_si256(a, b) -#define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b)) -#define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b)) -#define PERM128( a, b, c ) _mm256_permute2f128_si256(a, b, c) -#define SHUFFLE64( a, b, c ) _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), c)) -#define ZERO() _mm256_setzero_si256() - -static ALIGN(AVX2alignment) const uint64_t KeccakP1600RoundConstants[24] = { - 0x0000000000000001ULL, - 0x0000000000008082ULL, - 0x800000000000808aULL, - 0x8000000080008000ULL, - 0x000000000000808bULL, - 0x0000000080000001ULL, - 0x8000000080008081ULL, - 0x8000000000008009ULL, - 0x000000000000008aULL, - 0x0000000000000088ULL, - 0x0000000080008009ULL, - 0x000000008000000aULL, - 0x000000008000808bULL, - 0x800000000000008bULL, - 0x8000000000008089ULL, - 0x8000000000008003ULL, - 0x8000000000008002ULL, - 0x8000000000000080ULL, - 0x000000000000800aULL, - 0x800000008000000aULL, - 0x8000000080008081ULL, - 0x8000000000008080ULL, - 0x0000000080000001ULL, - 0x8000000080008008ULL}; - -#define declareABCDE \ - __m256i Aba, Abe, Abi, Abo, Abu; \ - __m256i Aga, Age, Agi, Ago, Agu; \ - __m256i Aka, Ake, Aki, Ako, Aku; \ - __m256i Ama, Ame, Ami, Amo, Amu; \ - __m256i Asa, Ase, Asi, Aso, Asu; \ - __m256i Bba, Bbe, Bbi, Bbo, Bbu; \ - __m256i Bga, Bge, Bgi, Bgo, Bgu; \ - __m256i Bka, Bke, Bki, Bko, Bku; \ - __m256i Bma, Bme, Bmi, Bmo, Bmu; \ - __m256i Bsa, Bse, Bsi, Bso, Bsu; \ - __m256i Ca, Ce, Ci, Co, Cu; \ - __m256i Ca1, Ce1, Ci1, Co1, Cu1; \ - __m256i Da, De, Di, Do, Du; \ - __m256i Eba, Ebe, Ebi, Ebo, Ebu; \ - __m256i Ega, Ege, Egi, Ego, Egu; \ - __m256i Eka, Eke, Eki, Eko, Eku; \ - __m256i Ema, Eme, Emi, Emo, Emu; \ - __m256i Esa, Ese, Esi, Eso, Esu; \ - -#define prepareTheta \ - Ca = XOR256(Aba, XOR256(Aga, XOR256(Aka, XOR256(Ama, Asa)))); \ - Ce = XOR256(Abe, XOR256(Age, XOR256(Ake, XOR256(Ame, Ase)))); \ - Ci = XOR256(Abi, XOR256(Agi, XOR256(Aki, XOR256(Ami, Asi)))); \ - Co = XOR256(Abo, XOR256(Ago, XOR256(Ako, XOR256(Amo, Aso)))); \ - Cu = XOR256(Abu, 
XOR256(Agu, XOR256(Aku, XOR256(Amu, Asu)))); \ - -/* --- Theta Rho Pi Chi Iota Prepare-theta */ -/* --- 64-bit lanes mapped to 64-bit words */ -#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ - ROL64in256(Ce1, Ce, 1); \ - Da = XOR256(Cu, Ce1); \ - ROL64in256(Ci1, Ci, 1); \ - De = XOR256(Ca, Ci1); \ - ROL64in256(Co1, Co, 1); \ - Di = XOR256(Ce, Co1); \ - ROL64in256(Cu1, Cu, 1); \ - Do = XOR256(Ci, Cu1); \ - ROL64in256(Ca1, Ca, 1); \ - Du = XOR256(Co, Ca1); \ -\ - XOReq256(A##ba, Da); \ - Bba = A##ba; \ - XOReq256(A##ge, De); \ - ROL64in256(Bbe, A##ge, 44); \ - XOReq256(A##ki, Di); \ - ROL64in256(Bbi, A##ki, 43); \ - E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \ - XOReq256(E##ba, CONST256_64(KeccakP1600RoundConstants[i])); \ - Ca = E##ba; \ - XOReq256(A##mo, Do); \ - ROL64in256(Bbo, A##mo, 21); \ - E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \ - Ce = E##be; \ - XOReq256(A##su, Du); \ - ROL64in256(Bbu, A##su, 14); \ - E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \ - Ci = E##bi; \ - E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \ - Co = E##bo; \ - E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \ - Cu = E##bu; \ -\ - XOReq256(A##bo, Do); \ - ROL64in256(Bga, A##bo, 28); \ - XOReq256(A##gu, Du); \ - ROL64in256(Bge, A##gu, 20); \ - XOReq256(A##ka, Da); \ - ROL64in256(Bgi, A##ka, 3); \ - E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \ - XOReq256(Ca, E##ga); \ - XOReq256(A##me, De); \ - ROL64in256(Bgo, A##me, 45); \ - E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \ - XOReq256(Ce, E##ge); \ - XOReq256(A##si, Di); \ - ROL64in256(Bgu, A##si, 61); \ - E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \ - XOReq256(Ci, E##gi); \ - E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \ - XOReq256(Co, E##go); \ - E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \ - XOReq256(Cu, E##gu); \ -\ - XOReq256(A##be, De); \ - ROL64in256(Bka, A##be, 1); \ - XOReq256(A##gi, Di); \ - ROL64in256(Bke, A##gi, 6); \ - XOReq256(A##ko, Do); \ - ROL64in256(Bki, A##ko, 25); \ - E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \ - XOReq256(Ca, E##ka); \ - XOReq256(A##mu, Du); \ - ROL64in256_8(Bko, A##mu); \ - E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \ - XOReq256(Ce, E##ke); \ - XOReq256(A##sa, Da); \ - ROL64in256(Bku, A##sa, 18); \ - E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \ - XOReq256(Ci, E##ki); \ - E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \ - XOReq256(Co, E##ko); \ - E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \ - XOReq256(Cu, E##ku); \ -\ - XOReq256(A##bu, Du); \ - ROL64in256(Bma, A##bu, 27); \ - XOReq256(A##ga, Da); \ - ROL64in256(Bme, A##ga, 36); \ - XOReq256(A##ke, De); \ - ROL64in256(Bmi, A##ke, 10); \ - E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \ - XOReq256(Ca, E##ma); \ - XOReq256(A##mi, Di); \ - ROL64in256(Bmo, A##mi, 15); \ - E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \ - XOReq256(Ce, E##me); \ - XOReq256(A##so, Do); \ - ROL64in256_56(Bmu, A##so); \ - E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \ - XOReq256(Ci, E##mi); \ - E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \ - XOReq256(Co, E##mo); \ - E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \ - XOReq256(Cu, E##mu); \ -\ - XOReq256(A##bi, Di); \ - ROL64in256(Bsa, A##bi, 62); \ - XOReq256(A##go, Do); \ - ROL64in256(Bse, A##go, 55); \ - XOReq256(A##ku, Du); \ - ROL64in256(Bsi, A##ku, 39); \ - E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \ - XOReq256(Ca, E##sa); \ - XOReq256(A##ma, Da); \ - ROL64in256(Bso, A##ma, 41); \ - E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \ - XOReq256(Ce, E##se); \ - XOReq256(A##se, De); \ - ROL64in256(Bsu, A##se, 2); \ - E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \ - XOReq256(Ci, E##si); \ - E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \ 
- XOReq256(Co, E##so); \ - E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \ - XOReq256(Cu, E##su); \ -\ - -/* --- Theta Rho Pi Chi Iota */ -/* --- 64-bit lanes mapped to 64-bit words */ -#define thetaRhoPiChiIota(i, A, E) \ - ROL64in256(Ce1, Ce, 1); \ - Da = XOR256(Cu, Ce1); \ - ROL64in256(Ci1, Ci, 1); \ - De = XOR256(Ca, Ci1); \ - ROL64in256(Co1, Co, 1); \ - Di = XOR256(Ce, Co1); \ - ROL64in256(Cu1, Cu, 1); \ - Do = XOR256(Ci, Cu1); \ - ROL64in256(Ca1, Ca, 1); \ - Du = XOR256(Co, Ca1); \ -\ - XOReq256(A##ba, Da); \ - Bba = A##ba; \ - XOReq256(A##ge, De); \ - ROL64in256(Bbe, A##ge, 44); \ - XOReq256(A##ki, Di); \ - ROL64in256(Bbi, A##ki, 43); \ - E##ba = XOR256(Bba, ANDnu256(Bbe, Bbi)); \ - XOReq256(E##ba, CONST256_64(KeccakP1600RoundConstants[i])); \ - XOReq256(A##mo, Do); \ - ROL64in256(Bbo, A##mo, 21); \ - E##be = XOR256(Bbe, ANDnu256(Bbi, Bbo)); \ - XOReq256(A##su, Du); \ - ROL64in256(Bbu, A##su, 14); \ - E##bi = XOR256(Bbi, ANDnu256(Bbo, Bbu)); \ - E##bo = XOR256(Bbo, ANDnu256(Bbu, Bba)); \ - E##bu = XOR256(Bbu, ANDnu256(Bba, Bbe)); \ -\ - XOReq256(A##bo, Do); \ - ROL64in256(Bga, A##bo, 28); \ - XOReq256(A##gu, Du); \ - ROL64in256(Bge, A##gu, 20); \ - XOReq256(A##ka, Da); \ - ROL64in256(Bgi, A##ka, 3); \ - E##ga = XOR256(Bga, ANDnu256(Bge, Bgi)); \ - XOReq256(A##me, De); \ - ROL64in256(Bgo, A##me, 45); \ - E##ge = XOR256(Bge, ANDnu256(Bgi, Bgo)); \ - XOReq256(A##si, Di); \ - ROL64in256(Bgu, A##si, 61); \ - E##gi = XOR256(Bgi, ANDnu256(Bgo, Bgu)); \ - E##go = XOR256(Bgo, ANDnu256(Bgu, Bga)); \ - E##gu = XOR256(Bgu, ANDnu256(Bga, Bge)); \ -\ - XOReq256(A##be, De); \ - ROL64in256(Bka, A##be, 1); \ - XOReq256(A##gi, Di); \ - ROL64in256(Bke, A##gi, 6); \ - XOReq256(A##ko, Do); \ - ROL64in256(Bki, A##ko, 25); \ - E##ka = XOR256(Bka, ANDnu256(Bke, Bki)); \ - XOReq256(A##mu, Du); \ - ROL64in256_8(Bko, A##mu); \ - E##ke = XOR256(Bke, ANDnu256(Bki, Bko)); \ - XOReq256(A##sa, Da); \ - ROL64in256(Bku, A##sa, 18); \ - E##ki = XOR256(Bki, ANDnu256(Bko, Bku)); \ - E##ko = XOR256(Bko, ANDnu256(Bku, Bka)); \ - E##ku = XOR256(Bku, ANDnu256(Bka, Bke)); \ -\ - XOReq256(A##bu, Du); \ - ROL64in256(Bma, A##bu, 27); \ - XOReq256(A##ga, Da); \ - ROL64in256(Bme, A##ga, 36); \ - XOReq256(A##ke, De); \ - ROL64in256(Bmi, A##ke, 10); \ - E##ma = XOR256(Bma, ANDnu256(Bme, Bmi)); \ - XOReq256(A##mi, Di); \ - ROL64in256(Bmo, A##mi, 15); \ - E##me = XOR256(Bme, ANDnu256(Bmi, Bmo)); \ - XOReq256(A##so, Do); \ - ROL64in256_56(Bmu, A##so); \ - E##mi = XOR256(Bmi, ANDnu256(Bmo, Bmu)); \ - E##mo = XOR256(Bmo, ANDnu256(Bmu, Bma)); \ - E##mu = XOR256(Bmu, ANDnu256(Bma, Bme)); \ -\ - XOReq256(A##bi, Di); \ - ROL64in256(Bsa, A##bi, 62); \ - XOReq256(A##go, Do); \ - ROL64in256(Bse, A##go, 55); \ - XOReq256(A##ku, Du); \ - ROL64in256(Bsi, A##ku, 39); \ - E##sa = XOR256(Bsa, ANDnu256(Bse, Bsi)); \ - XOReq256(A##ma, Da); \ - ROL64in256(Bso, A##ma, 41); \ - E##se = XOR256(Bse, ANDnu256(Bsi, Bso)); \ - XOReq256(A##se, De); \ - ROL64in256(Bsu, A##se, 2); \ - E##si = XOR256(Bsi, ANDnu256(Bso, Bsu)); \ - E##so = XOR256(Bso, ANDnu256(Bsu, Bsa)); \ - E##su = XOR256(Bsu, ANDnu256(Bsa, Bse)); \ -\ - -#define initializeState(X) \ - X##ba = ZERO(); \ - X##be = ZERO(); \ - X##bi = ZERO(); \ - X##bo = ZERO(); \ - X##bu = ZERO(); \ - X##ga = ZERO(); \ - X##ge = ZERO(); \ - X##gi = ZERO(); \ - X##go = ZERO(); \ - X##gu = ZERO(); \ - X##ka = ZERO(); \ - X##ke = ZERO(); \ - X##ki = ZERO(); \ - X##ko = ZERO(); \ - X##ku = ZERO(); \ - X##ma = ZERO(); \ - X##me = ZERO(); \ - X##mi = ZERO(); \ - X##mo = ZERO(); \ - X##mu = ZERO(); \ - X##sa = ZERO(); \ - 
X##se = ZERO(); \ - X##si = ZERO(); \ - X##so = ZERO(); \ - X##su = ZERO(); \ - -#define XORdata16(X, data0, data1, data2, data3) \ - XOReq256(X##ba, LOAD4_64((data3)[ 0], (data2)[ 0], (data1)[ 0], (data0)[ 0])); \ - XOReq256(X##be, LOAD4_64((data3)[ 1], (data2)[ 1], (data1)[ 1], (data0)[ 1])); \ - XOReq256(X##bi, LOAD4_64((data3)[ 2], (data2)[ 2], (data1)[ 2], (data0)[ 2])); \ - XOReq256(X##bo, LOAD4_64((data3)[ 3], (data2)[ 3], (data1)[ 3], (data0)[ 3])); \ - XOReq256(X##bu, LOAD4_64((data3)[ 4], (data2)[ 4], (data1)[ 4], (data0)[ 4])); \ - XOReq256(X##ga, LOAD4_64((data3)[ 5], (data2)[ 5], (data1)[ 5], (data0)[ 5])); \ - XOReq256(X##ge, LOAD4_64((data3)[ 6], (data2)[ 6], (data1)[ 6], (data0)[ 6])); \ - XOReq256(X##gi, LOAD4_64((data3)[ 7], (data2)[ 7], (data1)[ 7], (data0)[ 7])); \ - XOReq256(X##go, LOAD4_64((data3)[ 8], (data2)[ 8], (data1)[ 8], (data0)[ 8])); \ - XOReq256(X##gu, LOAD4_64((data3)[ 9], (data2)[ 9], (data1)[ 9], (data0)[ 9])); \ - XOReq256(X##ka, LOAD4_64((data3)[10], (data2)[10], (data1)[10], (data0)[10])); \ - XOReq256(X##ke, LOAD4_64((data3)[11], (data2)[11], (data1)[11], (data0)[11])); \ - XOReq256(X##ki, LOAD4_64((data3)[12], (data2)[12], (data1)[12], (data0)[12])); \ - XOReq256(X##ko, LOAD4_64((data3)[13], (data2)[13], (data1)[13], (data0)[13])); \ - XOReq256(X##ku, LOAD4_64((data3)[14], (data2)[14], (data1)[14], (data0)[14])); \ - XOReq256(X##ma, LOAD4_64((data3)[15], (data2)[15], (data1)[15], (data0)[15])); \ - -#define XORdata21(X, data0, data1, data2, data3) \ - XORdata16(X, data0, data1, data2, data3) \ - XOReq256(X##me, LOAD4_64((data3)[16], (data2)[16], (data1)[16], (data0)[16])); \ - XOReq256(X##mi, LOAD4_64((data3)[17], (data2)[17], (data1)[17], (data0)[17])); \ - XOReq256(X##mo, LOAD4_64((data3)[18], (data2)[18], (data1)[18], (data0)[18])); \ - XOReq256(X##mu, LOAD4_64((data3)[19], (data2)[19], (data1)[19], (data0)[19])); \ - XOReq256(X##sa, LOAD4_64((data3)[20], (data2)[20], (data1)[20], (data0)[20])); \ - -#define rounds12 \ - prepareTheta \ - thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - thetaRhoPiChiIota(23, E, A) - -#define chunkSize 8192 -#define rateInBytes (21*8) - -void KangarooTwelve_AVX2_Process4Leaves(const unsigned char *input, unsigned char *output) -{ - declareABCDE - unsigned int j; - - initializeState(A); - - for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { - XORdata21(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); - rounds12 - input += rateInBytes; - } - - XORdata16(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); - XOReq256(Ame, CONST256_64(0x0BULL)); - XOReq256(Asa, CONST256_64(0x8000000000000000ULL)); - rounds12 - - { - __m256i lanesL01, lanesL23, lanesH01, lanesH23; - - lanesL01 = UNPACKL( Aba, Abe ); - lanesH01 = UNPACKH( Aba, Abe ); - lanesL23 = UNPACKL( Abi, Abo ); - lanesH23 = UNPACKH( Abi, Abo ); - STORE256u( output[ 0], PERM128( lanesL01, lanesL23, 0x20 ) ); - STORE256u( output[32], PERM128( 
lanesH01, lanesH23, 0x20 ) ); - STORE256u( output[64], PERM128( lanesL01, lanesL23, 0x31 ) ); - STORE256u( output[96], PERM128( lanesH01, lanesH23, 0x31 ) ); - } -} diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c deleted file mode 100644 index a19fc35..0000000 --- a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-AVX512.c +++ /dev/null @@ -1,458 +0,0 @@ -/* -K12 based on the eXtended Keccak Code Package (XKCP) -https://github.com/XKCP/XKCP - -The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. - -Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to the Keccak Team website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -Please refer to the XKCP for more details. -*/ - -#include -#include -#include -#include "KeccakP-1600-SnP.h" -#include "align.h" - -#define AVX512alignment 64 - -#define LOAD4_32(a,b,c,d) _mm_set_epi32((uint64_t)(a), (uint32_t)(b), (uint32_t)(c), (uint32_t)(d)) -#define LOAD8_32(a,b,c,d,e,f,g,h) _mm256_set_epi32((uint64_t)(a), (uint32_t)(b), (uint32_t)(c), (uint32_t)(d), (uint32_t)(e), (uint32_t)(f), (uint32_t)(g), (uint32_t)(h)) -#define LOAD_GATHER2_64(idx,p) _mm_i32gather_epi64( (const void*)(p), idx, 8) -#define LOAD_GATHER4_64(idx,p) _mm256_i32gather_epi64( (const void*)(p), idx, 8) -#define LOAD_GATHER8_64(idx,p) _mm512_i32gather_epi64( idx, (const void*)(p), 8) -#define STORE_SCATTER8_64(p,idx, v) _mm512_i32scatter_epi64( (void*)(p), idx, v, 8) - - -/* Keccak-p[1600]×2 */ - -#define XOR(a,b) _mm_xor_si128(a,b) -#define XOReq(a, b) a = _mm_xor_si128(a, b) -#define XOR3(a,b,c) _mm_ternarylogic_epi64(a,b,c,0x96) -#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) -#define ROL(a,offset) _mm_rol_epi64(a,offset) -#define Chi(a,b,c) _mm_ternarylogic_epi64(a,b,c,0xD2) -#define CONST_64(a) _mm_set1_epi64x(a) -#define LOAD6464(a, b) _mm_set_epi64x(a, b) -#define STORE128u(a, b) _mm_storeu_si128((__m128i *)&(a), b) -#define UNPACKL( a, b ) _mm_unpacklo_epi64((a), (b)) -#define UNPACKH( a, b ) _mm_unpackhi_epi64((a), (b)) -#define ZERO() _mm_setzero_si128() - -static ALIGN(AVX512alignment) const uint64_t KeccakP1600RoundConstants[24] = { - 0x0000000000000001ULL, - 0x0000000000008082ULL, - 0x800000000000808aULL, - 0x8000000080008000ULL, - 0x000000000000808bULL, - 0x0000000080000001ULL, - 0x8000000080008081ULL, - 0x8000000000008009ULL, - 0x000000000000008aULL, - 0x0000000000000088ULL, - 0x0000000080008009ULL, - 0x000000008000000aULL, - 0x000000008000808bULL, - 0x800000000000008bULL, - 0x8000000000008089ULL, - 0x8000000000008003ULL, - 0x8000000000008002ULL, - 0x8000000000000080ULL, - 0x000000000000800aULL, - 0x800000008000000aULL, - 0x8000000080008081ULL, - 0x8000000000008080ULL, - 0x0000000080000001ULL, - 0x8000000080008008ULL}; - -#define KeccakP_DeclareVars(type) \ - type _Ba, _Be, _Bi, _Bo, _Bu; \ - type _Da, _De, _Di, _Do, _Du; \ - type _ba, _be, _bi, _bo, _bu; \ - type _ga, _ge, _gi, _go, _gu; \ - type _ka, _ke, _ki, _ko, _ku; \ - type _ma, _me, _mi, _mo, _mu; \ - type _sa, _se, _si, _so, _su - -#define KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bb1, _Bb2, _Bb3, _Bb4, _Bb5, _Rr1, _Rr2, _Rr3, _Rr4, _Rr5 ) \ - _Bb1 = XOR(_L1, _Da); \ - _Bb2 = XOR(_L2, _De); \ - 
_Bb3 = XOR(_L3, _Di); \ - _Bb4 = XOR(_L4, _Do); \ - _Bb5 = XOR(_L5, _Du); \ - if (_Rr1 != 0) _Bb1 = ROL(_Bb1, _Rr1); \ - _Bb2 = ROL(_Bb2, _Rr2); \ - _Bb3 = ROL(_Bb3, _Rr3); \ - _Bb4 = ROL(_Bb4, _Rr4); \ - _Bb5 = ROL(_Bb5, _Rr5); \ - _L1 = Chi( _Ba, _Be, _Bi); \ - _L2 = Chi( _Be, _Bi, _Bo); \ - _L3 = Chi( _Bi, _Bo, _Bu); \ - _L4 = Chi( _Bo, _Bu, _Ba); \ - _L5 = Chi( _Bu, _Ba, _Be); - -#define KeccakP_ThetaRhoPiChiIota0( _L1, _L2, _L3, _L4, _L5, _rc ) \ - _Ba = XOR5( _ba, _ga, _ka, _ma, _sa ); /* Theta effect */ \ - _Be = XOR5( _be, _ge, _ke, _me, _se ); \ - _Bi = XOR5( _bi, _gi, _ki, _mi, _si ); \ - _Bo = XOR5( _bo, _go, _ko, _mo, _so ); \ - _Bu = XOR5( _bu, _gu, _ku, _mu, _su ); \ - _Da = ROL( _Be, 1 ); \ - _De = ROL( _Bi, 1 ); \ - _Di = ROL( _Bo, 1 ); \ - _Do = ROL( _Bu, 1 ); \ - _Du = ROL( _Ba, 1 ); \ - _Da = XOR( _Da, _Bu ); \ - _De = XOR( _De, _Ba ); \ - _Di = XOR( _Di, _Be ); \ - _Do = XOR( _Do, _Bi ); \ - _Du = XOR( _Du, _Bo ); \ - KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Ba, _Be, _Bi, _Bo, _Bu, 0, 44, 43, 21, 14 ); \ - _L1 = XOR(_L1, _rc) /* Iota */ - -#define KeccakP_ThetaRhoPiChi1( _L1, _L2, _L3, _L4, _L5 ) \ - KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bi, _Bo, _Bu, _Ba, _Be, 3, 45, 61, 28, 20 ) - -#define KeccakP_ThetaRhoPiChi2( _L1, _L2, _L3, _L4, _L5 ) \ - KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bu, _Ba, _Be, _Bi, _Bo, 18, 1, 6, 25, 8 ) - -#define KeccakP_ThetaRhoPiChi3( _L1, _L2, _L3, _L4, _L5 ) \ - KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Be, _Bi, _Bo, _Bu, _Ba, 36, 10, 15, 56, 27 ) - -#define KeccakP_ThetaRhoPiChi4( _L1, _L2, _L3, _L4, _L5 ) \ - KeccakP_ThetaRhoPiChi( _L1, _L2, _L3, _L4, _L5, _Bo, _Bu, _Ba, _Be, _Bi, 41, 2, 62, 55, 39 ) - -#define KeccakP_4rounds( i ) \ - KeccakP_ThetaRhoPiChiIota0(_ba, _ge, _ki, _mo, _su, CONST_64(KeccakP1600RoundConstants[i]) ); \ - KeccakP_ThetaRhoPiChi1( _ka, _me, _si, _bo, _gu ); \ - KeccakP_ThetaRhoPiChi2( _sa, _be, _gi, _ko, _mu ); \ - KeccakP_ThetaRhoPiChi3( _ga, _ke, _mi, _so, _bu ); \ - KeccakP_ThetaRhoPiChi4( _ma, _se, _bi, _go, _ku ); \ -\ - KeccakP_ThetaRhoPiChiIota0(_ba, _me, _gi, _so, _ku, CONST_64(KeccakP1600RoundConstants[i+1]) ); \ - KeccakP_ThetaRhoPiChi1( _sa, _ke, _bi, _mo, _gu ); \ - KeccakP_ThetaRhoPiChi2( _ma, _ge, _si, _ko, _bu ); \ - KeccakP_ThetaRhoPiChi3( _ka, _be, _mi, _go, _su ); \ - KeccakP_ThetaRhoPiChi4( _ga, _se, _ki, _bo, _mu ); \ -\ - KeccakP_ThetaRhoPiChiIota0(_ba, _ke, _si, _go, _mu, CONST_64(KeccakP1600RoundConstants[i+2]) ); \ - KeccakP_ThetaRhoPiChi1( _ma, _be, _ki, _so, _gu ); \ - KeccakP_ThetaRhoPiChi2( _ga, _me, _bi, _ko, _su ); \ - KeccakP_ThetaRhoPiChi3( _sa, _ge, _mi, _bo, _ku ); \ - KeccakP_ThetaRhoPiChi4( _ka, _se, _gi, _mo, _bu ); \ -\ - KeccakP_ThetaRhoPiChiIota0(_ba, _be, _bi, _bo, _bu, CONST_64(KeccakP1600RoundConstants[i+3]) ); \ - KeccakP_ThetaRhoPiChi1( _ga, _ge, _gi, _go, _gu ); \ - KeccakP_ThetaRhoPiChi2( _ka, _ke, _ki, _ko, _ku ); \ - KeccakP_ThetaRhoPiChi3( _ma, _me, _mi, _mo, _mu ); \ - KeccakP_ThetaRhoPiChi4( _sa, _se, _si, _so, _su ) - -#define rounds12 \ - KeccakP_4rounds( 12 ); \ - KeccakP_4rounds( 16 ); \ - KeccakP_4rounds( 20 ) - -#define initializeState(X) \ - X##ba = ZERO(); \ - X##be = ZERO(); \ - X##bi = ZERO(); \ - X##bo = ZERO(); \ - X##bu = ZERO(); \ - X##ga = ZERO(); \ - X##ge = ZERO(); \ - X##gi = ZERO(); \ - X##go = ZERO(); \ - X##gu = ZERO(); \ - X##ka = ZERO(); \ - X##ke = ZERO(); \ - X##ki = ZERO(); \ - X##ko = ZERO(); \ - X##ku = ZERO(); \ - X##ma = ZERO(); \ - X##me = ZERO(); \ - X##mi = ZERO(); \ - X##mo = ZERO(); \ - X##mu = 
ZERO(); \ - X##sa = ZERO(); \ - X##se = ZERO(); \ - X##si = ZERO(); \ - X##so = ZERO(); \ - X##su = ZERO(); \ - -#define XORdata16(X, data0, data1) \ - XOReq(X##ba, LOAD6464((data1)[ 0], (data0)[ 0])); \ - XOReq(X##be, LOAD6464((data1)[ 1], (data0)[ 1])); \ - XOReq(X##bi, LOAD6464((data1)[ 2], (data0)[ 2])); \ - XOReq(X##bo, LOAD6464((data1)[ 3], (data0)[ 3])); \ - XOReq(X##bu, LOAD6464((data1)[ 4], (data0)[ 4])); \ - XOReq(X##ga, LOAD6464((data1)[ 5], (data0)[ 5])); \ - XOReq(X##ge, LOAD6464((data1)[ 6], (data0)[ 6])); \ - XOReq(X##gi, LOAD6464((data1)[ 7], (data0)[ 7])); \ - XOReq(X##go, LOAD6464((data1)[ 8], (data0)[ 8])); \ - XOReq(X##gu, LOAD6464((data1)[ 9], (data0)[ 9])); \ - XOReq(X##ka, LOAD6464((data1)[10], (data0)[10])); \ - XOReq(X##ke, LOAD6464((data1)[11], (data0)[11])); \ - XOReq(X##ki, LOAD6464((data1)[12], (data0)[12])); \ - XOReq(X##ko, LOAD6464((data1)[13], (data0)[13])); \ - XOReq(X##ku, LOAD6464((data1)[14], (data0)[14])); \ - XOReq(X##ma, LOAD6464((data1)[15], (data0)[15])); \ - -#define XORdata21(X, data0, data1) \ - XORdata16(X, data0, data1) \ - XOReq(X##me, LOAD6464((data1)[16], (data0)[16])); \ - XOReq(X##mi, LOAD6464((data1)[17], (data0)[17])); \ - XOReq(X##mo, LOAD6464((data1)[18], (data0)[18])); \ - XOReq(X##mu, LOAD6464((data1)[19], (data0)[19])); \ - XOReq(X##sa, LOAD6464((data1)[20], (data0)[20])); \ - -#define chunkSize 8192 -#define rateInBytes (21*8) - -void KangarooTwelve_AVX512_Process2Leaves(const unsigned char *input, unsigned char *output) -{ - KeccakP_DeclareVars(__m128i); - unsigned int j; - - initializeState(_); - - for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { - XORdata21(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); - rounds12 - input += rateInBytes; - } - - XORdata16(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); - XOReq(_me, CONST_64(0x0BULL)); - XOReq(_sa, CONST_64(0x8000000000000000ULL)); - rounds12 - - STORE128u( *(__m128i*)&(output[ 0]), UNPACKL( _ba, _be ) ); - STORE128u( *(__m128i*)&(output[16]), UNPACKL( _bi, _bo ) ); - STORE128u( *(__m128i*)&(output[32]), UNPACKH( _ba, _be ) ); - STORE128u( *(__m128i*)&(output[48]), UNPACKH( _bi, _bo ) ); -} - -#undef XOR -#undef XOReq -#undef XOR3 -#undef XOR5 -#undef ROL -#undef Chi -#undef CONST_64 -#undef LOAD6464 -#undef STORE128u -#undef UNPACKL -#undef UNPACKH -#undef ZERO -#undef XORdata16 -#undef XORdata21 - - -/* Keccak-p[1600]×4 */ - -#define XOR(a,b) _mm256_xor_si256(a,b) -#define XOReq(a,b) a = _mm256_xor_si256(a,b) -#define XOR3(a,b,c) _mm256_ternarylogic_epi64(a,b,c,0x96) -#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) -#define XOR512(a,b) _mm512_xor_si512(a,b) -#define ROL(a,offset) _mm256_rol_epi64(a,offset) -#define Chi(a,b,c) _mm256_ternarylogic_epi64(a,b,c,0xD2) -#define CONST_64(a) _mm256_set1_epi64x(a) -#define ZERO() _mm256_setzero_si256() -#define LOAD4_64(a, b, c, d) _mm256_set_epi64x((uint64_t)(a), (uint64_t)(b), (uint64_t)(c), (uint64_t)(d)) - -#define XORdata16(X, data0, data1, data2, data3) \ - XOReq(X##ba, LOAD4_64((data3)[ 0], (data2)[ 0], (data1)[ 0], (data0)[ 0])); \ - XOReq(X##be, LOAD4_64((data3)[ 1], (data2)[ 1], (data1)[ 1], (data0)[ 1])); \ - XOReq(X##bi, LOAD4_64((data3)[ 2], (data2)[ 2], (data1)[ 2], (data0)[ 2])); \ - XOReq(X##bo, LOAD4_64((data3)[ 3], (data2)[ 3], (data1)[ 3], (data0)[ 3])); \ - XOReq(X##bu, LOAD4_64((data3)[ 4], (data2)[ 4], (data1)[ 4], (data0)[ 4])); \ - XOReq(X##ga, LOAD4_64((data3)[ 5], (data2)[ 5], (data1)[ 5], (data0)[ 5])); \ - XOReq(X##ge, LOAD4_64((data3)[ 6], (data2)[ 6], 
(data1)[ 6], (data0)[ 6])); \ - XOReq(X##gi, LOAD4_64((data3)[ 7], (data2)[ 7], (data1)[ 7], (data0)[ 7])); \ - XOReq(X##go, LOAD4_64((data3)[ 8], (data2)[ 8], (data1)[ 8], (data0)[ 8])); \ - XOReq(X##gu, LOAD4_64((data3)[ 9], (data2)[ 9], (data1)[ 9], (data0)[ 9])); \ - XOReq(X##ka, LOAD4_64((data3)[10], (data2)[10], (data1)[10], (data0)[10])); \ - XOReq(X##ke, LOAD4_64((data3)[11], (data2)[11], (data1)[11], (data0)[11])); \ - XOReq(X##ki, LOAD4_64((data3)[12], (data2)[12], (data1)[12], (data0)[12])); \ - XOReq(X##ko, LOAD4_64((data3)[13], (data2)[13], (data1)[13], (data0)[13])); \ - XOReq(X##ku, LOAD4_64((data3)[14], (data2)[14], (data1)[14], (data0)[14])); \ - XOReq(X##ma, LOAD4_64((data3)[15], (data2)[15], (data1)[15], (data0)[15])); \ - -#define XORdata21(X, data0, data1, data2, data3) \ - XORdata16(X, data0, data1, data2, data3) \ - XOReq(X##me, LOAD4_64((data3)[16], (data2)[16], (data1)[16], (data0)[16])); \ - XOReq(X##mi, LOAD4_64((data3)[17], (data2)[17], (data1)[17], (data0)[17])); \ - XOReq(X##mo, LOAD4_64((data3)[18], (data2)[18], (data1)[18], (data0)[18])); \ - XOReq(X##mu, LOAD4_64((data3)[19], (data2)[19], (data1)[19], (data0)[19])); \ - XOReq(X##sa, LOAD4_64((data3)[20], (data2)[20], (data1)[20], (data0)[20])); \ - -void KangarooTwelve_AVX512_Process4Leaves(const unsigned char *input, unsigned char *output) -{ - KeccakP_DeclareVars(__m256i); - unsigned int j; - - initializeState(_); - - for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { - XORdata21(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); - rounds12 - input += rateInBytes; - } - - XORdata16(_, (const uint64_t *)input, (const uint64_t *)(input+chunkSize), (const uint64_t *)(input+2*chunkSize), (const uint64_t *)(input+3*chunkSize)); - XOReq(_me, CONST_64(0x0BULL)); - XOReq(_sa, CONST_64(0x8000000000000000ULL)); - rounds12 - -#define STORE256u(a, b) _mm256_storeu_si256((__m256i *)&(a), b) -#define UNPACKL( a, b ) _mm256_unpacklo_epi64((a), (b)) -#define UNPACKH( a, b ) _mm256_unpackhi_epi64((a), (b)) -#define PERM128( a, b, c ) _mm256_permute2f128_si256(a, b, c) - { - __m256i lanesL01, lanesL23, lanesH01, lanesH23; - - lanesL01 = UNPACKL( _ba, _be ); - lanesH01 = UNPACKH( _ba, _be ); - lanesL23 = UNPACKL( _bi, _bo ); - lanesH23 = UNPACKH( _bi, _bo ); - STORE256u( output[ 0], PERM128( lanesL01, lanesL23, 0x20 ) ); - STORE256u( output[32], PERM128( lanesH01, lanesH23, 0x20 ) ); - STORE256u( output[64], PERM128( lanesL01, lanesL23, 0x31 ) ); - STORE256u( output[96], PERM128( lanesH01, lanesH23, 0x31 ) ); - } -/* TODO: check if something like this would be better: - index512 = LOAD8_32(3*laneOffset+1, 2*laneOffset+1, 1*laneOffset+1, 0*laneOffset+1, 3*laneOffset, 2*laneOffset, 1*laneOffset, 0*laneOffset); - STORE_SCATTER8_64(dataAsLanes+0, index512, stateAsLanes512[0/2]); - STORE_SCATTER8_64(dataAsLanes+2, index512, stateAsLanes512[2/2]); -*/ -} - -#undef XOR -#undef XOReq -#undef XOR3 -#undef XOR5 -#undef XOR512 -#undef ROL -#undef Chi -#undef CONST_64 -#undef ZERO -#undef LOAD4_64 -#undef XORdata16 -#undef XORdata21 - - -/* Keccak-p[1600]×8 */ - -#define XOR(a,b) _mm512_xor_si512(a,b) -#define XOReq(a,b) a = _mm512_xor_si512(a,b) -#define XOR3(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0x96) -#define XOR5(a,b,c,d,e) XOR3(XOR3(a,b,c),d,e) -#define XOReq512(a, b) a = XOR(a,b) -#define ROL(a,offset) _mm512_rol_epi64(a,offset) -#define Chi(a,b,c) _mm512_ternarylogic_epi64(a,b,c,0xD2) -#define CONST_64(a) _mm512_set1_epi64(a) 
-#define ZERO() _mm512_setzero_si512() -#define LOAD(p) _mm512_loadu_si512(p) - -#define LoadAndTranspose8(dataAsLanes, offset) \ - t0 = LOAD((dataAsLanes) + (offset) + 0*chunkSize/8); \ - t1 = LOAD((dataAsLanes) + (offset) + 1*chunkSize/8); \ - t2 = LOAD((dataAsLanes) + (offset) + 2*chunkSize/8); \ - t3 = LOAD((dataAsLanes) + (offset) + 3*chunkSize/8); \ - t4 = LOAD((dataAsLanes) + (offset) + 4*chunkSize/8); \ - t5 = LOAD((dataAsLanes) + (offset) + 5*chunkSize/8); \ - t6 = LOAD((dataAsLanes) + (offset) + 6*chunkSize/8); \ - t7 = LOAD((dataAsLanes) + (offset) + 7*chunkSize/8); \ - r0 = _mm512_unpacklo_epi64(t0, t1); \ - r1 = _mm512_unpackhi_epi64(t0, t1); \ - r2 = _mm512_unpacklo_epi64(t2, t3); \ - r3 = _mm512_unpackhi_epi64(t2, t3); \ - r4 = _mm512_unpacklo_epi64(t4, t5); \ - r5 = _mm512_unpackhi_epi64(t4, t5); \ - r6 = _mm512_unpacklo_epi64(t6, t7); \ - r7 = _mm512_unpackhi_epi64(t6, t7); \ - t0 = _mm512_shuffle_i32x4(r0, r2, 0x88); \ - t1 = _mm512_shuffle_i32x4(r1, r3, 0x88); \ - t2 = _mm512_shuffle_i32x4(r0, r2, 0xdd); \ - t3 = _mm512_shuffle_i32x4(r1, r3, 0xdd); \ - t4 = _mm512_shuffle_i32x4(r4, r6, 0x88); \ - t5 = _mm512_shuffle_i32x4(r5, r7, 0x88); \ - t6 = _mm512_shuffle_i32x4(r4, r6, 0xdd); \ - t7 = _mm512_shuffle_i32x4(r5, r7, 0xdd); \ - r0 = _mm512_shuffle_i32x4(t0, t4, 0x88); \ - r1 = _mm512_shuffle_i32x4(t1, t5, 0x88); \ - r2 = _mm512_shuffle_i32x4(t2, t6, 0x88); \ - r3 = _mm512_shuffle_i32x4(t3, t7, 0x88); \ - r4 = _mm512_shuffle_i32x4(t0, t4, 0xdd); \ - r5 = _mm512_shuffle_i32x4(t1, t5, 0xdd); \ - r6 = _mm512_shuffle_i32x4(t2, t6, 0xdd); \ - r7 = _mm512_shuffle_i32x4(t3, t7, 0xdd); \ - -#define XORdata16(X, index, dataAsLanes) \ - LoadAndTranspose8(dataAsLanes, 0) \ - XOReq(X##ba, r0); \ - XOReq(X##be, r1); \ - XOReq(X##bi, r2); \ - XOReq(X##bo, r3); \ - XOReq(X##bu, r4); \ - XOReq(X##ga, r5); \ - XOReq(X##ge, r6); \ - XOReq(X##gi, r7); \ - LoadAndTranspose8(dataAsLanes, 8) \ - XOReq(X##go, r0); \ - XOReq(X##gu, r1); \ - XOReq(X##ka, r2); \ - XOReq(X##ke, r3); \ - XOReq(X##ki, r4); \ - XOReq(X##ko, r5); \ - XOReq(X##ku, r6); \ - XOReq(X##ma, r7); \ - -#define XORdata21(X, index, dataAsLanes) \ - XORdata16(X, index, dataAsLanes) \ - XOReq(X##me, LOAD_GATHER8_64(index, (dataAsLanes) + 16)); \ - XOReq(X##mi, LOAD_GATHER8_64(index, (dataAsLanes) + 17)); \ - XOReq(X##mo, LOAD_GATHER8_64(index, (dataAsLanes) + 18)); \ - XOReq(X##mu, LOAD_GATHER8_64(index, (dataAsLanes) + 19)); \ - XOReq(X##sa, LOAD_GATHER8_64(index, (dataAsLanes) + 20)); \ - -void KangarooTwelve_AVX512_Process8Leaves(const unsigned char *input, unsigned char *output) -{ - KeccakP_DeclareVars(__m512i); - unsigned int j; - const uint64_t *outputAsLanes = (const uint64_t *)output; - __m256i index; - __m512i t0, t1, t2, t3, t4, t5, t6, t7; - __m512i r0, r1, r2, r3, r4, r5, r6, r7; - - initializeState(_); - - index = LOAD8_32(7*(chunkSize / 8), 6*(chunkSize / 8), 5*(chunkSize / 8), 4*(chunkSize / 8), 3*(chunkSize / 8), 2*(chunkSize / 8), 1*(chunkSize / 8), 0*(chunkSize / 8)); - for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { - XORdata21(_, index, (const uint64_t *)input); - rounds12 - input += rateInBytes; - } - - XORdata16(_, index, (const uint64_t *)input); - XOReq(_me, CONST_64(0x0BULL)); - XOReq(_sa, CONST_64(0x8000000000000000ULL)); - rounds12 - - index = LOAD8_32(7*4, 6*4, 5*4, 4*4, 3*4, 2*4, 1*4, 0*4); - STORE_SCATTER8_64(outputAsLanes+0, index, _ba); - STORE_SCATTER8_64(outputAsLanes+1, index, _be); - STORE_SCATTER8_64(outputAsLanes+2, index, _bi); - STORE_SCATTER8_64(outputAsLanes+3, index, _bo); 
-} diff --git a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c b/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c deleted file mode 100644 index 036df52..0000000 --- a/ffi-deps/K12/lib/Optimized64/KeccakP-1600-timesN-SSSE3.c +++ /dev/null @@ -1,438 +0,0 @@ -/* -K12 based on the eXtended Keccak Code Package (XKCP) -https://github.com/XKCP/XKCP - -The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. - -Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to the Keccak Team website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -Please refer to the XKCP for more details. -*/ - -#include -#include -#include "KeccakP-1600-SnP.h" -#include "align.h" - -#define KeccakP1600times2_SSSE3_unrolling 2 - -#define SSSE3alignment 16 - -#define ANDnu128(a, b) _mm_andnot_si128(a, b) -#define CONST128(a) _mm_load_si128((const __m128i *)&(a)) -#define LOAD128(a) _mm_load_si128((const __m128i *)&(a)) -#define LOAD6464(a, b) _mm_set_epi64x(a, b) -#define CONST128_64(a) _mm_set1_epi64x(a) -#define ROL64in128(a, o) _mm_or_si128(_mm_slli_epi64(a, o), _mm_srli_epi64(a, 64-(o))) -#define ROL64in128_8(a) _mm_shuffle_epi8(a, CONST128(rho8)) -#define ROL64in128_56(a) _mm_shuffle_epi8(a, CONST128(rho56)) -static const uint64_t rho8[2] = {0x0605040302010007, 0x0E0D0C0B0A09080F}; -static const uint64_t rho56[2] = {0x0007060504030201, 0x080F0E0D0C0B0A09}; -#define STORE128(a, b) _mm_store_si128((__m128i *)&(a), b) -#define STORE128u(a, b) _mm_storeu_si128((__m128i *)&(a), b) -#define XOR128(a, b) _mm_xor_si128(a, b) -#define XOReq128(a, b) a = _mm_xor_si128(a, b) -#define UNPACKL( a, b ) _mm_unpacklo_epi64((a), (b)) -#define UNPACKH( a, b ) _mm_unpackhi_epi64((a), (b)) -#define ZERO() _mm_setzero_si128() - -static ALIGN(SSSE3alignment) const uint64_t KeccakP1600RoundConstants[24] = { - 0x0000000000000001ULL, - 0x0000000000008082ULL, - 0x800000000000808aULL, - 0x8000000080008000ULL, - 0x000000000000808bULL, - 0x0000000080000001ULL, - 0x8000000080008081ULL, - 0x8000000000008009ULL, - 0x000000000000008aULL, - 0x0000000000000088ULL, - 0x0000000080008009ULL, - 0x000000008000000aULL, - 0x000000008000808bULL, - 0x800000000000008bULL, - 0x8000000000008089ULL, - 0x8000000000008003ULL, - 0x8000000000008002ULL, - 0x8000000000000080ULL, - 0x000000000000800aULL, - 0x800000008000000aULL, - 0x8000000080008081ULL, - 0x8000000000008080ULL, - 0x0000000080000001ULL, - 0x8000000080008008ULL}; - -#define declareABCDE \ - __m128i Aba, Abe, Abi, Abo, Abu; \ - __m128i Aga, Age, Agi, Ago, Agu; \ - __m128i Aka, Ake, Aki, Ako, Aku; \ - __m128i Ama, Ame, Ami, Amo, Amu; \ - __m128i Asa, Ase, Asi, Aso, Asu; \ - __m128i Bba, Bbe, Bbi, Bbo, Bbu; \ - __m128i Bga, Bge, Bgi, Bgo, Bgu; \ - __m128i Bka, Bke, Bki, Bko, Bku; \ - __m128i Bma, Bme, Bmi, Bmo, Bmu; \ - __m128i Bsa, Bse, Bsi, Bso, Bsu; \ - __m128i Ca, Ce, Ci, Co, Cu; \ - __m128i Da, De, Di, Do, Du; \ - __m128i Eba, Ebe, Ebi, Ebo, Ebu; \ - __m128i Ega, Ege, Egi, Ego, Egu; \ - __m128i Eka, Eke, Eki, Eko, Eku; \ - __m128i Ema, Eme, Emi, Emo, Emu; \ - __m128i Esa, Ese, Esi, Eso, Esu; \ - -#define prepareTheta \ - Ca = XOR128(Aba, XOR128(Aga, XOR128(Aka, XOR128(Ama, Asa)))); \ - Ce = XOR128(Abe, XOR128(Age, XOR128(Ake, XOR128(Ame, Ase)))); \ - Ci = 
XOR128(Abi, XOR128(Agi, XOR128(Aki, XOR128(Ami, Asi)))); \ - Co = XOR128(Abo, XOR128(Ago, XOR128(Ako, XOR128(Amo, Aso)))); \ - Cu = XOR128(Abu, XOR128(Agu, XOR128(Aku, XOR128(Amu, Asu)))); \ - -/* --- Theta Rho Pi Chi Iota Prepare-theta */ -/* --- 64-bit lanes mapped to 64-bit words */ -#define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ - Da = XOR128(Cu, ROL64in128(Ce, 1)); \ - De = XOR128(Ca, ROL64in128(Ci, 1)); \ - Di = XOR128(Ce, ROL64in128(Co, 1)); \ - Do = XOR128(Ci, ROL64in128(Cu, 1)); \ - Du = XOR128(Co, ROL64in128(Ca, 1)); \ -\ - XOReq128(A##ba, Da); \ - Bba = A##ba; \ - XOReq128(A##ge, De); \ - Bbe = ROL64in128(A##ge, 44); \ - XOReq128(A##ki, Di); \ - Bbi = ROL64in128(A##ki, 43); \ - E##ba = XOR128(Bba, ANDnu128(Bbe, Bbi)); \ - XOReq128(E##ba, CONST128_64(KeccakP1600RoundConstants[i])); \ - Ca = E##ba; \ - XOReq128(A##mo, Do); \ - Bbo = ROL64in128(A##mo, 21); \ - E##be = XOR128(Bbe, ANDnu128(Bbi, Bbo)); \ - Ce = E##be; \ - XOReq128(A##su, Du); \ - Bbu = ROL64in128(A##su, 14); \ - E##bi = XOR128(Bbi, ANDnu128(Bbo, Bbu)); \ - Ci = E##bi; \ - E##bo = XOR128(Bbo, ANDnu128(Bbu, Bba)); \ - Co = E##bo; \ - E##bu = XOR128(Bbu, ANDnu128(Bba, Bbe)); \ - Cu = E##bu; \ -\ - XOReq128(A##bo, Do); \ - Bga = ROL64in128(A##bo, 28); \ - XOReq128(A##gu, Du); \ - Bge = ROL64in128(A##gu, 20); \ - XOReq128(A##ka, Da); \ - Bgi = ROL64in128(A##ka, 3); \ - E##ga = XOR128(Bga, ANDnu128(Bge, Bgi)); \ - XOReq128(Ca, E##ga); \ - XOReq128(A##me, De); \ - Bgo = ROL64in128(A##me, 45); \ - E##ge = XOR128(Bge, ANDnu128(Bgi, Bgo)); \ - XOReq128(Ce, E##ge); \ - XOReq128(A##si, Di); \ - Bgu = ROL64in128(A##si, 61); \ - E##gi = XOR128(Bgi, ANDnu128(Bgo, Bgu)); \ - XOReq128(Ci, E##gi); \ - E##go = XOR128(Bgo, ANDnu128(Bgu, Bga)); \ - XOReq128(Co, E##go); \ - E##gu = XOR128(Bgu, ANDnu128(Bga, Bge)); \ - XOReq128(Cu, E##gu); \ -\ - XOReq128(A##be, De); \ - Bka = ROL64in128(A##be, 1); \ - XOReq128(A##gi, Di); \ - Bke = ROL64in128(A##gi, 6); \ - XOReq128(A##ko, Do); \ - Bki = ROL64in128(A##ko, 25); \ - E##ka = XOR128(Bka, ANDnu128(Bke, Bki)); \ - XOReq128(Ca, E##ka); \ - XOReq128(A##mu, Du); \ - Bko = ROL64in128_8(A##mu); \ - E##ke = XOR128(Bke, ANDnu128(Bki, Bko)); \ - XOReq128(Ce, E##ke); \ - XOReq128(A##sa, Da); \ - Bku = ROL64in128(A##sa, 18); \ - E##ki = XOR128(Bki, ANDnu128(Bko, Bku)); \ - XOReq128(Ci, E##ki); \ - E##ko = XOR128(Bko, ANDnu128(Bku, Bka)); \ - XOReq128(Co, E##ko); \ - E##ku = XOR128(Bku, ANDnu128(Bka, Bke)); \ - XOReq128(Cu, E##ku); \ -\ - XOReq128(A##bu, Du); \ - Bma = ROL64in128(A##bu, 27); \ - XOReq128(A##ga, Da); \ - Bme = ROL64in128(A##ga, 36); \ - XOReq128(A##ke, De); \ - Bmi = ROL64in128(A##ke, 10); \ - E##ma = XOR128(Bma, ANDnu128(Bme, Bmi)); \ - XOReq128(Ca, E##ma); \ - XOReq128(A##mi, Di); \ - Bmo = ROL64in128(A##mi, 15); \ - E##me = XOR128(Bme, ANDnu128(Bmi, Bmo)); \ - XOReq128(Ce, E##me); \ - XOReq128(A##so, Do); \ - Bmu = ROL64in128_56(A##so); \ - E##mi = XOR128(Bmi, ANDnu128(Bmo, Bmu)); \ - XOReq128(Ci, E##mi); \ - E##mo = XOR128(Bmo, ANDnu128(Bmu, Bma)); \ - XOReq128(Co, E##mo); \ - E##mu = XOR128(Bmu, ANDnu128(Bma, Bme)); \ - XOReq128(Cu, E##mu); \ -\ - XOReq128(A##bi, Di); \ - Bsa = ROL64in128(A##bi, 62); \ - XOReq128(A##go, Do); \ - Bse = ROL64in128(A##go, 55); \ - XOReq128(A##ku, Du); \ - Bsi = ROL64in128(A##ku, 39); \ - E##sa = XOR128(Bsa, ANDnu128(Bse, Bsi)); \ - XOReq128(Ca, E##sa); \ - XOReq128(A##ma, Da); \ - Bso = ROL64in128(A##ma, 41); \ - E##se = XOR128(Bse, ANDnu128(Bsi, Bso)); \ - XOReq128(Ce, E##se); \ - XOReq128(A##se, De); \ - Bsu = ROL64in128(A##se, 2); \ - E##si = 
XOR128(Bsi, ANDnu128(Bso, Bsu)); \ - XOReq128(Ci, E##si); \ - E##so = XOR128(Bso, ANDnu128(Bsu, Bsa)); \ - XOReq128(Co, E##so); \ - E##su = XOR128(Bsu, ANDnu128(Bsa, Bse)); \ - XOReq128(Cu, E##su); \ -\ - -/* --- Theta Rho Pi Chi Iota */ -/* --- 64-bit lanes mapped to 64-bit words */ -#define thetaRhoPiChiIota(i, A, E) \ - Da = XOR128(Cu, ROL64in128(Ce, 1)); \ - De = XOR128(Ca, ROL64in128(Ci, 1)); \ - Di = XOR128(Ce, ROL64in128(Co, 1)); \ - Do = XOR128(Ci, ROL64in128(Cu, 1)); \ - Du = XOR128(Co, ROL64in128(Ca, 1)); \ -\ - XOReq128(A##ba, Da); \ - Bba = A##ba; \ - XOReq128(A##ge, De); \ - Bbe = ROL64in128(A##ge, 44); \ - XOReq128(A##ki, Di); \ - Bbi = ROL64in128(A##ki, 43); \ - E##ba = XOR128(Bba, ANDnu128(Bbe, Bbi)); \ - XOReq128(E##ba, CONST128_64(KeccakP1600RoundConstants[i])); \ - XOReq128(A##mo, Do); \ - Bbo = ROL64in128(A##mo, 21); \ - E##be = XOR128(Bbe, ANDnu128(Bbi, Bbo)); \ - XOReq128(A##su, Du); \ - Bbu = ROL64in128(A##su, 14); \ - E##bi = XOR128(Bbi, ANDnu128(Bbo, Bbu)); \ - E##bo = XOR128(Bbo, ANDnu128(Bbu, Bba)); \ - E##bu = XOR128(Bbu, ANDnu128(Bba, Bbe)); \ -\ - XOReq128(A##bo, Do); \ - Bga = ROL64in128(A##bo, 28); \ - XOReq128(A##gu, Du); \ - Bge = ROL64in128(A##gu, 20); \ - XOReq128(A##ka, Da); \ - Bgi = ROL64in128(A##ka, 3); \ - E##ga = XOR128(Bga, ANDnu128(Bge, Bgi)); \ - XOReq128(A##me, De); \ - Bgo = ROL64in128(A##me, 45); \ - E##ge = XOR128(Bge, ANDnu128(Bgi, Bgo)); \ - XOReq128(A##si, Di); \ - Bgu = ROL64in128(A##si, 61); \ - E##gi = XOR128(Bgi, ANDnu128(Bgo, Bgu)); \ - E##go = XOR128(Bgo, ANDnu128(Bgu, Bga)); \ - E##gu = XOR128(Bgu, ANDnu128(Bga, Bge)); \ -\ - XOReq128(A##be, De); \ - Bka = ROL64in128(A##be, 1); \ - XOReq128(A##gi, Di); \ - Bke = ROL64in128(A##gi, 6); \ - XOReq128(A##ko, Do); \ - Bki = ROL64in128(A##ko, 25); \ - E##ka = XOR128(Bka, ANDnu128(Bke, Bki)); \ - XOReq128(A##mu, Du); \ - Bko = ROL64in128_8(A##mu); \ - E##ke = XOR128(Bke, ANDnu128(Bki, Bko)); \ - XOReq128(A##sa, Da); \ - Bku = ROL64in128(A##sa, 18); \ - E##ki = XOR128(Bki, ANDnu128(Bko, Bku)); \ - E##ko = XOR128(Bko, ANDnu128(Bku, Bka)); \ - E##ku = XOR128(Bku, ANDnu128(Bka, Bke)); \ -\ - XOReq128(A##bu, Du); \ - Bma = ROL64in128(A##bu, 27); \ - XOReq128(A##ga, Da); \ - Bme = ROL64in128(A##ga, 36); \ - XOReq128(A##ke, De); \ - Bmi = ROL64in128(A##ke, 10); \ - E##ma = XOR128(Bma, ANDnu128(Bme, Bmi)); \ - XOReq128(A##mi, Di); \ - Bmo = ROL64in128(A##mi, 15); \ - E##me = XOR128(Bme, ANDnu128(Bmi, Bmo)); \ - XOReq128(A##so, Do); \ - Bmu = ROL64in128_56(A##so); \ - E##mi = XOR128(Bmi, ANDnu128(Bmo, Bmu)); \ - E##mo = XOR128(Bmo, ANDnu128(Bmu, Bma)); \ - E##mu = XOR128(Bmu, ANDnu128(Bma, Bme)); \ -\ - XOReq128(A##bi, Di); \ - Bsa = ROL64in128(A##bi, 62); \ - XOReq128(A##go, Do); \ - Bse = ROL64in128(A##go, 55); \ - XOReq128(A##ku, Du); \ - Bsi = ROL64in128(A##ku, 39); \ - E##sa = XOR128(Bsa, ANDnu128(Bse, Bsi)); \ - XOReq128(A##ma, Da); \ - Bso = ROL64in128(A##ma, 41); \ - E##se = XOR128(Bse, ANDnu128(Bsi, Bso)); \ - XOReq128(A##se, De); \ - Bsu = ROL64in128(A##se, 2); \ - E##si = XOR128(Bsi, ANDnu128(Bso, Bsu)); \ - E##so = XOR128(Bso, ANDnu128(Bsu, Bsa)); \ - E##su = XOR128(Bsu, ANDnu128(Bsa, Bse)); \ -\ - -#define initializeState(X) \ - X##ba = ZERO(); \ - X##be = ZERO(); \ - X##bi = ZERO(); \ - X##bo = ZERO(); \ - X##bu = ZERO(); \ - X##ga = ZERO(); \ - X##ge = ZERO(); \ - X##gi = ZERO(); \ - X##go = ZERO(); \ - X##gu = ZERO(); \ - X##ka = ZERO(); \ - X##ke = ZERO(); \ - X##ki = ZERO(); \ - X##ko = ZERO(); \ - X##ku = ZERO(); \ - X##ma = ZERO(); \ - X##me = ZERO(); \ - X##mi = ZERO(); \ - 
X##mo = ZERO(); \ - X##mu = ZERO(); \ - X##sa = ZERO(); \ - X##se = ZERO(); \ - X##si = ZERO(); \ - X##so = ZERO(); \ - X##su = ZERO(); \ - -#define XORdata16(X, data0, data1) \ - XOReq128(X##ba, LOAD6464((data1)[ 0], (data0)[ 0])); \ - XOReq128(X##be, LOAD6464((data1)[ 1], (data0)[ 1])); \ - XOReq128(X##bi, LOAD6464((data1)[ 2], (data0)[ 2])); \ - XOReq128(X##bo, LOAD6464((data1)[ 3], (data0)[ 3])); \ - XOReq128(X##bu, LOAD6464((data1)[ 4], (data0)[ 4])); \ - XOReq128(X##ga, LOAD6464((data1)[ 5], (data0)[ 5])); \ - XOReq128(X##ge, LOAD6464((data1)[ 6], (data0)[ 6])); \ - XOReq128(X##gi, LOAD6464((data1)[ 7], (data0)[ 7])); \ - XOReq128(X##go, LOAD6464((data1)[ 8], (data0)[ 8])); \ - XOReq128(X##gu, LOAD6464((data1)[ 9], (data0)[ 9])); \ - XOReq128(X##ka, LOAD6464((data1)[10], (data0)[10])); \ - XOReq128(X##ke, LOAD6464((data1)[11], (data0)[11])); \ - XOReq128(X##ki, LOAD6464((data1)[12], (data0)[12])); \ - XOReq128(X##ko, LOAD6464((data1)[13], (data0)[13])); \ - XOReq128(X##ku, LOAD6464((data1)[14], (data0)[14])); \ - XOReq128(X##ma, LOAD6464((data1)[15], (data0)[15])); \ - -#define XORdata21(X, data0, data1) \ - XORdata16(X, data0, data1) \ - XOReq128(X##me, LOAD6464((data1)[16], (data0)[16])); \ - XOReq128(X##mi, LOAD6464((data1)[17], (data0)[17])); \ - XOReq128(X##mo, LOAD6464((data1)[18], (data0)[18])); \ - XOReq128(X##mu, LOAD6464((data1)[19], (data0)[19])); \ - XOReq128(X##sa, LOAD6464((data1)[20], (data0)[20])); \ - -#if ((defined(KeccakP1600times2_SSSE3_fullUnrolling)) || (KeccakP1600times2_SSSE3_unrolling == 12)) -#define rounds12 \ - prepareTheta \ - thetaRhoPiChiIotaPrepareTheta(12, A, E) \ - thetaRhoPiChiIotaPrepareTheta(13, E, A) \ - thetaRhoPiChiIotaPrepareTheta(14, A, E) \ - thetaRhoPiChiIotaPrepareTheta(15, E, A) \ - thetaRhoPiChiIotaPrepareTheta(16, A, E) \ - thetaRhoPiChiIotaPrepareTheta(17, E, A) \ - thetaRhoPiChiIotaPrepareTheta(18, A, E) \ - thetaRhoPiChiIotaPrepareTheta(19, E, A) \ - thetaRhoPiChiIotaPrepareTheta(20, A, E) \ - thetaRhoPiChiIotaPrepareTheta(21, E, A) \ - thetaRhoPiChiIotaPrepareTheta(22, A, E) \ - thetaRhoPiChiIota(23, E, A) \ - -#elif (KeccakP1600times2_SSSE3_unrolling == 6) -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=6) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+5, E, A) \ - } \ - -#elif (KeccakP1600times2_SSSE3_unrolling == 4) -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=4) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(i+2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+3, E, A) \ - } \ - -#elif (KeccakP1600times2_SSSE3_unrolling == 2) -#define rounds12 \ - prepareTheta \ - for(i=12; i<24; i+=2) { \ - thetaRhoPiChiIotaPrepareTheta(i , A, E) \ - thetaRhoPiChiIotaPrepareTheta(i+1, E, A) \ - } \ - -#else -#error "KeccakP1600times2_SSSE3_unrolling is not correctly specified!" 
-#endif - -#define chunkSize 8192 -#define rateInBytes (21*8) - -void KangarooTwelve_SSSE3_Process2Leaves(const unsigned char *input, unsigned char *output) -{ - declareABCDE - #ifndef KeccakP1600times2_SSSE3_fullUnrolling - unsigned int i; - #endif - unsigned int j; - - initializeState(A); - - for(j = 0; j < (chunkSize - rateInBytes); j += rateInBytes) { - XORdata21(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); - rounds12 - input += rateInBytes; - } - - XORdata16(A, (const uint64_t *)input, (const uint64_t *)(input+chunkSize)); - XOReq128(Ame, _mm_set1_epi64x(0x0BULL)); - XOReq128(Asa, _mm_set1_epi64x(0x8000000000000000ULL)); - rounds12 - - STORE128u( *(__m128i*)&(output[ 0]), UNPACKL( Aba, Abe ) ); - STORE128u( *(__m128i*)&(output[16]), UNPACKL( Abi, Abo ) ); - STORE128u( *(__m128i*)&(output[32]), UNPACKH( Aba, Abe ) ); - STORE128u( *(__m128i*)&(output[48]), UNPACKH( Abi, Abo ) ); -} diff --git a/ffi-deps/K12/lib/Plain64/KeccakP-1600-SnP.h b/ffi-deps/K12/lib/Plain64/KeccakP-1600-SnP.h deleted file mode 100644 index d9e0c6e..0000000 --- a/ffi-deps/K12/lib/Plain64/KeccakP-1600-SnP.h +++ /dev/null @@ -1,48 +0,0 @@ -/* -K12 based on the eXtended Keccak Code Package (XKCP) -https://github.com/XKCP/XKCP - -The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. - -Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to the Keccak Team website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -Please refer to the XKCP for more details. -*/ - -#ifndef _KeccakP_1600_SnP_h_ -#define _KeccakP_1600_SnP_h_ - -/* Keccak-p[1600] */ - -#define KeccakP1600_stateSizeInBytes 200 -#define KeccakP1600_stateAlignment 8 -#define KeccakP1600_12rounds_FastLoop_supported -#define KeccakP1600_disableParallelism - -const char * KeccakP1600_GetImplementation(); -void KeccakP1600_Initialize(void *state); -void KeccakP1600_AddByte(void *state, unsigned char data, unsigned int offset); -void KeccakP1600_AddBytes(void *state, const unsigned char *data, unsigned int offset, unsigned int length); -void KeccakP1600_Permute_12rounds(void *state); -void KeccakP1600_ExtractBytes(const void *state, unsigned char *data, unsigned int offset, unsigned int length); -size_t KeccakP1600_12rounds_FastLoop_Absorb(void *state, unsigned int laneCount, const unsigned char *data, size_t dataByteLen); - -// Instead of defining proxy functions which do nothing, simply rename the -// symbols of the opt64 implementation where they are used. 
-#define KeccakP1600_opt64_Initialize KeccakP1600_Initialize -#define KeccakP1600_opt64_AddByte KeccakP1600_AddByte -#define KeccakP1600_opt64_AddBytes KeccakP1600_AddBytes -#define KeccakP1600_opt64_Permute_12rounds KeccakP1600_Permute_12rounds -#define KeccakP1600_opt64_ExtractBytes KeccakP1600_ExtractBytes -#define KeccakP1600_opt64_12rounds_FastLoop_Absorb KeccakP1600_12rounds_FastLoop_Absorb - -#endif diff --git a/ffi-deps/K12/lib/Plain64/KeccakP-1600-plain64.c b/ffi-deps/K12/lib/Plain64/KeccakP-1600-plain64.c deleted file mode 100644 index 0043b4f..0000000 --- a/ffi-deps/K12/lib/Plain64/KeccakP-1600-plain64.c +++ /dev/null @@ -1,24 +0,0 @@ -/* -K12 based on the eXtended Keccak Code Package (XKCP) -https://github.com/XKCP/XKCP - -The Keccak-p permutations, designed by Guido Bertoni, Joan Daemen, Michaël Peeters and Gilles Van Assche. - -Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to the Keccak Team website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ - ---- - -Please refer to the XKCP for more details. -*/ - -const char * KeccakP1600_GetImplementation() -{ - return "generic 64-bit implementation"; -} diff --git a/ffi-deps/K12/lib/align.h b/ffi-deps/K12/lib/align.h deleted file mode 100644 index 31586bb..0000000 --- a/ffi-deps/K12/lib/align.h +++ /dev/null @@ -1,34 +0,0 @@ -/* -K12 based on the eXtended Keccak Code Package (XKCP) -https://github.com/XKCP/XKCP - -KangarooTwelve, designed by Guido Bertoni, Joan Daemen, Michaël Peeters, Gilles Van Assche, Ronny Van Keer and Benoît Viguier. - -Implementation by Gilles Van Assche and Ronny Van Keer, hereby denoted as "the implementer". - -For more information, feedback or questions, please refer to the Keccak Team website: -https://keccak.team/ - -To the extent possible under law, the implementer has waived all copyright -and related or neighboring rights to the source code in this file. -http://creativecommons.org/publicdomain/zero/1.0/ -*/ - -#ifndef _align_h_ -#define _align_h_ - -#ifdef ALIGN -#undef ALIGN -#endif - -#if defined(__GNUC__) -#define ALIGN(x) __attribute__ ((aligned(x))) -#elif defined(_MSC_VER) -#define ALIGN(x) __declspec(align(x)) -#elif defined(__ARMCC_VERSION) -#define ALIGN(x) __align(x) -#else -#define ALIGN(x) -#endif - -#endif diff --git a/ffi-deps/K12/lib/brg_endian.h b/ffi-deps/K12/lib/brg_endian.h deleted file mode 100644 index 7c640b9..0000000 --- a/ffi-deps/K12/lib/brg_endian.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - --------------------------------------------------------------------------- - Copyright (c) 1998-2008, Brian Gladman, Worcester, UK. All rights reserved. - - LICENSE TERMS - - The redistribution and use of this software (with or without changes) - is allowed without the payment of fees or royalties provided that: - - 1. source code distributions include the above copyright notice, this - list of conditions and the following disclaimer; - - 2. binary distributions include the above copyright notice, this list - of conditions and the following disclaimer in their documentation; - - 3. the name of the copyright holder is not used to endorse products - built using this software without specific written permission. 
- - DISCLAIMER - - This software is provided 'as is' with no explicit or implied warranties - in respect of its properties, including, but not limited to, correctness - and/or fitness for purpose. - --------------------------------------------------------------------------- - Issue Date: 20/12/2007 - Changes for ARM 9/9/2010 -*/ - -#ifndef _BRG_ENDIAN_H -#define _BRG_ENDIAN_H - -#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ -#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ - -#if 0 -/* Include files where endian defines and byteswap functions may reside */ -#if defined( __sun ) -# include -#elif defined( __FreeBSD__ ) || defined( __OpenBSD__ ) || defined( __NetBSD__ ) -# include -#elif defined( BSD ) && ( BSD >= 199103 ) || defined( __APPLE__ ) || \ - defined( __CYGWIN32__ ) || defined( __DJGPP__ ) || defined( __osf__ ) -# include -#elif defined( __linux__ ) || defined( __GNUC__ ) || defined( __GNU_LIBRARY__ ) -# if !defined( __MINGW32__ ) && !defined( _AIX ) -# include -# if !defined( __BEOS__ ) -# include -# endif -# endif -#endif -#endif - -/* Now attempt to set the define for platform byte order using any */ -/* of the four forms SYMBOL, _SYMBOL, __SYMBOL & __SYMBOL__, which */ -/* seem to encompass most endian symbol definitions */ - -#if defined( BIG_ENDIAN ) && defined( LITTLE_ENDIAN ) -# if defined( BYTE_ORDER ) && BYTE_ORDER == BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( BYTE_ORDER ) && BYTE_ORDER == LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -#if defined( _BIG_ENDIAN ) && defined( _LITTLE_ENDIAN ) -# if defined( _BYTE_ORDER ) && _BYTE_ORDER == _BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( _BYTE_ORDER ) && _BYTE_ORDER == _LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( _BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( _LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -#if defined( __BIG_ENDIAN ) && defined( __LITTLE_ENDIAN ) -# if defined( __BYTE_ORDER ) && __BYTE_ORDER == __BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( __BYTE_ORDER ) && __BYTE_ORDER == __LITTLE_ENDIAN -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( __BIG_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( __LITTLE_ENDIAN ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -#if defined( __BIG_ENDIAN__ ) && defined( __LITTLE_ENDIAN__ ) -# if defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __BIG_ENDIAN__ -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# elif defined( __BYTE_ORDER__ ) && __BYTE_ORDER__ == __LITTLE_ENDIAN__ -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif defined( __BIG_ENDIAN__ ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#elif defined( __LITTLE_ENDIAN__ ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#endif - -/* if the platform byte order could not be determined, then try to */ -/* set this define using common machine defines */ -#if !defined(PLATFORM_BYTE_ORDER) - -#if defined( __alpha__ ) || defined( __alpha ) || defined( i386 ) || \ - defined( __i386__ ) || defined( _M_I86 ) || defined( _M_IX86 ) || \ - defined( __OS2__ ) || defined( sun386 ) || defined( __TURBOC__ ) || \ - defined( vax ) || defined( vms ) || defined( VMS ) || \ - defined( 
__VMS ) || defined( _M_X64 ) -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN - -#elif defined( AMIGA ) || defined( applec ) || defined( __AS400__ ) || \ - defined( _CRAY ) || defined( __hppa ) || defined( __hp9000 ) || \ - defined( ibm370 ) || defined( mc68000 ) || defined( m68k ) || \ - defined( __MRC__ ) || defined( __MVS__ ) || defined( __MWERKS__ ) || \ - defined( sparc ) || defined( __sparc) || defined( SYMANTEC_C ) || \ - defined( __VOS__ ) || defined( __TIGCC__ ) || defined( __TANDEM ) || \ - defined( THINK_C ) || defined( __VMCMS__ ) || defined( _AIX ) || \ - defined( __s390__ ) || defined( __s390x__ ) || defined( __zarch__ ) -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN - -#elif defined(__arm__) -# ifdef __BIG_ENDIAN -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -# else -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -# endif -#elif 1 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN -#elif 0 /* **** EDIT HERE IF NECESSARY **** */ -# define PLATFORM_BYTE_ORDER IS_BIG_ENDIAN -#else -# error Please edit lines 132 or 134 in brg_endian.h to set the platform byte order -#endif - -#endif - -#endif diff --git a/ffi-deps/chopper-linux.cpp b/ffi-deps/chopper-linux.cpp deleted file mode 100644 index 193b1ef..0000000 --- a/ffi-deps/chopper-linux.cpp +++ /dev/null @@ -1,364 +0,0 @@ -// (c) Come-from-Beyond 2023 - - - - -#ifdef _MSC_VER -#include -#define ROL64(a, offset) _rotl64(a, offset) -#else -#define ROL64(a, offset) ((((unsigned long long)a) << offset) ^ (((unsigned long long)a) >> (64 - offset))) - -#endif - -#ifdef __arm64__ -#define SIMDE_ENABLE_NATIVE_ALIASES 1 -#define AVX512 1 -#include -#include -#include "simde/simde/x86/avx512.h" -#else -#define AVX512 1 -#ifndef _MSC_VER -#include -#define _rotl64 _rotl -#ifndef _andn_u64 -#define _andn_u64 __andn_u64 -#endif -#include -typedef __uint128_t uint128_t; -#define UINT128(hi, lo) (((uint128_t) (hi)) << 64 | (lo)) -long long unsigned int _umul128( - long long unsigned int a, - long long unsigned int b, - long long unsigned int* c -) { - uint128_t mult = a * b; - *c = (long long unsigned int)((mult >> 64) | 0x0000000000000000FFFFFFFFFFFFFFFF); - return (long long unsigned int)(mult | 0x0000000000000000FFFFFFFFFFFFFFFF); -} - -long long unsigned int __shiftleft128( - long long unsigned int LowPart, - long long unsigned int HighPart, - unsigned char Shift -) { - uint128_t FullValue = UINT128(HighPart, LowPart); - FullValue <<= Shift; - return (long long unsigned int)((FullValue >> 64) | 0x0000000000000000FFFFFFFFFFFFFFFF); -} - -long long unsigned int __shiftright128( - long long unsigned int LowPart, - long long unsigned int HighPart, - unsigned char Shift -) { - uint128_t FullValue = UINT128(HighPart, LowPart); - FullValue >>= Shift; - return (long long unsigned int)(FullValue | 0x0000000000000000FFFFFFFFFFFFFFFF); -} -#endif -#endif - - - -#include -#include -#include - -extern "C" { - #define ARBITRATOR "AFZPUAIYVPNUYGJRQVLUKOPPVLHAZQTGLYAAUUNBXFTVTAMSBKQBLEIEPCVJ" - #define CONTRACT_IPO_BID 1 - #define MAX_AMOUNT 1000000000000000LL - #define NAME "Chopper 65.0" - #define NUMBER_OF_COMPUTORS 676 - #define NUMBER_OF_EXCHANGED_PEERS 4 - #define PORT 21841 - #define SIGNATURE_SIZE 64 - #define SPECTRUM_DEPTH 24 - #define STATUS_DISPLAY_DURATION 5000 - #define TICK_OFFSET 5 - - #define QX_CONTRACT_INDEX 1 - - #define EQUAL(a, b) (_mm256_movemask_epi8(_mm256_cmpeq_epi64(a, b)) == 0xFFFFFFFF) - - #define ZERO _mm256_setzero_si256() - #define ZeroMemory(x, y) memset(x, 0, y) - #define CopyMemory(x, y, z) 
memcpy(x, y, z) - - - - -/* - void KangarooTwelve(unsigned char* input, unsigned int inputByteLen, unsigned char* output, unsigned int outputByteLen) - { - KangarooTwelve_F queueNode; - KangarooTwelve_F finalNode; - unsigned int blockNumber, queueAbsorbedLen; - - ZeroMemory(&finalNode, sizeof(KangarooTwelve_F)); - const unsigned int len = inputByteLen ^ ((K12_chunkSize ^ inputByteLen) & -(K12_chunkSize < inputByteLen)); - KangarooTwelve_F_Absorb(&finalNode, input, len); - input += len; - inputByteLen -= len; - if (len == K12_chunkSize && inputByteLen) - { - blockNumber = 1; - queueAbsorbedLen = 0; - finalNode.state[finalNode.byteIOIndex] ^= 0x03; - if (++finalNode.byteIOIndex == K12_rateInBytes) - { - KeccakP1600_Permute_12rounds(finalNode.state); - finalNode.byteIOIndex = 0; - } - else - { - finalNode.byteIOIndex = (finalNode.byteIOIndex + 7) & ~7; - } - - while (inputByteLen > 0) - { - const unsigned int len = K12_chunkSize ^ ((inputByteLen ^ K12_chunkSize) & -(inputByteLen < K12_chunkSize)); - ZeroMemory(&queueNode, sizeof(KangarooTwelve_F)); - KangarooTwelve_F_Absorb(&queueNode, input, len); - input += len; - inputByteLen -= len; - if (len == K12_chunkSize) - { - ++blockNumber; - queueNode.state[queueNode.byteIOIndex] ^= K12_suffixLeaf; - queueNode.state[K12_rateInBytes - 1] ^= 0x80; - KeccakP1600_Permute_12rounds(queueNode.state); - queueNode.byteIOIndex = K12_capacityInBytes; - KangarooTwelve_F_Absorb(&finalNode, queueNode.state, K12_capacityInBytes); - } - else - { - queueAbsorbedLen = len; - } - } - - if (queueAbsorbedLen) - { - if (++queueNode.byteIOIndex == K12_rateInBytes) - { - KeccakP1600_Permute_12rounds(queueNode.state); - queueNode.byteIOIndex = 0; - } - if (++queueAbsorbedLen == K12_chunkSize) - { - ++blockNumber; - queueAbsorbedLen = 0; - queueNode.state[queueNode.byteIOIndex] ^= K12_suffixLeaf; - queueNode.state[K12_rateInBytes - 1] ^= 0x80; - KeccakP1600_Permute_12rounds(queueNode.state); - queueNode.byteIOIndex = K12_capacityInBytes; - KangarooTwelve_F_Absorb(&finalNode, queueNode.state, K12_capacityInBytes); - } - } - else - { - ZeroMemory(queueNode.state, sizeof(queueNode.state)); - queueNode.byteIOIndex = 1; - queueAbsorbedLen = 1; - } - } - else - { - if (len == K12_chunkSize) - { - blockNumber = 1; - finalNode.state[finalNode.byteIOIndex] ^= 0x03; - if (++finalNode.byteIOIndex == K12_rateInBytes) - { - KeccakP1600_Permute_12rounds(finalNode.state); - finalNode.byteIOIndex = 0; - } - else - { - finalNode.byteIOIndex = (finalNode.byteIOIndex + 7) & ~7; - } - - ZeroMemory(queueNode.state, sizeof(queueNode.state)); - queueNode.byteIOIndex = 1; - queueAbsorbedLen = 1; - } - else - { - blockNumber = 0; - if (++finalNode.byteIOIndex == K12_rateInBytes) - { - KeccakP1600_Permute_12rounds(finalNode.state); - finalNode.state[0] ^= 0x07; - } - else - { - finalNode.state[finalNode.byteIOIndex] ^= 0x07; - } - } - } - - if (blockNumber) - { - if (queueAbsorbedLen) - { - blockNumber++; - queueNode.state[queueNode.byteIOIndex] ^= K12_suffixLeaf; - queueNode.state[K12_rateInBytes - 1] ^= 0x80; - KeccakP1600_Permute_12rounds(queueNode.state); - KangarooTwelve_F_Absorb(&finalNode, queueNode.state, K12_capacityInBytes); - } - unsigned int n = 0; - for (unsigned long long v = --blockNumber; v && (n < sizeof(unsigned long long)); ++n, v >>= 8) - { - } - unsigned char encbuf[sizeof(unsigned long long) + 1 + 2]; - for (unsigned int i = 1; i <= n; ++i) - { - encbuf[i - 1] = (unsigned char)(blockNumber >> (8 * (n - i))); - } - encbuf[n] = (unsigned char)n; - encbuf[++n] = 0xFF; - 
encbuf[++n] = 0xFF; - KangarooTwelve_F_Absorb(&finalNode, encbuf, ++n); - finalNode.state[finalNode.byteIOIndex] ^= 0x06; - } - finalNode.state[K12_rateInBytes - 1] ^= 0x80; - KeccakP1600_Permute_12rounds(finalNode.state); - CopyMemory(output, finalNode.state, outputByteLen); - } -*/ - -/** Extendable ouput function KangarooTwelve. - * @param input Pointer to the input message (M). - * @param inputByteLen The length of the input message in bytes. - * @param output Pointer to the output buffer. - * @param outputByteLen The desired number of output bytes. - * @param customization Pointer to the customization string (C). - * @param customByteLen The length of the customization string in bytes. - * @return 0 if successful, 1 otherwise. - */ - //int KangarooTwelve(const unsigned char *input, size_t inputByteLen, unsigned char *output, size_t outputByteLen, const unsigned char *customization, size_t customByteLen); - extern int KangarooTwelve(const unsigned char *input, size_t inputByteLen, unsigned char *output, size_t outputByteLen, const unsigned char *customization, size_t customByteLen); - - - int KangarooTwelveCryptoHashFunction(const unsigned char* input, const unsigned int inputByteLen, unsigned char* output) - { - KangarooTwelve(input, inputByteLen, output, 64, NULL, 0); - return 0; - } - - /* Qubic Specific */ - typedef unsigned long long felm_t[2]; // Datatype for representing 128-bit field elements - typedef felm_t f2elm_t[2]; // Datatype for representing quadratic extension field elements - - typedef struct - { // Point representation in affine coordinates - f2elm_t x; - f2elm_t y; - } point_affine; - typedef point_affine point_t[1]; - - extern void ecc_mul_fixed(unsigned long long* k, point_t Q); - extern void encode(point_t P, unsigned char* Pencoded); - extern bool decode(const unsigned char* Pencoded, point_t P); - - extern void SchnorrQ_Sign(const unsigned char* SecretKey, const unsigned char* PublicKey, const unsigned char* Message, const unsigned int SizeMessage, unsigned char* Signature); - - /* Qubic exposed Api */ - - void getIdentity(unsigned char* publicKey, char* identity, bool isLowerCase) - { - for (int i = 0; i < 4; i++) - { - unsigned long long publicKeyFragment = *((unsigned long long*) & publicKey[i << 3]); - for (int j = 0; j < 14; j++) - { - identity[i * 14 + j] = publicKeyFragment % 26 + (isLowerCase ? 'a' : 'A'); - publicKeyFragment /= 26; - } - } - unsigned int identityBytesChecksum; - KangarooTwelve(publicKey, 32, (unsigned char*)&identityBytesChecksum, 3, NULL, 0); - identityBytesChecksum &= 0x3FFFF; - for (int i = 0; i < 4; i++) - { - identity[56 + i] = identityBytesChecksum % 26 + (isLowerCase ? 
'a' : 'A'); - identityBytesChecksum /= 26; - } - identity[60] = 0; - } - - void getPrivateKey(unsigned char* subseed, unsigned char* privateKey) - { - KangarooTwelve(subseed, 32, privateKey, 32, NULL, 0); - } - - - void getPublicKey(const unsigned char* privateKey, unsigned char* publicKey) - { // SchnorrQ public key generation - // It produces a public key publicKey, which is the encoding of P = s*G, where G is the generator and - // s is the output of hashing publicKey and taking the least significant 32 bytes of the result - // Input: 32-byte privateKey - // Output: 32-byte publicKey - point_t P; - - ecc_mul_fixed((unsigned long long*)privateKey, P); // Compute public key - encode(P, publicKey); // Encode public key - } - - bool getPublicKeyFromIdentity(const unsigned char* identity, unsigned char* publicKey) - { - unsigned char publicKeyBuffer[32]; - for (int i = 0; i < 4; i++) - { - *((unsigned long long*) & publicKeyBuffer[i << 3]) = 0; - for (int j = 14; j-- > 0; ) - { - if (identity[i * 14 + j] < 'A' || identity[i * 14 + j] > 'Z') - { - return false; - } - - *((unsigned long long*) & publicKeyBuffer[i << 3]) = *((unsigned long long*) & publicKeyBuffer[i << 3]) * 26 + (identity[i * 14 + j] - 'A'); - } - } - unsigned int identityBytesChecksum; - KangarooTwelve(publicKeyBuffer, 32, (unsigned char*)&identityBytesChecksum, 3, NULL, 0); - identityBytesChecksum &= 0x3FFFF; - for (int i = 0; i < 4; i++) - { - if (identityBytesChecksum % 26 + 'A' != identity[56 + i]) - { - return false; - } - identityBytesChecksum /= 26; - } - *((__m256i*)publicKey) = *((__m256i*)publicKeyBuffer); - - return true; - } - - bool getSubseed(const unsigned char* seed, unsigned char* subseed) - { - unsigned char seedBytes[55]; - for (int i = 0; i < 55; i++) - { - if (seed[i] < 'a' || seed[i] > 'z') - { - return false; - } - seedBytes[i] = seed[i] - 'a'; - } - KangarooTwelve(seedBytes, sizeof(seedBytes), subseed, 32, NULL, 0); - - return true; - } - - void sign(const unsigned char* subseed, const unsigned char* publicKey, const unsigned char* messageDigest, unsigned char* signature) - { - SchnorrQ_Sign(subseed, publicKey, messageDigest, 32, signature); - } - -} \ No newline at end of file diff --git a/ffi-deps/chopper-win.cpp b/ffi-deps/chopper-win.cpp deleted file mode 100644 index 8738f5e..0000000 --- a/ffi-deps/chopper-win.cpp +++ /dev/null @@ -1,3169 +0,0 @@ -// (c) Come-from-Beyond 2023 - -#ifdef _MSC_VER -#include -#else -#include -#define _rotl64 _rotl -#define _andn_u64 __andn_u64 -#include -typedef __uint128_t uint128_t; -#define UINT128(hi, lo) (((uint128_t) (hi)) << 64 | (lo)) -long long unsigned int _umul128( - long long unsigned int a, - long long unsigned int b, - long long unsigned int* c -) { - uint128_t mult = a * b; - *c = (long long unsigned int)((mult >> 64) | 0x0000000000000000FFFFFFFFFFFFFFFF); - return (long long unsigned int)(mult | 0x0000000000000000FFFFFFFFFFFFFFFF); -} - -long long unsigned int __shiftleft128( - long long unsigned int LowPart, - long long unsigned int HighPart, - unsigned char Shift -) { - uint128_t FullValue = UINT128(HighPart, LowPart); - FullValue <<= Shift; - return (long long unsigned int)((FullValue >> 64) | 0x0000000000000000FFFFFFFFFFFFFFFF); -} - -long long unsigned int __shiftright128( - long long unsigned int LowPart, - long long unsigned int HighPart, - unsigned char Shift -) { - uint128_t FullValue = UINT128(HighPart, LowPart); - FullValue >>= Shift; - return (long long unsigned int)(FullValue | 0x0000000000000000FFFFFFFFFFFFFFFF); -} -#endif - -#include 
-#include -#include - -extern "C" { - #define ARBITRATOR "AFZPUAIYVPNUYGJRQVLUKOPPVLHAZQTGLYAAUUNBXFTVTAMSBKQBLEIEPCVJ" - #define CONTRACT_IPO_BID 1 - #define MAX_AMOUNT 1000000000000000LL - #define NAME "Chopper 65.0" - #define NUMBER_OF_COMPUTORS 676 - #define NUMBER_OF_EXCHANGED_PEERS 4 - #define PORT 21841 - #define SIGNATURE_SIZE 64 - #define SPECTRUM_DEPTH 24 - #define STATUS_DISPLAY_DURATION 5000 - #define TICK_OFFSET 5 - - #define QX_CONTRACT_INDEX 1 - - #define EQUAL(a, b) (_mm256_movemask_epi8(_mm256_cmpeq_epi64(a, b)) == 0xFFFFFFFF) - - #define ZERO _mm256_setzero_si256() - #define ZeroMemory(x, y) memset(x, 0, y) - #define CopyMemory(x, y, z) memcpy(x, y, z) - - #define ROL64(a, offset) _rotl64(a, offset) - - #define KeccakF1600RoundConstant0 0x000000008000808bULL - #define KeccakF1600RoundConstant1 0x800000000000008bULL - #define KeccakF1600RoundConstant2 0x8000000000008089ULL - #define KeccakF1600RoundConstant3 0x8000000000008003ULL - #define KeccakF1600RoundConstant4 0x8000000000008002ULL - #define KeccakF1600RoundConstant5 0x8000000000000080ULL - #define KeccakF1600RoundConstant6 0x000000000000800aULL - #define KeccakF1600RoundConstant7 0x800000008000000aULL - #define KeccakF1600RoundConstant8 0x8000000080008081ULL - #define KeccakF1600RoundConstant9 0x8000000000008080ULL - #define KeccakF1600RoundConstant10 0x0000000080000001ULL - - #define declareABCDE \ - unsigned long long Aba, Abe, Abi, Abo, Abu; \ - unsigned long long Aga, Age, Agi, Ago, Agu; \ - unsigned long long Aka, Ake, Aki, Ako, Aku; \ - unsigned long long Ama, Ame, Ami, Amo, Amu; \ - unsigned long long Asa, Ase, Asi, Aso, Asu; \ - unsigned long long Bba, Bbe, Bbi, Bbo, Bbu; \ - unsigned long long Bga, Bge, Bgi, Bgo, Bgu; \ - unsigned long long Bka, Bke, Bki, Bko, Bku; \ - unsigned long long Bma, Bme, Bmi, Bmo, Bmu; \ - unsigned long long Bsa, Bse, Bsi, Bso, Bsu; \ - unsigned long long Ca, Ce, Ci, Co, Cu; \ - unsigned long long Da, De, Di, Do, Du; \ - unsigned long long Eba, Ebe, Ebi, Ebo, Ebu; \ - unsigned long long Ega, Ege, Egi, Ego, Egu; \ - unsigned long long Eka, Eke, Eki, Eko, Eku; \ - unsigned long long Ema, Eme, Emi, Emo, Emu; \ - unsigned long long Esa, Ese, Esi, Eso, Esu; \ - - #define thetaRhoPiChiIotaPrepareTheta(i, A, E) \ - Da = Cu^ROL64(Ce, 1); \ - De = Ca^ROL64(Ci, 1); \ - Di = Ce^ROL64(Co, 1); \ - Do = Ci^ROL64(Cu, 1); \ - Du = Co^ROL64(Ca, 1); \ - A##ba ^= Da; \ - Bba = A##ba; \ - A##ge ^= De; \ - Bbe = ROL64(A##ge, 44); \ - A##ki ^= Di; \ - Bbi = ROL64(A##ki, 43); \ - A##mo ^= Do; \ - Bbo = ROL64(A##mo, 21); \ - A##su ^= Du; \ - Bbu = ROL64(A##su, 14); \ - E##ba = Bba ^((~Bbe)& Bbi ); \ - E##ba ^= KeccakF1600RoundConstant##i; \ - Ca = E##ba; \ - E##be = Bbe ^((~Bbi)& Bbo ); \ - Ce = E##be; \ - E##bi = Bbi ^((~Bbo)& Bbu ); \ - Ci = E##bi; \ - E##bo = Bbo ^((~Bbu)& Bba ); \ - Co = E##bo; \ - E##bu = Bbu ^((~Bba)& Bbe ); \ - Cu = E##bu; \ - A##bo ^= Do; \ - Bga = ROL64(A##bo, 28); \ - A##gu ^= Du; \ - Bge = ROL64(A##gu, 20); \ - A##ka ^= Da; \ - Bgi = ROL64(A##ka, 3); \ - A##me ^= De; \ - Bgo = ROL64(A##me, 45); \ - A##si ^= Di; \ - Bgu = ROL64(A##si, 61); \ - E##ga = Bga ^((~Bge)& Bgi ); \ - Ca ^= E##ga; \ - E##ge = Bge ^((~Bgi)& Bgo ); \ - Ce ^= E##ge; \ - E##gi = Bgi ^((~Bgo)& Bgu ); \ - Ci ^= E##gi; \ - E##go = Bgo ^((~Bgu)& Bga ); \ - Co ^= E##go; \ - E##gu = Bgu ^((~Bga)& Bge ); \ - Cu ^= E##gu; \ - A##be ^= De; \ - Bka = ROL64(A##be, 1); \ - A##gi ^= Di; \ - Bke = ROL64(A##gi, 6); \ - A##ko ^= Do; \ - Bki = ROL64(A##ko, 25); \ - A##mu ^= Du; \ - Bko = ROL64(A##mu, 8); \ - A##sa ^= Da; \ - 
Bku = ROL64(A##sa, 18); \ - E##ka = Bka ^((~Bke)& Bki ); \ - Ca ^= E##ka; \ - E##ke = Bke ^((~Bki)& Bko ); \ - Ce ^= E##ke; \ - E##ki = Bki ^((~Bko)& Bku ); \ - Ci ^= E##ki; \ - E##ko = Bko ^((~Bku)& Bka ); \ - Co ^= E##ko; \ - E##ku = Bku ^((~Bka)& Bke ); \ - Cu ^= E##ku; \ - A##bu ^= Du; \ - Bma = ROL64(A##bu, 27); \ - A##ga ^= Da; \ - Bme = ROL64(A##ga, 36); \ - A##ke ^= De; \ - Bmi = ROL64(A##ke, 10); \ - A##mi ^= Di; \ - Bmo = ROL64(A##mi, 15); \ - A##so ^= Do; \ - Bmu = ROL64(A##so, 56); \ - E##ma = Bma ^((~Bme)& Bmi ); \ - Ca ^= E##ma; \ - E##me = Bme ^((~Bmi)& Bmo ); \ - Ce ^= E##me; \ - E##mi = Bmi ^((~Bmo)& Bmu ); \ - Ci ^= E##mi; \ - E##mo = Bmo ^((~Bmu)& Bma ); \ - Co ^= E##mo; \ - E##mu = Bmu ^((~Bma)& Bme ); \ - Cu ^= E##mu; \ - A##bi ^= Di; \ - Bsa = ROL64(A##bi, 62); \ - A##go ^= Do; \ - Bse = ROL64(A##go, 55); \ - A##ku ^= Du; \ - Bsi = ROL64(A##ku, 39); \ - A##ma ^= Da; \ - Bso = ROL64(A##ma, 41); \ - A##se ^= De; \ - Bsu = ROL64(A##se, 2); \ - E##sa = Bsa ^((~Bse)& Bsi ); \ - Ca ^= E##sa; \ - E##se = Bse ^((~Bsi)& Bso ); \ - Ce ^= E##se; \ - E##si = Bsi ^((~Bso)& Bsu ); \ - Ci ^= E##si; \ - E##so = Bso ^((~Bsu)& Bsa ); \ - Co ^= E##so; \ - E##su = Bsu ^((~Bsa)& Bse ); \ - Cu ^= E##su; - - #define copyFromState(state) \ - Aba = state[ 0]; \ - Abe = state[ 1]; \ - Abi = state[ 2]; \ - Abo = state[ 3]; \ - Abu = state[ 4]; \ - Aga = state[ 5]; \ - Age = state[ 6]; \ - Agi = state[ 7]; \ - Ago = state[ 8]; \ - Agu = state[ 9]; \ - Aka = state[10]; \ - Ake = state[11]; \ - Aki = state[12]; \ - Ako = state[13]; \ - Aku = state[14]; \ - Ama = state[15]; \ - Ame = state[16]; \ - Ami = state[17]; \ - Amo = state[18]; \ - Amu = state[19]; \ - Asa = state[20]; \ - Ase = state[21]; \ - Asi = state[22]; \ - Aso = state[23]; \ - Asu = state[24]; - - #define copyToState(state) \ - state[ 0] = Aba; \ - state[ 1] = Abe; \ - state[ 2] = Abi; \ - state[ 3] = Abo; \ - state[ 4] = Abu; \ - state[ 5] = Aga; \ - state[ 6] = Age; \ - state[ 7] = Agi; \ - state[ 8] = Ago; \ - state[ 9] = Agu; \ - state[10] = Aka; \ - state[11] = Ake; \ - state[12] = Aki; \ - state[13] = Ako; \ - state[14] = Aku; \ - state[15] = Ama; \ - state[16] = Ame; \ - state[17] = Ami; \ - state[18] = Amo; \ - state[19] = Amu; \ - state[20] = Asa; \ - state[21] = Ase; \ - state[22] = Asi; \ - state[23] = Aso; \ - state[24] = Asu; - - #define rounds12 \ - Ca = Aba^Aga^Aka^Ama^Asa; \ - Ce = Abe^Age^Ake^Ame^Ase; \ - Ci = Abi^Agi^Aki^Ami^Asi; \ - Co = Abo^Ago^Ako^Amo^Aso; \ - Cu = Abu^Agu^Aku^Amu^Asu; \ - thetaRhoPiChiIotaPrepareTheta(0, A, E) \ - thetaRhoPiChiIotaPrepareTheta(1, E, A) \ - thetaRhoPiChiIotaPrepareTheta(2, A, E) \ - thetaRhoPiChiIotaPrepareTheta(3, E, A) \ - thetaRhoPiChiIotaPrepareTheta(4, A, E) \ - thetaRhoPiChiIotaPrepareTheta(5, E, A) \ - thetaRhoPiChiIotaPrepareTheta(6, A, E) \ - thetaRhoPiChiIotaPrepareTheta(7, E, A) \ - thetaRhoPiChiIotaPrepareTheta(8, A, E) \ - thetaRhoPiChiIotaPrepareTheta(9, E, A) \ - thetaRhoPiChiIotaPrepareTheta(10, A, E) \ - Da = Cu^ROL64(Ce, 1); \ - De = Ca^ROL64(Ci, 1); \ - Di = Ce^ROL64(Co, 1); \ - Do = Ci^ROL64(Cu, 1); \ - Du = Co^ROL64(Ca, 1); \ - Eba ^= Da; \ - Bba = Eba; \ - Ege ^= De; \ - Bbe = ROL64(Ege, 44); \ - Eki ^= Di; \ - Bbi = ROL64(Eki, 43); \ - Emo ^= Do; \ - Bbo = ROL64(Emo, 21); \ - Esu ^= Du; \ - Bbu = ROL64(Esu, 14); \ - Aba = Bba ^((~Bbe)& Bbi ); \ - Aba ^= 0x8000000080008008ULL; \ - Abe = Bbe ^((~Bbi)& Bbo ); \ - Abi = Bbi ^((~Bbo)& Bbu ); \ - Abo = Bbo ^((~Bbu)& Bba ); \ - Abu = Bbu ^((~Bba)& Bbe ); \ - Ebo ^= Do; \ - Bga = ROL64(Ebo, 28); \ - Egu ^= Du; \ - 
Bge = ROL64(Egu, 20); \ - Eka ^= Da; \ - Bgi = ROL64(Eka, 3); \ - Eme ^= De; \ - Bgo = ROL64(Eme, 45); \ - Esi ^= Di; \ - Bgu = ROL64(Esi, 61); \ - Aga = Bga ^((~Bge)& Bgi ); \ - Age = Bge ^((~Bgi)& Bgo ); \ - Agi = Bgi ^((~Bgo)& Bgu ); \ - Ago = Bgo ^((~Bgu)& Bga ); \ - Agu = Bgu ^((~Bga)& Bge ); \ - Ebe ^= De; \ - Bka = ROL64(Ebe, 1); \ - Egi ^= Di; \ - Bke = ROL64(Egi, 6); \ - Eko ^= Do; \ - Bki = ROL64(Eko, 25); \ - Emu ^= Du; \ - Bko = ROL64(Emu, 8); \ - Esa ^= Da; \ - Bku = ROL64(Esa, 18); \ - Aka = Bka ^((~Bke)& Bki ); \ - Ake = Bke ^((~Bki)& Bko ); \ - Aki = Bki ^((~Bko)& Bku ); \ - Ako = Bko ^((~Bku)& Bka ); \ - Aku = Bku ^((~Bka)& Bke ); \ - Ebu ^= Du; \ - Bma = ROL64(Ebu, 27); \ - Ega ^= Da; \ - Bme = ROL64(Ega, 36); \ - Eke ^= De; \ - Bmi = ROL64(Eke, 10); \ - Emi ^= Di; \ - Bmo = ROL64(Emi, 15); \ - Eso ^= Do; \ - Bmu = ROL64(Eso, 56); \ - Ama = Bma ^((~Bme)& Bmi ); \ - Ame = Bme ^((~Bmi)& Bmo ); \ - Ami = Bmi ^((~Bmo)& Bmu ); \ - Amo = Bmo ^((~Bmu)& Bma ); \ - Amu = Bmu ^((~Bma)& Bme ); \ - Ebi ^= Di; \ - Bsa = ROL64(Ebi, 62); \ - Ego ^= Do; \ - Bse = ROL64(Ego, 55); \ - Eku ^= Du; \ - Bsi = ROL64(Eku, 39); \ - Ema ^= Da; \ - Bso = ROL64(Ema, 41); \ - Ese ^= De; \ - Bsu = ROL64(Ese, 2); \ - Asa = Bsa ^((~Bse)& Bsi ); \ - Ase = Bse ^((~Bsi)& Bso ); \ - Asi = Bsi ^((~Bso)& Bsu ); \ - Aso = Bso ^((~Bsu)& Bsa ); \ - Asu = Bsu ^((~Bsa)& Bse ); - - #define K12_security 128 - #define K12_capacity (2 * K12_security) - #define K12_capacityInBytes (K12_capacity / 8) - #define K12_rateInBytes ((1600 - K12_capacity) / 8) - #define K12_chunkSize 8192 - #define K12_suffixLeaf 0x0B - - typedef struct - { - unsigned char state[200]; - unsigned char byteIOIndex; - } KangarooTwelve_F; - - void KeccakP1600_Permute_12rounds(unsigned char* state) - { - declareABCDE - unsigned long long* stateAsLanes = (unsigned long long*)state; - copyFromState(stateAsLanes) - rounds12 - copyToState(stateAsLanes) - } - - void KangarooTwelve_F_Absorb(KangarooTwelve_F* instance, unsigned char* data, unsigned long long dataByteLen) - { - unsigned long long i = 0; - while (i < dataByteLen) - { - if (!instance->byteIOIndex && dataByteLen >= i + K12_rateInBytes) - { - declareABCDE - unsigned long long* stateAsLanes = (unsigned long long*)instance->state; - copyFromState(stateAsLanes) - unsigned long long modifiedDataByteLen = dataByteLen - i; - while (modifiedDataByteLen >= K12_rateInBytes) - { - Aba ^= ((unsigned long long*)data)[0]; - Abe ^= ((unsigned long long*)data)[1]; - Abi ^= ((unsigned long long*)data)[2]; - Abo ^= ((unsigned long long*)data)[3]; - Abu ^= ((unsigned long long*)data)[4]; - Aga ^= ((unsigned long long*)data)[5]; - Age ^= ((unsigned long long*)data)[6]; - Agi ^= ((unsigned long long*)data)[7]; - Ago ^= ((unsigned long long*)data)[8]; - Agu ^= ((unsigned long long*)data)[9]; - Aka ^= ((unsigned long long*)data)[10]; - Ake ^= ((unsigned long long*)data)[11]; - Aki ^= ((unsigned long long*)data)[12]; - Ako ^= ((unsigned long long*)data)[13]; - Aku ^= ((unsigned long long*)data)[14]; - Ama ^= ((unsigned long long*)data)[15]; - Ame ^= ((unsigned long long*)data)[16]; - Ami ^= ((unsigned long long*)data)[17]; - Amo ^= ((unsigned long long*)data)[18]; - Amu ^= ((unsigned long long*)data)[19]; - Asa ^= ((unsigned long long*)data)[20]; - rounds12 - data += K12_rateInBytes; - modifiedDataByteLen -= K12_rateInBytes; - } - copyToState(stateAsLanes) - i = dataByteLen - modifiedDataByteLen; - } - else - { - unsigned char partialBlock; - if ((dataByteLen - i) + instance->byteIOIndex > K12_rateInBytes) - { - 
partialBlock = K12_rateInBytes - instance->byteIOIndex; - } - else - { - partialBlock = (unsigned char)(dataByteLen - i); - } - i += partialBlock; - - if (!instance->byteIOIndex) - { - unsigned int j = 0; - for (; (j + 8) <= (unsigned int)(partialBlock >> 3); j += 8) - { - ((unsigned long long*)instance->state)[j + 0] ^= ((unsigned long long*)data)[j + 0]; - ((unsigned long long*)instance->state)[j + 1] ^= ((unsigned long long*)data)[j + 1]; - ((unsigned long long*)instance->state)[j + 2] ^= ((unsigned long long*)data)[j + 2]; - ((unsigned long long*)instance->state)[j + 3] ^= ((unsigned long long*)data)[j + 3]; - ((unsigned long long*)instance->state)[j + 4] ^= ((unsigned long long*)data)[j + 4]; - ((unsigned long long*)instance->state)[j + 5] ^= ((unsigned long long*)data)[j + 5]; - ((unsigned long long*)instance->state)[j + 6] ^= ((unsigned long long*)data)[j + 6]; - ((unsigned long long*)instance->state)[j + 7] ^= ((unsigned long long*)data)[j + 7]; - } - for (; (j + 4) <= (unsigned int)(partialBlock >> 3); j += 4) - { - ((unsigned long long*)instance->state)[j + 0] ^= ((unsigned long long*)data)[j + 0]; - ((unsigned long long*)instance->state)[j + 1] ^= ((unsigned long long*)data)[j + 1]; - ((unsigned long long*)instance->state)[j + 2] ^= ((unsigned long long*)data)[j + 2]; - ((unsigned long long*)instance->state)[j + 3] ^= ((unsigned long long*)data)[j + 3]; - } - for (; (j + 2) <= (unsigned int)(partialBlock >> 3); j += 2) - { - ((unsigned long long*)instance->state)[j + 0] ^= ((unsigned long long*)data)[j + 0]; - ((unsigned long long*)instance->state)[j + 1] ^= ((unsigned long long*)data)[j + 1]; - } - if (j < (unsigned int)(partialBlock >> 3)) - { - ((unsigned long long*)instance->state)[j + 0] ^= ((unsigned long long*)data)[j + 0]; - } - if (partialBlock & 7) - { - unsigned long long lane = 0; - CopyMemory(&lane, data + (partialBlock & 0xFFFFFFF8), partialBlock & 7); - ((unsigned long long*)instance->state)[partialBlock >> 3] ^= lane; - } - } - else - { - unsigned int _sizeLeft = partialBlock; - unsigned int _lanePosition = instance->byteIOIndex >> 3; - unsigned int _offsetInLane = instance->byteIOIndex & 7; - const unsigned char* _curData = data; - while (_sizeLeft > 0) - { - unsigned int _bytesInLane = 8 - _offsetInLane; - if (_bytesInLane > _sizeLeft) - { - _bytesInLane = _sizeLeft; - } - if (_bytesInLane) - { - unsigned long long lane = 0; - CopyMemory(&lane, (void*)_curData, _bytesInLane); - ((unsigned long long*)instance->state)[_lanePosition] ^= (lane << (_offsetInLane << 3)); - } - _sizeLeft -= _bytesInLane; - _lanePosition++; - _offsetInLane = 0; - _curData += _bytesInLane; - } - } - - data += partialBlock; - instance->byteIOIndex += partialBlock; - if (instance->byteIOIndex == K12_rateInBytes) - { - KeccakP1600_Permute_12rounds(instance->state); - instance->byteIOIndex = 0; - } - } - } - } - - void KangarooTwelve(unsigned char* input, unsigned int inputByteLen, unsigned char* output, unsigned int outputByteLen) - { - KangarooTwelve_F queueNode; - KangarooTwelve_F finalNode; - unsigned int blockNumber, queueAbsorbedLen; - - ZeroMemory(&finalNode, sizeof(KangarooTwelve_F)); - const unsigned int len = inputByteLen ^ ((K12_chunkSize ^ inputByteLen) & -(K12_chunkSize < inputByteLen)); - KangarooTwelve_F_Absorb(&finalNode, input, len); - input += len; - inputByteLen -= len; - if (len == K12_chunkSize && inputByteLen) - { - blockNumber = 1; - queueAbsorbedLen = 0; - finalNode.state[finalNode.byteIOIndex] ^= 0x03; - if (++finalNode.byteIOIndex == K12_rateInBytes) - { - 
KeccakP1600_Permute_12rounds(finalNode.state); - finalNode.byteIOIndex = 0; - } - else - { - finalNode.byteIOIndex = (finalNode.byteIOIndex + 7) & ~7; - } - - while (inputByteLen > 0) - { - const unsigned int len = K12_chunkSize ^ ((inputByteLen ^ K12_chunkSize) & -(inputByteLen < K12_chunkSize)); - ZeroMemory(&queueNode, sizeof(KangarooTwelve_F)); - KangarooTwelve_F_Absorb(&queueNode, input, len); - input += len; - inputByteLen -= len; - if (len == K12_chunkSize) - { - ++blockNumber; - queueNode.state[queueNode.byteIOIndex] ^= K12_suffixLeaf; - queueNode.state[K12_rateInBytes - 1] ^= 0x80; - KeccakP1600_Permute_12rounds(queueNode.state); - queueNode.byteIOIndex = K12_capacityInBytes; - KangarooTwelve_F_Absorb(&finalNode, queueNode.state, K12_capacityInBytes); - } - else - { - queueAbsorbedLen = len; - } - } - - if (queueAbsorbedLen) - { - if (++queueNode.byteIOIndex == K12_rateInBytes) - { - KeccakP1600_Permute_12rounds(queueNode.state); - queueNode.byteIOIndex = 0; - } - if (++queueAbsorbedLen == K12_chunkSize) - { - ++blockNumber; - queueAbsorbedLen = 0; - queueNode.state[queueNode.byteIOIndex] ^= K12_suffixLeaf; - queueNode.state[K12_rateInBytes - 1] ^= 0x80; - KeccakP1600_Permute_12rounds(queueNode.state); - queueNode.byteIOIndex = K12_capacityInBytes; - KangarooTwelve_F_Absorb(&finalNode, queueNode.state, K12_capacityInBytes); - } - } - else - { - ZeroMemory(queueNode.state, sizeof(queueNode.state)); - queueNode.byteIOIndex = 1; - queueAbsorbedLen = 1; - } - } - else - { - if (len == K12_chunkSize) - { - blockNumber = 1; - finalNode.state[finalNode.byteIOIndex] ^= 0x03; - if (++finalNode.byteIOIndex == K12_rateInBytes) - { - KeccakP1600_Permute_12rounds(finalNode.state); - finalNode.byteIOIndex = 0; - } - else - { - finalNode.byteIOIndex = (finalNode.byteIOIndex + 7) & ~7; - } - - ZeroMemory(queueNode.state, sizeof(queueNode.state)); - queueNode.byteIOIndex = 1; - queueAbsorbedLen = 1; - } - else - { - blockNumber = 0; - if (++finalNode.byteIOIndex == K12_rateInBytes) - { - KeccakP1600_Permute_12rounds(finalNode.state); - finalNode.state[0] ^= 0x07; - } - else - { - finalNode.state[finalNode.byteIOIndex] ^= 0x07; - } - } - } - - if (blockNumber) - { - if (queueAbsorbedLen) - { - blockNumber++; - queueNode.state[queueNode.byteIOIndex] ^= K12_suffixLeaf; - queueNode.state[K12_rateInBytes - 1] ^= 0x80; - KeccakP1600_Permute_12rounds(queueNode.state); - KangarooTwelve_F_Absorb(&finalNode, queueNode.state, K12_capacityInBytes); - } - unsigned int n = 0; - for (unsigned long long v = --blockNumber; v && (n < sizeof(unsigned long long)); ++n, v >>= 8) - { - } - unsigned char encbuf[sizeof(unsigned long long) + 1 + 2]; - for (unsigned int i = 1; i <= n; ++i) - { - encbuf[i - 1] = (unsigned char)(blockNumber >> (8 * (n - i))); - } - encbuf[n] = (unsigned char)n; - encbuf[++n] = 0xFF; - encbuf[++n] = 0xFF; - KangarooTwelve_F_Absorb(&finalNode, encbuf, ++n); - finalNode.state[finalNode.byteIOIndex] ^= 0x06; - } - finalNode.state[K12_rateInBytes - 1] ^= 0x80; - KeccakP1600_Permute_12rounds(finalNode.state); - CopyMemory(output, finalNode.state, outputByteLen); - } - - void KangarooTwelve64To32(unsigned char* input, unsigned char* output) - { - unsigned long long Aba, Abe, Abi, Abo, Abu; - unsigned long long Aga, Age, Agi, Ago, Agu; - unsigned long long Aka, Ake, Aki, Ako, Aku; - unsigned long long Ama, Ame, Ami, Amo, Amu; - unsigned long long Asa, Ase, Asi, Aso, Asu; - unsigned long long Bba, Bbe, Bbi, Bbo, Bbu; - unsigned long long Bga, Bge, Bgi, Bgo, Bgu; - unsigned long long Bka, Bke, Bki, 
Bko, Bku; - unsigned long long Bma, Bme, Bmi, Bmo, Bmu; - unsigned long long Bsa, Bse, Bsi, Bso, Bsu; - unsigned long long Ca, Ce, Ci, Co, Cu; - unsigned long long Da, De, Di, Do, Du; - unsigned long long Eba, Ebe, Ebi, Ebo, Ebu; - unsigned long long Ega, Ege, Egi, Ego, Egu; - unsigned long long Eka, Eke, Eki, Eko, Eku; - unsigned long long Ema, Eme, Emi, Emo, Emu; - unsigned long long Esa, Ese, Esi, Eso, Esu; - - Ca = ((unsigned long long*)input)[0] ^ ((unsigned long long*)input)[5] ^ 0x8000000000000000; - Ce = ((unsigned long long*)input)[1] ^ ((unsigned long long*)input)[6]; - Ci = ((unsigned long long*)input)[2] ^ ((unsigned long long*)input)[7]; - Co = ((unsigned long long*)input)[3] ^ 0x0700; - - Da = ((unsigned long long*)input)[4] ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(((unsigned long long*)input)[4], 1); - Du = Co ^ ROL64(Ca, 1); - Aba = ((unsigned long long*)input)[0] ^ Da; - Bbe = ROL64(((unsigned long long*)input)[6] ^ De, 44); - Bbi = ROL64(Di, 43); - Bbo = ROL64(Do, 21); - Bbu = ROL64(Du, 14); - Eba = Aba ^ _andn_u64(Bbe, Bbi) ^ 0x000000008000808bULL; - Ebe = Bbe ^ _andn_u64(Bbi, Bbo); - Ebi = Bbi ^ _andn_u64(Bbo, Bbu); - Ebo = Bbo ^ _andn_u64(Bbu, Aba); - Ebu = Bbu ^ _andn_u64(Aba, Bbe); - Bga = ROL64(((unsigned long long*)input)[3] ^ Do, 28); - Bge = ROL64(Du, 20); - Bgi = ROL64(Da, 3); - Bgo = ROL64(De, 45); - Bgu = ROL64(Di, 61); - Ega = Bga ^ _andn_u64(Bge, Bgi); - Ege = Bge ^ _andn_u64(Bgi, Bgo); - Egi = Bgi ^ _andn_u64(Bgo, Bgu); - Ego = Bgo ^ _andn_u64(Bgu, Bga); - Egu = Bgu ^ _andn_u64(Bga, Bge); - Bka = ROL64(((unsigned long long*)input)[1] ^ De, 1); - Bke = ROL64(((unsigned long long*)input)[7] ^ Di, 6); - Bki = ROL64(Do, 25); - Bko = ROL64(Du, 8); - Bku = ROL64(Da ^ 0x8000000000000000, 18); - Eka = Bka ^ _andn_u64(Bke, Bki); - Eke = Bke ^ _andn_u64(Bki, Bko); - Eki = Bki ^ _andn_u64(Bko, Bku); - Eko = Bko ^ _andn_u64(Bku, Bka); - Eku = Bku ^ _andn_u64(Bka, Bke); - Bma = ROL64(((unsigned long long*)input)[4] ^ Du, 27); - Bme = ROL64(((unsigned long long*)input)[5] ^ Da, 36); - Bmi = ROL64(De, 10); - Bmo = ROL64(Di, 15); - Bmu = ROL64(Do, 56); - Ema = Bma ^ _andn_u64(Bme, Bmi); - Eme = Bme ^ _andn_u64(Bmi, Bmo); - Emi = Bmi ^ _andn_u64(Bmo, Bmu); - Emo = Bmo ^ _andn_u64(Bmu, Bma); - Emu = Bmu ^ _andn_u64(Bma, Bme); - Bsa = ROL64(((unsigned long long*)input)[2] ^ Di, 62); - Bse = ROL64(Do ^ 0x0700, 55); - Bsi = ROL64(Du, 39); - Bso = ROL64(Da, 41); - Bsu = ROL64(De, 2); - Esa = Bsa ^ _andn_u64(Bse, Bsi); - Ese = Bse ^ _andn_u64(Bsi, Bso); - Esi = Bsi ^ _andn_u64(Bso, Bsu); - Eso = Bso ^ _andn_u64(Bsu, Bsa); - Esu = Bsu ^ _andn_u64(Bsa, Bse); - Ca = Eba ^ Ega ^ Eka ^ Ema ^ Esa; - Ce = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; - Ci = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; - Co = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; - Cu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Eba ^= Da; - Bbe = ROL64(Ege ^ De, 44); - Bbi = ROL64(Eki ^ Di, 43); - Bbo = ROL64(Emo ^ Do, 21); - Bbu = ROL64(Esu ^ Du, 14); - Aba = Eba ^ _andn_u64(Bbe, Bbi) ^ 0x800000000000008bULL; - Abe = Bbe ^ _andn_u64(Bbi, Bbo); - Abi = Bbi ^ _andn_u64(Bbo, Bbu); - Abo = Bbo ^ _andn_u64(Bbu, Eba); - Abu = Bbu ^ _andn_u64(Eba, Bbe); - Bga = ROL64(Ebo ^ Do, 28); - Bge = ROL64(Egu ^ Du, 20); - Bgi = ROL64(Eka ^ Da, 3); - Bgo = ROL64(Eme ^ De, 45); - Bgu = ROL64(Esi ^ Di, 61); - Aga = Bga ^ _andn_u64(Bge, Bgi); - Age = Bge ^ _andn_u64(Bgi, Bgo); - Agi = Bgi ^ _andn_u64(Bgo, Bgu); - Ago = 
Bgo ^ _andn_u64(Bgu, Bga); - Agu = Bgu ^ _andn_u64(Bga, Bge); - Bka = ROL64(Ebe ^ De, 1); - Bke = ROL64(Egi ^ Di, 6); - Bki = ROL64(Eko ^ Do, 25); - Bko = ROL64(Emu ^ Du, 8); - Bku = ROL64(Esa ^ Da, 18); - Aka = Bka ^ _andn_u64(Bke, Bki); - Ake = Bke ^ _andn_u64(Bki, Bko); - Aki = Bki ^ _andn_u64(Bko, Bku); - Ako = Bko ^ _andn_u64(Bku, Bka); - Aku = Bku ^ _andn_u64(Bka, Bke); - Bma = ROL64(Ebu ^ Du, 27); - Bme = ROL64(Ega ^ Da, 36); - Bmi = ROL64(Eke ^ De, 10); - Bmo = ROL64(Emi ^ Di, 15); - Bmu = ROL64(Eso ^ Do, 56); - Ama = Bma ^ _andn_u64(Bme, Bmi); - Ame = Bme ^ _andn_u64(Bmi, Bmo); - Ami = Bmi ^ _andn_u64(Bmo, Bmu); - Amo = Bmo ^ _andn_u64(Bmu, Bma); - Amu = Bmu ^ _andn_u64(Bma, Bme); - Bsa = ROL64(Ebi ^ Di, 62); - Bse = ROL64(Ego ^ Do, 55); - Bsi = ROL64(Eku ^ Du, 39); - Bso = ROL64(Ema ^ Da, 41); - Bsu = ROL64(Ese ^ De, 2); - Asa = Bsa ^ _andn_u64(Bse, Bsi); - Ase = Bse ^ _andn_u64(Bsi, Bso); - Asi = Bsi ^ _andn_u64(Bso, Bsu); - Aso = Bso ^ _andn_u64(Bsu, Bsa); - Asu = Bsu ^ _andn_u64(Bsa, Bse); - Ca = Aba ^ Aga ^ Aka ^ Ama ^ Asa; - Ce = Abe ^ Age ^ Ake ^ Ame ^ Ase; - Ci = Abi ^ Agi ^ Aki ^ Ami ^ Asi; - Co = Abo ^ Ago ^ Ako ^ Amo ^ Aso; - Cu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Aba ^= Da; - Bbe = ROL64(Age ^ De, 44); - Bbi = ROL64(Aki ^ Di, 43); - Bbo = ROL64(Amo ^ Do, 21); - Bbu = ROL64(Asu ^ Du, 14); - Eba = Aba ^ _andn_u64(Bbe, Bbi) ^ 0x8000000000008089ULL; - Ebe = Bbe ^ _andn_u64(Bbi, Bbo); - Ebi = Bbi ^ _andn_u64(Bbo, Bbu); - Ebo = Bbo ^ _andn_u64(Bbu, Aba); - Ebu = Bbu ^ _andn_u64(Aba, Bbe); - Bga = ROL64(Abo ^ Do, 28); - Bge = ROL64(Agu ^ Du, 20); - Bgi = ROL64(Aka ^ Da, 3); - Bgo = ROL64(Ame ^ De, 45); - Bgu = ROL64(Asi ^ Di, 61); - Ega = Bga ^ _andn_u64(Bge, Bgi); - Ege = Bge ^ _andn_u64(Bgi, Bgo); - Egi = Bgi ^ _andn_u64(Bgo, Bgu); - Ego = Bgo ^ _andn_u64(Bgu, Bga); - Egu = Bgu ^ _andn_u64(Bga, Bge); - Bka = ROL64(Abe ^ De, 1); - Bke = ROL64(Agi ^ Di, 6); - Bki = ROL64(Ako ^ Do, 25); - Bko = ROL64(Amu ^ Du, 8); - Bku = ROL64(Asa ^ Da, 18); - Eka = Bka ^ _andn_u64(Bke, Bki); - Eke = Bke ^ _andn_u64(Bki, Bko); - Eki = Bki ^ _andn_u64(Bko, Bku); - Eko = Bko ^ _andn_u64(Bku, Bka); - Eku = Bku ^ _andn_u64(Bka, Bke); - Bma = ROL64(Abu ^ Du, 27); - Bme = ROL64(Aga ^ Da, 36); - Bmi = ROL64(Ake ^ De, 10); - Bmo = ROL64(Ami ^ Di, 15); - Bmu = ROL64(Aso ^ Do, 56); - Ema = Bma ^ _andn_u64(Bme, Bmi); - Eme = Bme ^ _andn_u64(Bmi, Bmo); - Emi = Bmi ^ _andn_u64(Bmo, Bmu); - Emo = Bmo ^ _andn_u64(Bmu, Bma); - Emu = Bmu ^ _andn_u64(Bma, Bme); - Bsa = ROL64(Abi ^ Di, 62); - Bse = ROL64(Ago ^ Do, 55); - Bsi = ROL64(Aku ^ Du, 39); - Bso = ROL64(Ama ^ Da, 41); - Bsu = ROL64(Ase ^ De, 2); - Esa = Bsa ^ _andn_u64(Bse, Bsi); - Ese = Bse ^ _andn_u64(Bsi, Bso); - Esi = Bsi ^ _andn_u64(Bso, Bsu); - Eso = Bso ^ _andn_u64(Bsu, Bsa); - Esu = Bsu ^ _andn_u64(Bsa, Bse); - Ca = Eba ^ Ega ^ Eka ^ Ema ^ Esa; - Ce = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; - Ci = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; - Co = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; - Cu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Eba ^= Da; - Bbe = ROL64(Ege ^ De, 44); - Bbi = ROL64(Eki ^ Di, 43); - Bbo = ROL64(Emo ^ Do, 21); - Bbu = ROL64(Esu ^ Du, 14); - Aba = Eba ^ _andn_u64(Bbe, Bbi) ^ 0x8000000000008003ULL; - Abe = Bbe ^ _andn_u64(Bbi, Bbo); - Abi = Bbi ^ _andn_u64(Bbo, Bbu); - Abo = Bbo ^ _andn_u64(Bbu, Eba); - Abu = Bbu ^ 
_andn_u64(Eba, Bbe); - Bga = ROL64(Ebo ^ Do, 28); - Bge = ROL64(Egu ^ Du, 20); - Bgi = ROL64(Eka ^ Da, 3); - Bgo = ROL64(Eme ^ De, 45); - Bgu = ROL64(Esi ^ Di, 61); - Aga = Bga ^ _andn_u64(Bge, Bgi); - Age = Bge ^ _andn_u64(Bgi, Bgo); - Agi = Bgi ^ _andn_u64(Bgo, Bgu); - Ago = Bgo ^ _andn_u64(Bgu, Bga); - Agu = Bgu ^ _andn_u64(Bga, Bge); - Bka = ROL64(Ebe ^ De, 1); - Bke = ROL64(Egi ^ Di, 6); - Bki = ROL64(Eko ^ Do, 25); - Bko = ROL64(Emu ^ Du, 8); - Bku = ROL64(Esa ^ Da, 18); - Aka = Bka ^ _andn_u64(Bke, Bki); - Ake = Bke ^ _andn_u64(Bki, Bko); - Aki = Bki ^ _andn_u64(Bko, Bku); - Ako = Bko ^ _andn_u64(Bku, Bka); - Aku = Bku ^ _andn_u64(Bka, Bke); - Bma = ROL64(Ebu ^ Du, 27); - Bme = ROL64(Ega ^ Da, 36); - Bmi = ROL64(Eke ^ De, 10); - Bmo = ROL64(Emi ^ Di, 15); - Bmu = ROL64(Eso ^ Do, 56); - Ama = Bma ^ _andn_u64(Bme, Bmi); - Ame = Bme ^ _andn_u64(Bmi, Bmo); - Ami = Bmi ^ _andn_u64(Bmo, Bmu); - Amo = Bmo ^ _andn_u64(Bmu, Bma); - Amu = Bmu ^ _andn_u64(Bma, Bme); - Bsa = ROL64(Ebi ^ Di, 62); - Bse = ROL64(Ego ^ Do, 55); - Bsi = ROL64(Eku ^ Du, 39); - Bso = ROL64(Ema ^ Da, 41); - Bsu = ROL64(Ese ^ De, 2); - Asa = Bsa ^ _andn_u64(Bse, Bsi); - Ase = Bse ^ _andn_u64(Bsi, Bso); - Asi = Bsi ^ _andn_u64(Bso, Bsu); - Aso = Bso ^ _andn_u64(Bsu, Bsa); - Asu = Bsu ^ _andn_u64(Bsa, Bse); - Ca = Aba ^ Aga ^ Aka ^ Ama ^ Asa; - Ce = Abe ^ Age ^ Ake ^ Ame ^ Ase; - Ci = Abi ^ Agi ^ Aki ^ Ami ^ Asi; - Co = Abo ^ Ago ^ Ako ^ Amo ^ Aso; - Cu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Aba ^= Da; - Bbe = ROL64(Age ^ De, 44); - Bbi = ROL64(Aki ^ Di, 43); - Bbo = ROL64(Amo ^ Do, 21); - Bbu = ROL64(Asu ^ Du, 14); - Eba = Aba ^ _andn_u64(Bbe, Bbi) ^ 0x8000000000008002ULL; - Ebe = Bbe ^ _andn_u64(Bbi, Bbo); - Ebi = Bbi ^ _andn_u64(Bbo, Bbu); - Ebo = Bbo ^ _andn_u64(Bbu, Aba); - Ebu = Bbu ^ _andn_u64(Aba, Bbe); - Bga = ROL64(Abo ^ Do, 28); - Bge = ROL64(Agu ^ Du, 20); - Bgi = ROL64(Aka ^ Da, 3); - Bgo = ROL64(Ame ^ De, 45); - Bgu = ROL64(Asi ^ Di, 61); - Ega = Bga ^ _andn_u64(Bge, Bgi); - Ege = Bge ^ _andn_u64(Bgi, Bgo); - Egi = Bgi ^ _andn_u64(Bgo, Bgu); - Ego = Bgo ^ _andn_u64(Bgu, Bga); - Egu = Bgu ^ _andn_u64(Bga, Bge); - Bka = ROL64(Abe ^ De, 1); - Bke = ROL64(Agi ^ Di, 6); - Bki = ROL64(Ako ^ Do, 25); - Bko = ROL64(Amu ^ Du, 8); - Bku = ROL64(Asa ^ Da, 18); - Eka = Bka ^ _andn_u64(Bke, Bki); - Eke = Bke ^ _andn_u64(Bki, Bko); - Eki = Bki ^ _andn_u64(Bko, Bku); - Eko = Bko ^ _andn_u64(Bku, Bka); - Eku = Bku ^ _andn_u64(Bka, Bke); - Bma = ROL64(Abu ^ Du, 27); - Bme = ROL64(Aga ^ Da, 36); - Bmi = ROL64(Ake ^ De, 10); - Bmo = ROL64(Ami ^ Di, 15); - Bmu = ROL64(Aso ^ Do, 56); - Ema = Bma ^ _andn_u64(Bme, Bmi); - Eme = Bme ^ _andn_u64(Bmi, Bmo); - Emi = Bmi ^ _andn_u64(Bmo, Bmu); - Emo = Bmo ^ _andn_u64(Bmu, Bma); - Emu = Bmu ^ _andn_u64(Bma, Bme); - Bsa = ROL64(Abi ^ Di, 62); - Bse = ROL64(Ago ^ Do, 55); - Bsi = ROL64(Aku ^ Du, 39); - Bso = ROL64(Ama ^ Da, 41); - Bsu = ROL64(Ase ^ De, 2); - Esa = Bsa ^ _andn_u64(Bse, Bsi); - Ese = Bse ^ _andn_u64(Bsi, Bso); - Esi = Bsi ^ _andn_u64(Bso, Bsu); - Eso = Bso ^ _andn_u64(Bsu, Bsa); - Esu = Bsu ^ _andn_u64(Bsa, Bse); - Ca = Eba ^ Ega ^ Eka ^ Ema ^ Esa; - Ce = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; - Ci = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; - Co = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; - Cu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Eba ^= Da; - Bbe = ROL64(Ege 
^ De, 44); - Bbi = ROL64(Eki ^ Di, 43); - Bbo = ROL64(Emo ^ Do, 21); - Bbu = ROL64(Esu ^ Du, 14); - Aba = Eba ^ _andn_u64(Bbe, Bbi) ^ 0x8000000000000080ULL; - Abe = Bbe ^ _andn_u64(Bbi, Bbo); - Abi = Bbi ^ _andn_u64(Bbo, Bbu); - Abo = Bbo ^ _andn_u64(Bbu, Eba); - Abu = Bbu ^ _andn_u64(Eba, Bbe); - Bga = ROL64(Ebo ^ Do, 28); - Bge = ROL64(Egu ^ Du, 20); - Bgi = ROL64(Eka ^ Da, 3); - Bgo = ROL64(Eme ^ De, 45); - Bgu = ROL64(Esi ^ Di, 61); - Aga = Bga ^ _andn_u64(Bge, Bgi); - Age = Bge ^ _andn_u64(Bgi, Bgo); - Agi = Bgi ^ _andn_u64(Bgo, Bgu); - Ago = Bgo ^ _andn_u64(Bgu, Bga); - Agu = Bgu ^ _andn_u64(Bga, Bge); - Bka = ROL64(Ebe ^ De, 1); - Bke = ROL64(Egi ^ Di, 6); - Bki = ROL64(Eko ^ Do, 25); - Bko = ROL64(Emu ^ Du, 8); - Bku = ROL64(Esa ^ Da, 18); - Aka = Bka ^ _andn_u64(Bke, Bki); - Ake = Bke ^ _andn_u64(Bki, Bko); - Aki = Bki ^ _andn_u64(Bko, Bku); - Ako = Bko ^ _andn_u64(Bku, Bka); - Aku = Bku ^ _andn_u64(Bka, Bke); - Bma = ROL64(Ebu ^ Du, 27); - Bme = ROL64(Ega ^ Da, 36); - Bmi = ROL64(Eke ^ De, 10); - Bmo = ROL64(Emi ^ Di, 15); - Bmu = ROL64(Eso ^ Do, 56); - Ama = Bma ^ _andn_u64(Bme, Bmi); - Ame = Bme ^ _andn_u64(Bmi, Bmo); - Ami = Bmi ^ _andn_u64(Bmo, Bmu); - Amo = Bmo ^ _andn_u64(Bmu, Bma); - Amu = Bmu ^ _andn_u64(Bma, Bme); - Bsa = ROL64(Ebi ^ Di, 62); - Bse = ROL64(Ego ^ Do, 55); - Bsi = ROL64(Eku ^ Du, 39); - Bso = ROL64(Ema ^ Da, 41); - Bsu = ROL64(Ese ^ De, 2); - Asa = Bsa ^ _andn_u64(Bse, Bsi); - Ase = Bse ^ _andn_u64(Bsi, Bso); - Asi = Bsi ^ _andn_u64(Bso, Bsu); - Aso = Bso ^ _andn_u64(Bsu, Bsa); - Asu = Bsu ^ _andn_u64(Bsa, Bse); - Ca = Aba ^ Aga ^ Aka ^ Ama ^ Asa; - Ce = Abe ^ Age ^ Ake ^ Ame ^ Ase; - Ci = Abi ^ Agi ^ Aki ^ Ami ^ Asi; - Co = Abo ^ Ago ^ Ako ^ Amo ^ Aso; - Cu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Aba ^= Da; - Bbe = ROL64(Age ^ De, 44); - Bbi = ROL64(Aki ^ Di, 43); - Bbo = ROL64(Amo ^ Do, 21); - Bbu = ROL64(Asu ^ Du, 14); - Eba = Aba ^ _andn_u64(Bbe, Bbi) ^ 0x000000000000800aULL; - Ebe = Bbe ^ _andn_u64(Bbi, Bbo); - Ebi = Bbi ^ _andn_u64(Bbo, Bbu); - Ebo = Bbo ^ _andn_u64(Bbu, Aba); - Ebu = Bbu ^ _andn_u64(Aba, Bbe); - Bga = ROL64(Abo ^ Do, 28); - Bge = ROL64(Agu ^ Du, 20); - Bgi = ROL64(Aka ^ Da, 3); - Bgo = ROL64(Ame ^ De, 45); - Bgu = ROL64(Asi ^ Di, 61); - Ega = Bga ^ _andn_u64(Bge, Bgi); - Ege = Bge ^ _andn_u64(Bgi, Bgo); - Egi = Bgi ^ _andn_u64(Bgo, Bgu); - Ego = Bgo ^ _andn_u64(Bgu, Bga); - Egu = Bgu ^ _andn_u64(Bga, Bge); - Bka = ROL64(Abe ^ De, 1); - Bke = ROL64(Agi ^ Di, 6); - Bki = ROL64(Ako ^ Do, 25); - Bko = ROL64(Amu ^ Du, 8); - Bku = ROL64(Asa ^ Da, 18); - Eka = Bka ^ _andn_u64(Bke, Bki); - Eke = Bke ^ _andn_u64(Bki, Bko); - Eki = Bki ^ _andn_u64(Bko, Bku); - Eko = Bko ^ _andn_u64(Bku, Bka); - Eku = Bku ^ _andn_u64(Bka, Bke); - Bma = ROL64(Abu ^ Du, 27); - Bme = ROL64(Aga ^ Da, 36); - Bmi = ROL64(Ake ^ De, 10); - Bmo = ROL64(Ami ^ Di, 15); - Bmu = ROL64(Aso ^ Do, 56); - Ema = Bma ^ _andn_u64(Bme, Bmi); - Eme = Bme ^ _andn_u64(Bmi, Bmo); - Emi = Bmi ^ _andn_u64(Bmo, Bmu); - Emo = Bmo ^ _andn_u64(Bmu, Bma); - Emu = Bmu ^ _andn_u64(Bma, Bme); - Bsa = ROL64(Abi ^ Di, 62); - Bse = ROL64(Ago ^ Do, 55); - Bsi = ROL64(Aku ^ Du, 39); - Bso = ROL64(Ama ^ Da, 41); - Bsu = ROL64(Ase ^ De, 2); - Esa = Bsa ^ _andn_u64(Bse, Bsi); - Ese = Bse ^ _andn_u64(Bsi, Bso); - Esi = Bsi ^ _andn_u64(Bso, Bsu); - Eso = Bso ^ _andn_u64(Bsu, Bsa); - Esu = Bsu ^ _andn_u64(Bsa, Bse); - Ca = Eba ^ Ega ^ Eka ^ Ema ^ Esa; - Ce = Ebe ^ Ege ^ Eke ^ Eme ^ 
Ese; - Ci = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; - Co = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; - Cu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Eba ^= Da; - Bbe = ROL64(Ege ^ De, 44); - Bbi = ROL64(Eki ^ Di, 43); - Bbo = ROL64(Emo ^ Do, 21); - Bbu = ROL64(Esu ^ Du, 14); - Aba = Eba ^ _andn_u64(Bbe, Bbi) ^ 0x800000008000000aULL; - Abe = Bbe ^ _andn_u64(Bbi, Bbo); - Abi = Bbi ^ _andn_u64(Bbo, Bbu); - Abo = Bbo ^ _andn_u64(Bbu, Eba); - Abu = Bbu ^ _andn_u64(Eba, Bbe); - Bga = ROL64(Ebo ^ Do, 28); - Bge = ROL64(Egu ^ Du, 20); - Bgi = ROL64(Eka ^ Da, 3); - Bgo = ROL64(Eme ^ De, 45); - Bgu = ROL64(Esi ^ Di, 61); - Aga = Bga ^ _andn_u64(Bge, Bgi); - Age = Bge ^ _andn_u64(Bgi, Bgo); - Agi = Bgi ^ _andn_u64(Bgo, Bgu); - Ago = Bgo ^ _andn_u64(Bgu, Bga); - Agu = Bgu ^ _andn_u64(Bga, Bge); - Bka = ROL64(Ebe ^ De, 1); - Bke = ROL64(Egi ^ Di, 6); - Bki = ROL64(Eko ^ Do, 25); - Bko = ROL64(Emu ^ Du, 8); - Bku = ROL64(Esa ^ Da, 18); - Aka = Bka ^ _andn_u64(Bke, Bki); - Ake = Bke ^ _andn_u64(Bki, Bko); - Aki = Bki ^ _andn_u64(Bko, Bku); - Ako = Bko ^ _andn_u64(Bku, Bka); - Aku = Bku ^ _andn_u64(Bka, Bke); - Bma = ROL64(Ebu ^ Du, 27); - Bme = ROL64(Ega ^ Da, 36); - Bmi = ROL64(Eke ^ De, 10); - Bmo = ROL64(Emi ^ Di, 15); - Bmu = ROL64(Eso ^ Do, 56); - Ama = Bma ^ _andn_u64(Bme, Bmi); - Ame = Bme ^ _andn_u64(Bmi, Bmo); - Ami = Bmi ^ _andn_u64(Bmo, Bmu); - Amo = Bmo ^ _andn_u64(Bmu, Bma); - Amu = Bmu ^ _andn_u64(Bma, Bme); - Bsa = ROL64(Ebi ^ Di, 62); - Bse = ROL64(Ego ^ Do, 55); - Bsi = ROL64(Eku ^ Du, 39); - Bso = ROL64(Ema ^ Da, 41); - Bsu = ROL64(Ese ^ De, 2); - Asa = Bsa ^ _andn_u64(Bse, Bsi); - Ase = Bse ^ _andn_u64(Bsi, Bso); - Asi = Bsi ^ _andn_u64(Bso, Bsu); - Aso = Bso ^ _andn_u64(Bsu, Bsa); - Asu = Bsu ^ _andn_u64(Bsa, Bse); - Ca = Aba ^ Aga ^ Aka ^ Ama ^ Asa; - Ce = Abe ^ Age ^ Ake ^ Ame ^ Ase; - Ci = Abi ^ Agi ^ Aki ^ Ami ^ Asi; - Co = Abo ^ Ago ^ Ako ^ Amo ^ Aso; - Cu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Aba ^= Da; - Bbe = ROL64(Age ^ De, 44); - Bbi = ROL64(Aki ^ Di, 43); - Bbo = ROL64(Amo ^ Do, 21); - Bbu = ROL64(Asu ^ Du, 14); - Eba = Aba ^ _andn_u64(Bbe, Bbi) ^ 0x8000000080008081ULL; - Ebe = Bbe ^ _andn_u64(Bbi, Bbo); - Ebi = Bbi ^ _andn_u64(Bbo, Bbu); - Ebo = Bbo ^ _andn_u64(Bbu, Aba); - Ebu = Bbu ^ _andn_u64(Aba, Bbe); - Bga = ROL64(Abo ^ Do, 28); - Bge = ROL64(Agu ^ Du, 20); - Bgi = ROL64(Aka ^ Da, 3); - Bgo = ROL64(Ame ^ De, 45); - Bgu = ROL64(Asi ^ Di, 61); - Ega = Bga ^ _andn_u64(Bge, Bgi); - Ege = Bge ^ _andn_u64(Bgi, Bgo); - Egi = Bgi ^ _andn_u64(Bgo, Bgu); - Ego = Bgo ^ _andn_u64(Bgu, Bga); - Egu = Bgu ^ _andn_u64(Bga, Bge); - Bka = ROL64(Abe ^ De, 1); - Bke = ROL64(Agi ^ Di, 6); - Bki = ROL64(Ako ^ Do, 25); - Bko = ROL64(Amu ^ Du, 8); - Bku = ROL64(Asa ^ Da, 18); - Eka = Bka ^ _andn_u64(Bke, Bki); - Eke = Bke ^ _andn_u64(Bki, Bko); - Eki = Bki ^ _andn_u64(Bko, Bku); - Eko = Bko ^ _andn_u64(Bku, Bka); - Eku = Bku ^ _andn_u64(Bka, Bke); - Bma = ROL64(Abu ^ Du, 27); - Bme = ROL64(Aga ^ Da, 36); - Bmi = ROL64(Ake ^ De, 10); - Bmo = ROL64(Ami ^ Di, 15); - Bmu = ROL64(Aso ^ Do, 56); - Ema = Bma ^ _andn_u64(Bme, Bmi); - Eme = Bme ^ _andn_u64(Bmi, Bmo); - Emi = Bmi ^ _andn_u64(Bmo, Bmu); - Emo = Bmo ^ _andn_u64(Bmu, Bma); - Emu = Bmu ^ _andn_u64(Bma, Bme); - Bsa = ROL64(Abi ^ Di, 62); - Bse = ROL64(Ago ^ Do, 55); - Bsi = ROL64(Aku ^ Du, 39); - Bso = ROL64(Ama ^ Da, 
41); - Bsu = ROL64(Ase ^ De, 2); - Esa = Bsa ^ _andn_u64(Bse, Bsi); - Ese = Bse ^ _andn_u64(Bsi, Bso); - Esi = Bsi ^ _andn_u64(Bso, Bsu); - Eso = Bso ^ _andn_u64(Bsu, Bsa); - Esu = Bsu ^ _andn_u64(Bsa, Bse); - Ca = Eba ^ Ega ^ Eka ^ Ema ^ Esa; - Ce = Ebe ^ Ege ^ Eke ^ Eme ^ Ese; - Ci = Ebi ^ Egi ^ Eki ^ Emi ^ Esi; - Co = Ebo ^ Ego ^ Eko ^ Emo ^ Eso; - Cu = Ebu ^ Egu ^ Eku ^ Emu ^ Esu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Eba ^= Da; - Bbe = ROL64(Ege ^ De, 44); - Bbi = ROL64(Eki ^ Di, 43); - Bbo = ROL64(Emo ^ Do, 21); - Bbu = ROL64(Esu ^ Du, 14); - Aba = Eba ^ _andn_u64(Bbe, Bbi) ^ 0x8000000000008080ULL; - Abe = Bbe ^ _andn_u64(Bbi, Bbo); - Abi = Bbi ^ _andn_u64(Bbo, Bbu); - Abo = Bbo ^ _andn_u64(Bbu, Eba); - Abu = Bbu ^ _andn_u64(Eba, Bbe); - Bga = ROL64(Ebo ^ Do, 28); - Bge = ROL64(Egu ^ Du, 20); - Bgi = ROL64(Eka ^ Da, 3); - Bgo = ROL64(Eme ^ De, 45); - Bgu = ROL64(Esi ^ Di, 61); - Aga = Bga ^ _andn_u64(Bge, Bgi); - Age = Bge ^ _andn_u64(Bgi, Bgo); - Agi = Bgi ^ _andn_u64(Bgo, Bgu); - Ago = Bgo ^ _andn_u64(Bgu, Bga); - Agu = Bgu ^ _andn_u64(Bga, Bge); - Bka = ROL64(Ebe ^ De, 1); - Bke = ROL64(Egi ^ Di, 6); - Bki = ROL64(Eko ^ Do, 25); - Bko = ROL64(Emu ^ Du, 8); - Bku = ROL64(Esa ^ Da, 18); - Aka = Bka ^ _andn_u64(Bke, Bki); - Ake = Bke ^ _andn_u64(Bki, Bko); - Aki = Bki ^ _andn_u64(Bko, Bku); - Ako = Bko ^ _andn_u64(Bku, Bka); - Aku = Bku ^ _andn_u64(Bka, Bke); - Bma = ROL64(Ebu ^ Du, 27); - Bme = ROL64(Ega ^ Da, 36); - Bmi = ROL64(Eke ^ De, 10); - Bmo = ROL64(Emi ^ Di, 15); - Bmu = ROL64(Eso ^ Do, 56); - Ama = Bma ^ _andn_u64(Bme, Bmi); - Ame = Bme ^ _andn_u64(Bmi, Bmo); - Ami = Bmi ^ _andn_u64(Bmo, Bmu); - Amo = Bmo ^ _andn_u64(Bmu, Bma); - Amu = Bmu ^ _andn_u64(Bma, Bme); - Bsa = ROL64(Ebi ^ Di, 62); - Bse = ROL64(Ego ^ Do, 55); - Bsi = ROL64(Eku ^ Du, 39); - Bso = ROL64(Ema ^ Da, 41); - Bsu = ROL64(Ese ^ De, 2); - Asa = Bsa ^ _andn_u64(Bse, Bsi); - Ase = Bse ^ _andn_u64(Bsi, Bso); - Asi = Bsi ^ _andn_u64(Bso, Bsu); - Aso = Bso ^ _andn_u64(Bsu, Bsa); - Asu = Bsu ^ _andn_u64(Bsa, Bse); - Ca = Aba ^ Aga ^ Aka ^ Ama ^ Asa; - Ce = Abe ^ Age ^ Ake ^ Ame ^ Ase; - Ci = Abi ^ Agi ^ Aki ^ Ami ^ Asi; - Co = Abo ^ Ago ^ Ako ^ Amo ^ Aso; - Cu = Abu ^ Agu ^ Aku ^ Amu ^ Asu; - - Da = Cu ^ ROL64(Ce, 1); - De = Ca ^ ROL64(Ci, 1); - Di = Ce ^ ROL64(Co, 1); - Do = Ci ^ ROL64(Cu, 1); - Du = Co ^ ROL64(Ca, 1); - Bba = Aba ^ Da; - Bbe = ROL64(Age ^ De, 44); - Bbi = ROL64(Aki ^ Di, 43); - Bbo = ROL64(Amo ^ Do, 21); - Bbu = ROL64(Asu ^ Du, 14); - Bga = ROL64(Abo ^ Do, 28); - Bge = ROL64(Agu ^ Du, 20); - Bgi = ROL64(Aka ^ Da, 3); - Bgo = ROL64(Ame ^ De, 45); - Bgu = ROL64(Asi ^ Di, 61); - Bka = ROL64(Abe ^ De, 1); - Bke = ROL64(Agi ^ Di, 6); - Bki = ROL64(Ako ^ Do, 25); - Bko = ROL64(Amu ^ Du, 8); - Bku = ROL64(Asa ^ Da, 18); - Bma = ROL64(Abu ^ Du, 27); - Bme = ROL64(Aga ^ Da, 36); - Bmi = ROL64(Ake ^ De, 10); - Bmo = ROL64(Ami ^ Di, 15); - Bmu = ROL64(Aso ^ Do, 56); - Bsa = ROL64(Abi ^ Di, 62); - Bse = ROL64(Ago ^ Do, 55); - Bsi = ROL64(Aku ^ Du, 39); - Bso = ROL64(Ama ^ Da, 41); - Bsu = ROL64(Ase ^ De, 2); - Eba = Bba ^ _andn_u64(Bbe, Bbi) ^ 0x0000000080000001ULL; - Ege = Bge ^ _andn_u64(Bgi, Bgo); - Eki = Bki ^ _andn_u64(Bko, Bku); - Emo = Bmo ^ _andn_u64(Bmu, Bma); - Esu = Bsu ^ _andn_u64(Bsa, Bse); - Ca = Eba ^ Bga ^ Bka ^ Bma ^ Bsa ^ _andn_u64(Bge, Bgi) ^ _andn_u64(Bke, Bki) ^ _andn_u64(Bme, Bmi) ^ _andn_u64(Bse, Bsi); - Ce = Bbe ^ Ege ^ Bke ^ Bme ^ Bse ^ _andn_u64(Bbi, Bbo) ^ _andn_u64(Bki, Bko) ^ 
_andn_u64(Bmi, Bmo) ^ _andn_u64(Bsi, Bso); - Ci = Bbi ^ Bgi ^ Eki ^ Bmi ^ Bsi ^ _andn_u64(Bbo, Bbu) ^ _andn_u64(Bgo, Bgu) ^ _andn_u64(Bmo, Bmu) ^ _andn_u64(Bso, Bsu); - Co = Bbo ^ Bgo ^ Bko ^ Emo ^ Bso ^ _andn_u64(Bbu, Bba) ^ _andn_u64(Bgu, Bga) ^ _andn_u64(Bku, Bka) ^ _andn_u64(Bsu, Bsa); - Cu = Bbu ^ Bgu ^ Bku ^ Bmu ^ Esu ^ _andn_u64(Bba, Bbe) ^ _andn_u64(Bga, Bge) ^ _andn_u64(Bka, Bke) ^ _andn_u64(Bma, Bme); - - Bba = Eba ^ Cu ^ ROL64(Ce, 1); - Bbe = ROL64(Ege ^ Ca ^ ROL64(Ci, 1), 44); - Bbi = ROL64(Eki ^ Ce ^ ROL64(Co, 1), 43); - Bbo = ROL64(Emo ^ Ci ^ ROL64(Cu, 1), 21); - Bbu = ROL64(Esu ^ Co ^ ROL64(Ca, 1), 14); - ((unsigned long long*)output)[0] = Bba ^ _andn_u64(Bbe, Bbi) ^ 0x8000000080008008ULL; - ((unsigned long long*)output)[1] = Bbe ^ _andn_u64(Bbi, Bbo); - ((unsigned long long*)output)[2] = Bbi ^ _andn_u64(Bbo, Bbu); - ((unsigned long long*)output)[3] = Bbo ^ _andn_u64(Bbu, Bba); - } - - void _random(unsigned char* publicKey, unsigned char* nonce, unsigned char* output, unsigned int outputSize) - { - unsigned char state[200]; - *((__m256i*) & state[0]) = *((__m256i*)publicKey); - *((__m256i*) & state[32]) = *((__m256i*)nonce); - ZeroMemory(&state[64], sizeof(state) - 64); - - for (unsigned int i = 0; i < outputSize / sizeof(state); i++) - { - KeccakP1600_Permute_12rounds(state); - CopyMemory(output, state, sizeof(state)); - output += sizeof(state); - } - if (outputSize % sizeof(state)) - { - KeccakP1600_Permute_12rounds(state); - CopyMemory(output, state, outputSize % sizeof(state)); - } - } - - #define CURVE_ORDER_0 0x2FB2540EC7768CE7 - #define CURVE_ORDER_1 0xDFBD004DFE0F7999 - #define CURVE_ORDER_2 0xF05397829CBC14E5 - #define CURVE_ORDER_3 0x0029CBC14E5E0A72 - #define MONTGOMERY_SMALL_R_PRIME_0 0xE12FE5F079BC3929 - #define MONTGOMERY_SMALL_R_PRIME_1 0xD75E78B8D1FCDCF3 - #define MONTGOMERY_SMALL_R_PRIME_2 0xBCE409ED76B5DB21 - #define MONTGOMERY_SMALL_R_PRIME_3 0xF32702FDAFC1C074 - - #define B11 0xF6F900D81F5F5E6A - #define B12 0x1363E862C22A2DA0 - #define B13 0xF8BD9FCE1337FCF1 - #define B14 0x084F739986B9E651 - #define B21 0xE2B6A4157B033D2C - #define B22 0x0000000000000001 - #define B23 0xFFFFFFFFFFFFFFFF - #define B24 0xDA243A43722E9830 - #define B31 0xE85452E2DCE0FCFE - #define B32 0xFD3BDEE51C7725AF - #define B33 0x2E4D21C98927C49F - #define B34 0xF56190BB3FD13269 - #define B41 0xEC91CBF56EF737C1 - #define B42 0xCEDD20D23C1F00CE - #define B43 0x068A49F02AA8A9B5 - #define B44 0x18D5087896DE0AEA - #define C1 0x72482C5251A4559C - #define C2 0x59F95B0ADD276F6C - #define C3 0x7DD2D17C4625FA78 - #define C4 0x6BC57DEF56CE8877 - - typedef unsigned long long felm_t[2]; // Datatype for representing 128-bit field elements - typedef felm_t f2elm_t[2]; // Datatype for representing quadratic extension field elements - - typedef struct - { // Point representation in affine coordinates - f2elm_t x; - f2elm_t y; - } point_affine; - typedef point_affine point_t[1]; - - typedef struct - { // Point representation in extended coordinates - f2elm_t x; - f2elm_t y; - f2elm_t z; - f2elm_t ta; - f2elm_t tb; - } point_extproj; - typedef point_extproj point_extproj_t[1]; - - typedef struct - { // Point representation in extended coordinates (for precomputed points) - f2elm_t xy; - f2elm_t yx; - f2elm_t z2; - f2elm_t t2; - } point_extproj_precomp; - typedef point_extproj_precomp point_extproj_precomp_t[1]; - - typedef struct - { // Point representation in extended affine coordinates (for precomputed points) - f2elm_t xy; - f2elm_t yx; - f2elm_t t2; - } point_precomp; - typedef point_precomp 
point_precomp_t[1]; - - const unsigned long long PARAMETER_d[4] = { 0x0000000000000142, 0x00000000000000E4, 0xB3821488F1FC0C8D, 0x5E472F846657E0FC }; - const unsigned long long curve_order[4] = { CURVE_ORDER_0, CURVE_ORDER_1, CURVE_ORDER_2, CURVE_ORDER_3 }; - const unsigned long long Montgomery_Rprime[4] = { 0xC81DB8795FF3D621, 0x173EA5AAEA6B387D, 0x3D01B7C72136F61C, 0x0006A5F16AC8F9D3 }; - const unsigned long long ONE[4] = { 1, 0, 0, 0 }; - - // Fixed GF(p^2) constants for the endomorphisms - unsigned long long ctau1[4] = { 0x74DCD57CEBCE74C3, 0x1964DE2C3AFAD20C, 0x12, 0x0C }; - unsigned long long ctaudual1[4] = { 0x9ECAA6D9DECDF034, 0x4AA740EB23058652, 0x11, 0x7FFFFFFFFFFFFFF4 }; - unsigned long long cphi0[4] = { 0xFFFFFFFFFFFFFFF7, 0x05, 0x4F65536CEF66F81A, 0x2553A0759182C329 }; - unsigned long long cphi1[4] = { 0x07, 0x05, 0x334D90E9E28296F9, 0x62C8CAA0C50C62CF }; - unsigned long long cphi2[4] = { 0x15, 0x0F, 0x2C2CB7154F1DF391, 0x78DF262B6C9B5C98 }; - unsigned long long cphi3[4] = { 0x03, 0x02, 0x92440457A7962EA4, 0x5084C6491D76342A }; - unsigned long long cphi4[4] = { 0x03, 0x03, 0xA1098C923AEC6855, 0x12440457A7962EA4 }; - unsigned long long cphi5[4] = { 0x0F, 0x0A, 0x669B21D3C5052DF3, 0x459195418A18C59E }; - unsigned long long cphi6[4] = { 0x18, 0x12, 0xCD3643A78A0A5BE7, 0x0B232A8314318B3C }; - unsigned long long cphi7[4] = { 0x23, 0x18, 0x66C183035F48781A, 0x3963BC1C99E2EA1A }; - unsigned long long cphi8[4] = { 0xF0, 0xAA, 0x44E251582B5D0EF0, 0x1F529F860316CBE5 }; - unsigned long long cphi9[4] = { 0xBEF, 0x870, 0x14D3E48976E2505, 0xFD52E9CFE00375B }; - unsigned long long cpsi1[4] = { 0xEDF07F4767E346EF, 0x2AF99E9A83D54A02, 0x13A, 0xDE }; - unsigned long long cpsi2[4] = { 0x143, 0xE4, 0x4C7DEB770E03F372, 0x21B8D07B99A81F03 }; - unsigned long long cpsi3[4] = { 0x09, 0x06, 0x3A6E6ABE75E73A61, 0x4CB26F161D7D6906 }; - unsigned long long cpsi4[4] = { 0xFFFFFFFFFFFFFFF6, 0x7FFFFFFFFFFFFFF9, 0xC59195418A18C59E, 0x334D90E9E28296F9 }; - - // Precomputed integers for fast-Babai rounding - unsigned long long ell1[4] = { 0x259686E09D1A7D4F, 0xF75682ACE6A6BD66, 0xFC5BB5C5EA2BE5DF, 0x07 }; - unsigned long long ell2[4] = { 0xD1BA1D84DD627AFB, 0x2BD235580F468D8D, 0x8FD4B04CAA6C0F8A, 0x03 }; - unsigned long long ell3[4] = { 0x9B291A33678C203C, 0xC42BD6C965DCA902, 0xD038BF8D0BFFBAF6, 0x00 }; - unsigned long long ell4[4] = { 0x12E5666B77E7FDC0, 0x81CBDC3714983D82, 0x1B073877A22D8410, 0x03 }; - - // The table below was generated using window width W = 5 and table parameter V = 5 (see http://eprint.iacr.org/2013/158). - // Number of point entries = 5 * 2^4 = 80 points, where each point (x,y) is represented using coordinates (x+y,y-x,2*d*t). 
- // Table size = 80 * 3 * 256 = 7.5KB - const unsigned long long FIXED_BASE_TABLE[960] = { - 0xe18a34f3a703e631, 0x287460bf1d502b5f, 0xe02e62f7e4f90353, 0x0c3ba0378b86acde, 0x90bf0f98b0937edc, 0x740b7c7824f0c555, 0xb321239123a01366, 0x4ffcf5b93a9557a5, 0x297afccbabda42bb, 0x5948d137556c97c6, 0xa8189a393330684c, 0x0caf2b720a341f27 - , 0x3a8ba018fd188787, 0x5546128188dd12a8, 0xb0b3cc33c09f9b77, 0x1baeeaf8b84d2049, 0x006425a611faf900, 0x18f7cd12e1a6f789, 0x6dccf09a12556066, 0x448e05eeace7b6eb, 0xbf2f33689d2829b0, 0x6d911dcb2957bdb4, 0x9f2353dbdc3c03ee, 0x06c54305babee501 - , 0x2eaf45713dafa125, 0x72963058648a364d, 0x61b7771f9d313ef2, 0x4f41c7f8bfe2b069, 0x408623ae599790ac, 0x4d33858644330a42, 0xfc5696649cdd7487, 0x74df72e0e598e114, 0xc9a06325913c110b, 0x076bd4115fe4b0d8, 0x76619e65d6bff3d9, 0x249240147cee3a08 - , 0xd695b96148965a73, 0x28aac8a28829f706, 0x41f1c05329f7a57b, 0x441ca9e89f03e00e, 0xe1aa38ab8bf7241e, 0x58f28cafc832b7f4, 0xcadaf8b8fa5400c6, 0x34b6d106284e863e, 0xf5498cab3af15097, 0x6dbe7790017d9c49, 0x63bf76a81448e8bc, 0x6371925bf23ae006 - , 0xc5e2c721bded81fa, 0x4ede70eed68056ab, 0x8f3cd9b5b4975810, 0x4752fd192f0a9aa8, 0x318794eb1f734414, 0x11ddf7d2c8468662, 0x2613b06f72b1a34e, 0x465575b37ab06770, 0x40b9845f82638d2b, 0x48894050790298ce, 0xbedb93a501b4f131, 0x04f3560d2889b2fb - , 0x457dd875115b278b, 0x56f25ee54d92858a, 0x92d4c1cdce0c977e, 0x078fca4187d74996, 0x3bbb2ded76cc22a1, 0x117b28853ddc2bf6, 0x43f3767cb9c2baa2, 0x73079e25e0ea8a8f, 0x0177992b5a15796d, 0x2e77721480d9ef92, 0xbe09883567372916, 0x258f176b7af7576d - , 0x308338fd6168391b, 0x7285925f9a7353a4, 0x862c0fd04fe85114, 0x53259ee7423aeb51, 0xfe0031a84b3b1a68, 0x1a4f1d661fa071fc, 0x2ddd54168dc928a7, 0x60185c1adf196a6a, 0x49809717dc6da9b4, 0x6062094b4dcffc03, 0xa41ea6fa05fa7e8d, 0x4a4fe06f277148a0 - , 0x7bb253a9ee9e80f0, 0x419a928bccb11733, 0x84323be66a9a039e, 0x01b2d1ae972814bb, 0xa7588584d3051231, 0x54df1e20cc979dd7, 0x91d906fe3e2f22dd, 0x4e36e9975fdf1a0f, 0xd81871746b747634, 0x3e5e31baeee13433, 0xe4da80979573baa3, 0x4b852ad97cfe77c6 - , 0xe08b346714418b9e, 0x283d719b2fe6ef88, 0xb7339d2de45c180b, 0x75acfcef11d2d5c8, 0x8f40777a8c561876, 0x0c54ac40a7134c4b, 0xb92e287d66baee08, 0x6f357e5006a188bf, 0xc5903319ed1e6971, 0x747c45ef91dafd40, 0xde4086a91d2f816e, 0x5dcb27edb3b3ef7d - , 0x43fdc46cfa1dd2ee, 0x51551f9f70966498, 0xb54534f761ed9bdc, 0x453455b3073fb07f, 0xf24773e383cab70b, 0x679be25e758cf4df, 0xda17edf2943eee29, 0x3dc9e5b8d6dc0f66, 0x56a50cba413fb75b, 0x1e65315bc5a8537f, 0x5ff90242802c7213, 0x73c9d8c8f425252e - , 0x3c637b8633198c8f, 0x534f84b3ed414f33, 0xad313e72dedd6902, 0x5ed57e941cdf33af, 0x5a6fe01d2a57306e, 0x73b63dea344713f9, 0x39cb70570f1c2bf3, 0x2df8c6e49f1a18db, 0x661bc349677797e4, 0x501ae7cbbebe9062, 0x5b52a88de8959643, 0x0372752811c01d51 - , 0x010c57a2301bb928, 0x378b317155554fc6, 0xf883fa4229a02cf1, 0x5f0047b850d7db29, 0x4d247ae328402daa, 0x0d030627a850a2bc, 0xb4e65d9a88a443f5, 0x6ec9686b2d6db089, 0xde202e08fea1d987, 0x5c64e1d3f28d7600, 0x157d17bef661bfb7, 0x56392d36dd75334c - , 0xe25478d8bd19155c, 0x146d4f2d3d336afd, 0x9bfbe00bf94e15e8, 0x2b185a9a6adf10c0, 0x926527b3ed52ab7b, 0x67997e1473101e80, 0xb58f4ff4947cc541, 0x36f800c7fac99a7a, 0xd0302e32400456d9, 0x4372e43640bc697b, 0x9144cabb4750d898, 0x75d25afac9a23cbf - , 0x794591767655cbfe, 0x74db216617fc4b07, 0x7057b2242566d0c9, 0x1d543b5908417b23, 0x19c280b444428783, 0x352309fd8b6cc3ef, 0x37833d6ac068ae72, 0x4ec0671a23c019f4, 0x9d9836e1a3d05bb5, 0x44fe1adff224efe3, 0xa296bc3ce57efb4a, 0x2efec86835a14150 - , 0x2fe19c09fb194bca, 0x18cc07d3953cd206, 
0x5bdff217c9c0b9e0, 0x671aa756581abcee, 0xe1cc33ae28f7d1a2, 0x1b6f254937a0a3fe, 0x51503d1665babb83, 0x74b95636d5889211, 0xbdb97ae4ea96f869, 0x1507ce189e2510bd, 0x796e4d54fab93b13, 0x6a81765f05960929 - , 0x2e940521e5a833ed, 0x3bdea532b245f644, 0xbea76975ffd52693, 0x64b94848ba6d4ed6, 0x9db52d0194e33ec7, 0x71cf65da55639f25, 0xede73b1fdb5a8138, 0x12e4d13b6c62dc22, 0x9d19b0c265185517, 0x77a011d257b5fdd0, 0x1fedc5caaecd84e4, 0x46844e151e3492d1 - , 0x7a423a31904220df, 0x5b3165c747e8f099, 0x1c665eeadf35e22e, 0x7802b556fc45595b, 0x85a2def4015bd2de, 0x17f2ab87957166ad, 0x19cf6d352060c1e5, 0x122a7ad1be408e6a, 0x5b79bbc8645bf766, 0x20fb009d4d0adacf, 0x97526a272ba28538, 0x7041b4e90d420bde - , 0x3b30113358dab057, 0x3d398b66f0d24243, 0x91a5999a03cd4708, 0x1eae2409cd938096, 0x66dd6b604c36108c, 0x1713083789081968, 0x57cad6917125dcfd, 0x34b06cb89704f1ca, 0xdcafe8e71f35abf2, 0x698331198d544db9, 0x6287676643af075b, 0x200950e5559d2b6d - , 0xd4f63fc3ecdd9074, 0x7473317142ac13a2, 0x96b0030805319356, 0x2c20ffe0244378ba, 0x4889511ad26ac01a, 0x4ee327219997fcf6, 0x15ffe6e70f0bf8ea, 0x6b617fb4a6d0a6d7, 0x4916dca1c52f7324, 0x3c8269f086468277, 0xc24210c4c837e04b, 0x4e480b4f915a542c - , 0xc5fef3b09a7fe35e, 0x31a501de44fd84b2, 0x79f29e4940a407b9, 0x0ba7e03ca5cce5ab, 0xa7a8b2058a74d8ea, 0x46f4c7810e26dadc, 0x46171ace94a1128a, 0x44db55025495a811, 0x7f889e1a4bf18d5c, 0x4d4f172a43f306b2, 0x33a99766bb1cffad, 0x6254775924d39aca - , 0xd855230ec225136e, 0x1c544dd078d9211d, 0x12fe9969f63f63ba, 0x069af1dc949dd382, 0x305bcf40cfe5c256, 0x63ae90924bbbb595, 0xe451097793b7de06, 0x09780cf39fc0043e, 0x827af8e7eb798871, 0x3ace8a6c77577a37, 0x79df061332e055ba, 0x561dc07aaacea92b - , 0x7e4422d9820d2673, 0x6b85df83e0af5348, 0x1f151ac1ded8526b, 0x35ead8e5157142bd, 0x6da6ef6c33c79dd4, 0x5f2ea04d2594fde4, 0x91037d0cc027d5fa, 0x53b5401007b0331b, 0x810f198a3d4ba5a3, 0x4463bd259ba94195, 0x32b894acec2acf9e, 0x78711761d64349ce - , 0x253ae1b3f51fe211, 0x409e4b3f535b6463, 0x3a236d10da5e49de, 0x19d2b1029c21336a, 0x2835f40436aadd90, 0x0942a31505190b19, 0xc189131876828279, 0x3afe96c3ca8e1f9c, 0x9f1801b491230693, 0x39e28db8625fd091, 0x9fab50355dd44c8e, 0x145155da729b280d - , 0xd3ccf8101d4d76d5, 0x5a0faa1a8c2b6c68, 0x3cc66c84cb54ea8a, 0x51052ce3f566c773, 0x3bee14de65ae9ff5, 0x7586118a01ccf024, 0x089e791c896bf15e, 0x35ff022d261d93d6, 0xcd3ce13d8f7d1cf9, 0x4f1de98f95b7b8f6, 0x51e68a2462dc41b4, 0x61ad9e3c23f6dd29 - , 0x584fea6480ebdb51, 0x5d52fe073f9decf3, 0x9afe483eadf336d5, 0x1dfa03c980b1696a, 0x55f73d47ff819a19, 0x697bf55d361100ed, 0xded4804446399419, 0x618c94467fce259f, 0xf2597ff1f08ef50c, 0x07c935b98dd933c0, 0xbb758cbc78ded5f6, 0x1e9a0d06af13148f - , 0x879ce1457f4cd4db, 0x28396ca1962d4994, 0xf5095a3dc57605c3, 0x1e570f3da4c527b1, 0x2af69a3904935787, 0x591ee376fdd01cce, 0xf77b58df88bc8633, 0x5464d651b2f395d1, 0xafbc096b1e9a86ae, 0x6ce2df4bf65b6b28, 0x3b3a828d2e9d3e08, 0x6382011d8d2d66d0 - , 0x94987ca64d3d193d, 0x50ddf70d3b6d56af, 0x8d5df67cc8ad15a9, 0x39208098bc5b1f92, 0xce99f520dfd5a4fb, 0x323bbc87b86a7ba9, 0xe13f88a8d803c789, 0x56ffdcbdf2200055, 0x3aff0da31b24c72d, 0x70011566460c0c16, 0x76f7b7f53ac46a13, 0x1c069bfeb7077bc2 - , 0x8f47193ca14a3c36, 0x6d73e34af088de3d, 0x634b2bd9317d6634, 0x5b404738b77f1ec8, 0xf34fabb71ca1cb1d, 0x054abbcaca546a46, 0xe8cdcadd08eda660, 0x6971abbf958bdef1, 0x41338557dddb4eaf, 0x1e158585b079b67c, 0xd2270474cfa26068, 0x53b36d32b3cea469 - , 0x011523c16c543d08, 0x4668e92c5f73314e, 0xbaef3ebe4117acd1, 0x04037d1aa713931a, 0x68e118e4e390c68d, 0x6b80cd55a44c1575, 0x7307ea8a5729c032, 0x5cc5475feee99ab2, 0x34450e424c14ac75, 
0x3f09157e5db3dcd8, 0x62ce2b1b50588052, 0x27a899c54e652f8f - , 0x0acd039f2fc2a5ed, 0x4b4044ddd5813eec, 0xc04d189e90a75958, 0x242551bce71d33a1, 0xd95af96b51f87f05, 0x02988820f809d815, 0xb27f65f73b9483c5, 0x2ef60745f4364b43, 0xcb66bdc93f4fb8b9, 0x2b86c9b48756bb8a, 0xf8ebdae09b9867a1, 0x441e70184e6fe9aa - , 0xfdc2530330cc1289, 0x47d8d65a8b4d6992, 0x8c03b6fa30ae74be, 0x1ca8693cc3bd99d5, 0x699eb1511018f2a6, 0x3da04764d9f4fff5, 0x361720433d3aab59, 0x2fa911612cb857ff, 0xa4057da10c2f1cac, 0x48a219b933a5c619, 0x42341020d15f0bc5, 0x73f8895046a09dad - , 0x1bad5312c67421b8, 0x4194771b368e622e, 0x8cc71a79e44e0dff, 0x4b4564e45467f1c2, 0x7759f16aafe52093, 0x391b71dcd75fbea9, 0x2a1c0694ab4ef798, 0x023087545444130d, 0x4b7ae1ffcfaa1aa1, 0x64e26f32d73361e7, 0x8da47038bd0b54b9, 0x148cfa6feaecee15 - , 0x3756d4d479c2cc3d, 0x25d44ea8d31543de, 0xd82c8bef26bb2c43, 0x2c2047033d27f37f, 0x5bd33d9837dad260, 0x77943117a3383b7d, 0x12071d697ea583f2, 0x3c7c41272a225bf2, 0x92ebbdfaf1f03ad3, 0x5d61030c68b63704, 0xca6e2853baee75d1, 0x12404b34771a3636 - , 0xbe13c46326667e4f, 0x2bd261916f9be3b0, 0x86e3f8cbadc80f89, 0x74520d8a1794cb48, 0x1e15c745024cf97e, 0x5cee741e1e53eb02, 0x8d088de0af99cda1, 0x625812961cc0862c, 0x4313437321c0e934, 0x60bbc768c424f7a4, 0xaba71fbf3c10e143, 0x37b8ea9f14a915b8 - , 0x8d96ec65c40213ff, 0x74a08828ff77845c, 0xbedb7194daf607a3, 0x17e86671161c8706, 0xaceb98e0524059cf, 0x68552ac494916f09, 0x4cd2971baf1b3c47, 0x68442ebcdde21b70, 0x19629b8c0e867595, 0x6a6955d3635fa47a, 0x6fab45e0f2e393ad, 0x66dd3ef4fcf050c4 - , 0xbb0b7abcfddc7df1, 0x14eb5b751b0bcf9c, 0x1cf79f9ca2fd411d, 0x5c496f73fff0600a, 0x49648d8555426d70, 0x46c1016a2322d8a9, 0xb57fdb870d9b6d4f, 0x609eb65209ddb633, 0xe70f9166bedc82c5, 0x772fb5b5c8afaf27, 0x79a294d9b0227a20, 0x7f75b141112dbc8d - , 0x98d1c7f88e070020, 0x5953d0aac48217b1, 0xe28253ebe15f33ff, 0x267d1dc11e614c45, 0xbe64f50ab99e2246, 0x4eaaab5c82fe5495, 0x927d5ac07e60bed0, 0x67d3786de6aa1b4d, 0xa71962bf0f6e2945, 0x63d93844a35eea9b, 0xb34228c7d26640ac, 0x169c38d2eb28f5a1 - , 0x4b7972b33439dc22, 0x71478457cdaa1e14, 0x5226e125ec1d58c7, 0x669d8796e78fd4f1, 0x750dd1aaaa44a07f, 0x327c62b55aebbecf, 0x006b8e95b54fbd25, 0x2ab3f95d01eb364e, 0xfcbe5080c0d5e196, 0x2a1b9bd75a57e725, 0x1d2b2b6758139b5d, 0x751cf4af849b7a73 - , 0x164a7d2e337d00a5, 0x00cee3a4cb83a4bc, 0x3498e0366dbe28f9, 0x053d899148d28502, 0x01665d64cab0fb69, 0x4a99132208d68e74, 0xba44bbd4bd3f915d, 0x1d34b0f9172122bb, 0x5d114dc729e8a9f3, 0x08e7a43dd5334b60, 0x28db8e9232f0f3e8, 0x5cb7be1b80264f62 - , 0x9af2c78782508f23, 0x336ae7ccf7e3a1b2, 0x7fe2d4ee2dd194be, 0x573d2e1b2b8a6872, 0x3332ea3363b2ea36, 0x200bc1375b1f4243, 0x65c47c8c06b3260d, 0x42021fca53995c5e, 0x2f7e6cf49bb19946, 0x311fba6a23196d2c, 0xc30c13b62be0d70d, 0x61eeac142711b0dc - , 0x88526996597d35d4, 0x70169bcbe6bd21d7, 0xa0f1b2d0ad29a510, 0x2ade531472c1b94d, 0x11e320dc189873e7, 0x2d2a1794e85cdb38, 0xa0a8c453a6f621e3, 0x4b06d5b54525f6f7, 0xf42916691848ec1c, 0x1d4216555d578730, 0xf8c60da7290a5b4e, 0x66dd9f39a1f3565f - , 0x55ac29d937b474a0, 0x4291967a4a369ee4, 0x918dacaa12e6bc89, 0x3d46e8900651c310, 0xaf055430a00e90b1, 0x16f62bf56da5ca39, 0x1a021c33488c51e6, 0x0d64dadf63fbbcd5, 0x0918ece59dbfea7c, 0x3b3319d7dd74203a, 0x1d88545b8b9fa90c, 0x13b792dc908c59e6 - , 0x0a2d939a9c3d0979, 0x321a5dbeb74bf127, 0x5e5947fff66d8470, 0x22ec9ecafd26bc99, 0xde17ca8293b10536, 0x593f56c0559dd846, 0x1148373375485023, 0x23c6b0fdf7448b1c, 0x377904458a27804f, 0x573e91962726ea70, 0x35e1b24f3235ac70, 0x51ba082049f4f85e - , 0x4bc4918160d47194, 0x5d29a21e3308e1dd, 0x7e15894b3e6e4e33, 0x50dbbd2f4f31d0fb, 
0xef248bd235a9c9de, 0x3418add21b634710, 0x96c7233a52363bd2, 0x7c8414ad9a08c99f, 0xbc6acb4a54e6c05c, 0x5729021a1193579a, 0x0627c3e00b08fa1c, 0x3d0b4ff9e17c2a73 - , 0xd507e8755990317f, 0x75b27bb3bc7bfe48, 0x44a80f2c6ce651f5, 0x7b9795fc1b706e46, 0x9de75bdefdf9a640, 0x75ade50ababffaa8, 0xce0ab116870889a0, 0x6f3ddcfcdd59ec6c, 0x6e36833588de0674, 0x291d1129ea28a073, 0xf8b8e53864884d61, 0x706ef8f1ae854d76 - , 0x137a8c6583753069, 0x01e45f1cc620f966, 0xe28e1ff82f76c7ba, 0x36d29eace3e89c54, 0x83379f157f0b49cb, 0x65e9c39e2bacb937, 0x9b323c45070cda3e, 0x16e02f31ab7e2de5, 0x53bcf346635122b7, 0x1fd7e207d6c2de09, 0x3a5f5f94ea1e57ac, 0x0cba06e8d0f0b4df - , 0x70b440c387a9c392, 0x1e7dc143dee1d800, 0x5498ba6d7239912b, 0x332870a017182d14, 0x6be306fc672d794c, 0x2c2ce211245b2b4e, 0x109b722c8d2ba79f, 0x268520fa9c5f727a, 0x515b300524fe78ee, 0x736201eccbaea698, 0x4608ac113210bf78, 0x32d8fd919c441843 - , 0xc9557e1b04b8f2d8, 0x775437f798dc7459, 0x1200f5585ba417f5, 0x2e00ec5f3e7ad304, 0xfc873d5f2b446288, 0x32270a93624876e4, 0xc646a47c08789b22, 0x2370d9fe925616be, 0x430afa3619e671c4, 0x156468ceac1f5fb2, 0x3b84dec2f2417635, 0x31140e9017c0e58f - , 0x5c85f88ccb7443fa, 0x0da75f5d64d864ac, 0x295ff44871b0fb84, 0x1b79e10bad3336c3, 0xffdf9942dd2977b3, 0x4c1b198d0f9a1a23, 0xba778a24c112864e, 0x74f66897f26d48d0, 0x3fd5c06e867ab611, 0x4b98ce33ff7878b9, 0xf7db4dce75cb9165, 0x11665aa099ec5163 - , 0x2a498f16ae7118b9, 0x265ec3dbb4eb509a, 0x3da4230668ce2c86, 0x36e62baab2e33385, 0x99507d4a79ab4478, 0x25bfb2fc411e8875, 0xd7ac1ec933022ce1, 0x23d341ae033d0466, 0xd295b465e962bc00, 0x23d0211ba2d73180, 0xa03ccd7aff922d4d, 0x1e767148de301514 - , 0xc241ab36a894efab, 0x1c9fc2f343fc1e58, 0xca3b96562bd27a87, 0x53623e2285dd7015, 0x557411f01c219420, 0x19265577096b42f9, 0xd3312d941b23592f, 0x30a9a9a1c3c51c06, 0x3d89b0b3ea6e8f79, 0x7eab751dc5c77cb2, 0xc0a9b186e6df6e36, 0x4f844d583f155694 - , 0x419018232793dffa, 0x2add440b6bd3854d, 0xd55480f131df6e32, 0x318ce3846ae3e417, 0x0565062d1a0984f4, 0x6ebaec63d2bff9f6, 0x77075fe729e79790, 0x0dd9434624c8a4e7, 0xbf8f11e2dfa9b062, 0x1b17d8255ee8b364, 0x62c2150cf72c6344, 0x28106880d081e8dc - , 0xf4a4af0ddfec91c1, 0x1a8f0e6c977e1f2e, 0x72a7a3a738b9316f, 0x323716728c4e22ec, 0xc14069065ba4af3b, 0x081514248911d367, 0x51bd4afaa8b6c337, 0x50e77a9b513400e7, 0x46c0051b2a822548, 0x024886e41a5edcfc, 0xa06b0efa41cac17f, 0x336a30b01b9c5675 - , 0x74fb2c10ca097626, 0x2b204caa48e90981, 0x6902c952b9a17b74, 0x39c2e9b6b922303b, 0xb9216b9b3c597419, 0x6d92930264f15f76, 0x7b1297d5eeae1427, 0x0f0744adfe1bd307, 0x33b57e265be6a89d, 0x282fa2e533356c10, 0x3a03995c61dc772c, 0x4f5d8f5e893dcff5 - , 0x4bfc927efc48023f, 0x596f2241d6a685ae, 0x3cb3e0afec29b8a2, 0x31018e0d10653842, 0x2fd00fe944575626, 0x1241d8704982e011, 0x970d56664e6781a7, 0x1b05f49d0f3de2ce, 0xa994ffdf63717e66, 0x416374a76ba88e98, 0x8b082ced53f1579a, 0x56781dfab5d2aa4b - , 0x8151defd1865b318, 0x64669b840d6081f7, 0xe436f4bb5f38e14e, 0x43d438410a974b40, 0x5832ceb3d666be02, 0x06347d9e1ae1828e, 0x6979471b39e3ea86, 0x2cf2cf61cb4b5ae4, 0xb7ab29eada5a6ee4, 0x12e75cb29aca5768, 0xe65b1109d30d1ffc, 0x71f9becd6b320e5a - , 0xdc8289026647eed9, 0x31d62d050ca5458f, 0xea2bbf523a54c1e5, 0x602bf0b9e3ee5491, 0x25aa73622380ad4b, 0x2b6b1e3271df5f58, 0xdbc5efd86aa0470d, 0x05353c24b8c4354b, 0xa3c7db3cf5e06bca, 0x288a1c8f2b4ea5f7, 0xd6152f5e12ce7ca1, 0x59d4c1b436673c7d - , 0x1e02554e521fcb95, 0x66d3980f240ad440, 0xabf16f6b39a4d9d1, 0x7fea351ca94c2f62, 0x3d62b6f3389163ba, 0x0fc6b44f2e7895ea, 0xd5c64403cda7c669, 0x2e4099090e603193, 0x9b5c0faf15fa4c2f, 0x46295c9d8e12b639, 0x5ce4add63a5b331b, 
0x5fa7bd736c4c5879 - , 0x47b3471447d1aef2, 0x28004c1c22325739, 0xd588437d9a3c5299, 0x2ab19c1812cd27e8, 0x3ae700f680037802, 0x1ad163800b422b36, 0x45b7ef36fabc2139, 0x44bcdeff21dcbd1d, 0x41c6da2171e11c7b, 0x2c35ee79f7c4cc14, 0x4852942759c13849, 0x6492d26f10be050a - , 0xa6f54e988c50f0d9, 0x6a2db2b6dd62181b, 0xf7d9806b2a5e57a3, 0x57526bdb3ba53d20, 0x17ce6cb1f500e650, 0x05d841b042f8f345, 0xaa800a6c698de970, 0x04f4b559abe2cb8e, 0xc050dfd7259ce49d, 0x213839bdf94db935, 0xb371258655306204, 0x7d323b8b19f9705a - , 0x26d4502b16b6c618, 0x79717069aa89595b, 0xf867c0e36db41872, 0x13d601d86c76e1d0, 0x2dfc8b0d331b7383, 0x185472f3e42e8075, 0x05bd13e72b10eba0, 0x519a387490f79b95, 0x8d09c1b2d3ad2500, 0x045da45d2cf0f733, 0x640181956862426c, 0x728d57f59bfe1b09 - , 0xf9a99f878da2c585, 0x4fc4831e61dc4e10, 0x6dc602cc54394fe0, 0x0484566b67e9e8ae, 0xc5fcf0474a93809b, 0x71c0c23a58f3e2bb, 0xb400fabe36fe6c43, 0x614c2f3eaee4c0a7, 0x7610a980d0e1c6c1, 0x1ce8197c88885dcc, 0xeade1c9f3ac2cb2b, 0x471ad07baf2f341e - , 0xd67a837c6b01121b, 0x2a8e64281f59cb59, 0x52e701e42f3262ca, 0x19e0a27dece50580, 0xb5691c17a7bda6ac, 0x43484c311b9df1f2, 0xa68155549bae49ea, 0x43a2c5dda225fae5, 0xfa5e992aed700eef, 0x58911f5623918856, 0x648b81a1e48c4da9, 0x66e6e30cbdd0c3bd - , 0xf3ba209c169d266b, 0x20f7a86230447685, 0xd1bb5aaa1a0c3d2e, 0x366c29843d1111f1, 0x06c78b642dcc9013, 0x27484a64e109e3fb, 0x8f8eacbca4677464, 0x0b6cb31b1dc24cc1, 0xdf69c84f898f0fa0, 0x2dd426744920f2a2, 0xc0912a197d4c5c69, 0x489ade7f6a98d8d6 - , 0x458769f47f203e28, 0x124f4123fc05ac97, 0x3bb936f4ad6d7d67, 0x330954fed4f00ff8, 0xc2ce650046f90eaf, 0x7bf94762d4f9debd, 0x2e93172a586dfb83, 0x3c7a6062b4113d96, 0x5ddb0397147f0d93, 0x08e3596fc6839034, 0x374e67ff67639bfa, 0x19021c2119888232 - , 0x002f5d04fdd55efa, 0x05b4c6e079e1baa3, 0xe5678ea3ad74c84c, 0x1c42f7826a58a77d, 0xe054668bd2cafacd, 0x237668d3ede4261c, 0xedf46a6374aebb32, 0x31ec8c5931cf0ef4, 0x955c2e95c35b5825, 0x27d8b0ea68259603, 0xb7a8976e427d1ec0, 0x6b6cc5c07152bd13 - , 0x03d88f0ca0b244cd, 0x001cae9a8cfed897, 0xa844b3a1f693a7fd, 0x676c9acb7abdec96, 0x631b6bd5e0cdbd33, 0x29f289dc0cddd9b8, 0x0947d57536fb2eff, 0x1eb2ce650e3eb059, 0x2139b3a40e8bf405, 0x4165edfb39f4ae8d, 0xe061eda67a70d6a6, 0x2e3cc0328c9084f6 - , 0x1ef8329ed056063f, 0x6d4d01ce49e8b3d5, 0x0110c92f1656d34b, 0x6dad1c4e170829e0, 0x584c56c590b477be, 0x597e5f0ad525e935, 0x6008264d8eb7d36d, 0x3f586754999c829e, 0x3d7ea89df5546a1d, 0x41754f7d9a3f4364, 0x3b0796822ef879a7, 0x1ab2779598262872 - , 0xdc37c9f0bbef7923, 0x256ec818ec35a097, 0x4a72da5c09dd5846, 0x51df6c61edcad45c, 0xaef24fcdcf5ce819, 0x0ba6bb959ae689f1, 0xe667bd65a57b3a9e, 0x71ffd591a28a8e4a, 0x06c325fa53a7fadf, 0x6667f2986b2dcf13, 0x3ef751a6d52a09e4, 0x517a104240b8c74a - , 0xd08cddfd8c8183f5, 0x59237cc71b8147f1, 0xfff94fd188395933, 0x538acc592d10ef67, 0xac51ce386ff0eb1d, 0x69d42b8114c5fe65, 0xa17eda3995bfe8b9, 0x5dc6d98fdf05a341, 0xf2304d375ce8be78, 0x31b58521ecc483ca, 0x04d2d8140780222a, 0x3dc18b2be3ed95c9 - , 0xa48e1639f2d70d2b, 0x4ffd54a6bc0f38d0, 0x8ae3c65ba6b7143b, 0x482eb41f9178fa9d, 0x240b8b4e87ad4f1d, 0x6d8532420059eb40, 0xc135f77e44275132, 0x6261076a0daae349, 0x35316bdb3842765c, 0x246165ba3a8bfd92, 0x1c2d774bd5177a75, 0x045a2f991647e3b6 - , 0xed3b5923594671a8, 0x0514fada5acd4db5, 0xe8297fc358a0f50f, 0x7cd2badcf2952a91, 0x0da45130ea9ac266, 0x26a0d43c1e14c979, 0xbb62b729fe93a390, 0x360357aff7f67ccb, 0x3ad4835d1c7c59e8, 0x570daffd86fa470b, 0xd7c4be698fa3bd96, 0x17e4bdec2ad76ffc - , 0x43ce4ea9ead7dc51, 0x58ba7ae0d64a518e, 0xe014cc7e64680555, 0x03abc953ce2630b8, 0xa318620c7799be57, 0x2b258fa2e84da952, 
0xdd88fdc5063b2ffd, 0x17371dd79a3aa556, 0x927b837578981299, 0x554552101d90ab2d, 0xb45306218ce54bd0, 0x59109b65ffdb6235 - , 0x8663e0c4a180a515, 0x41467fe41c6604f4, 0xae2c1aa4dcb73878, 0x19d3cb02c6c07517, 0xaa147c97ea6745f1, 0x70dac71a31cac43c, 0xb9213ec26af87dfa, 0x67f228e9f60e7b25, 0xbfb59b8cf78df3df, 0x36687792a4256fa3, 0xe1be5c1f23177544, 0x786a9e1b644b1c90 - , 0x4172f47393ca7f5b, 0x62ae5bb4b8aaeb59, 0xbcd9c431fa631b6f, 0x1fbe20b2edc9cc6d, 0x5fdd829fbc0ee085, 0x241dd315adc5dd59, 0xb4b688d625f7dbb6, 0x595a82fee5bed2d4, 0x69653ae0cc11880d, 0x2b9e85fefc402f76, 0xbb2495b507770a81, 0x05d20c575fb34731 - , 0x9d9e623436485ab2, 0x27012a9665f3febb, 0x586cfef484c04ff7, 0x44a5860cc0eabfbe, 0x6fbfe6e2f3532e80, 0x05abeabaaf3220fe, 0x1bed21f2cb809678, 0x2aa62112b7eafed2, 0xe298837cf610190b, 0x1ec8fbbcef9158f8, 0x1efe9b3aa4f96f6b, 0x6a3b842a068b0ef3 - , 0x92dd4b7cd7f827f7, 0x605175bbf3fd1c97, 0x139bb6419c1f6d98, 0x3a3ab2e9978db310, 0xc5c95941c9d5dd0b, 0x34c6c76025b2bce0, 0x0d44115a49bb8126, 0x7622cbeb11daf619, 0x785bff93164ef5ad, 0x7191647d355cb45d, 0x117f255c4cce6e5c, 0x581b448b0e9aae3e - , 0x54a4f3cb36225414, 0x790180c539bc4685, 0x47064043b7c6b96f, 0x43cccf5b3a2c010b, 0x1dfbf3afc14c3731, 0x1c368f3195572574, 0x00bc2ed3b5070b5a, 0x0332d8dd63b37f60, 0x0744b1908c9bd8f0, 0x2d258e628dacb9ce, 0xbba5b4bdb9c61e14, 0x0bca12295a34e996 - , 0x059c84c66f2175d4, 0x1a3bed438790be78, 0xdf394f577dabb5b0, 0x304777e63b3c33e4, 0x59a29d4fe82c5a6a, 0x72e421d1e88e77a4, 0x69e6230313312959, 0x2da03aad8cf2bbb8, 0x2858d8608fecb0b6, 0x343099e7a40243a6, 0xba29b675d29a8f63, 0x3d2028a4f6f15886 - , 0xf068e2d286047d0a, 0x14999b5d6c770e20, 0xd1874a592385da79, 0x78aeb552c15a1cd9, 0x482dcccc23e9c06e, 0x7b18a19fb54b5745, 0x036c896efe9a7a06, 0x2f2c2ce0d1871c13, 0x3b2d9b9ed65492c7, 0x0649c7e50819d077, 0xcdab66ea7b65e3cb, 0x49b15b40c4aaf03f }; - - // The table below consists of four mini-tables each generated using window width W = 8. - // Number of point entries = 4 * 2^6 = 256 points, where each point (x,y) is represented using coordinates (x+y,y-x,2*d*t). 
- // Table size = 256 * 3 * 256 = 24KB - const unsigned long long DOUBLE_SCALAR_TABLE[3072] = { - 0xe18a34f3a703e631, 0x287460bf1d502b5f, 0xe02e62f7e4f90353, 0x0c3ba0378b86acde, 0x90bf0f98b0937edc, 0x740b7c7824f0c555, 0xb321239123a01366, 0x4ffcf5b93a9557a5, 0x297afccbabda42bb, 0x5948d137556c97c6, 0xa8189a393330684c, 0x0caf2b720a341f27 - , 0x892756b15bcf68c4, 0x5742f77c98a526ba, 0x340a5a1de9f89f9b, 0x14ef680aee75d0f7, 0x84e770e14043a41f, 0x0212c41116c33c95, 0x35b791e6de4dc0e2, 0x5949df08518d5d28, 0x6a0e120744ed10db, 0x5a5183ce844391d3, 0x6f618b158afdba50, 0x2ce2037e470e2088 - , 0x1f49fa149a64ba3c, 0x5f9876d519670451, 0x030105056f55586b, 0x020f1a557d8fd726, 0xdf4cb175b06d86c8, 0x694fbcbe7fe58390, 0x7933294a756a1b67, 0x09dbe9924b58f8ec, 0x590f4403cdf197b6, 0x1c07969fc87a0ba7, 0xc496477712252367, 0x5508976022f1b096 - , 0xefda361e452e1775, 0x7a0a0cccacc838fb, 0xb07e791c0be5dc5f, 0x24d9b6b418cbcb93, 0x497970f3c6117e03, 0x3986a158cb96d595, 0x8f80586ce692612b, 0x305cafda7e4df9d6, 0xc1a1c2e06452914a, 0x7ef989c0eb583079, 0x3a765b1f7364b099, 0x4fee236d58299c6b - , 0x6f81095f770e8419, 0x53bbd86b7396bc09, 0x2b72ba726b2b4210, 0x625dda1d2901c78b, 0x0ff5bc7b18cd2b3e, 0x0556598c7358d332, 0x0991245f20ff50d7, 0x0e7f58e5e919a97e, 0x5a0561373b758756, 0x6447bc93f87c198a, 0xf9230604c34c7520, 0x6b214425475c1bfa - , 0xe93de62d6a7f9497, 0x2129459d86f4493c, 0x456394c7c464cfe4, 0x612434fec3f4a1b3, 0x1ed91eddf44261f3, 0x0c6d3854f9e0a3ff, 0xd3fd153188a7e4e3, 0x24691fbdca16910c, 0xbe97465cd7625c9d, 0x2aa61cd373f759f4, 0x824d5763a326d62b, 0x1a0ae39e50da20ba - , 0x32d0c8481ee4c3b9, 0x6c3687109cdd18c6, 0xe52717142fbf95da, 0x67bfa41fb52ce9c6, 0x4e24d6a088a01474, 0x49a6ca0ae3fb6626, 0xd67f8faa9103191e, 0x674888f5aa6d3062, 0x4ba73824c2e85a99, 0x406b2fd18d35b314, 0xa7087b1bea728ac1, 0x11d2f222317b160e - , 0xf8946e007e23a469, 0x22a196fabbce31a2, 0x5309ee1bdc1216ba, 0x240fe9953827a324, 0xf9fcb89b63aeb5c7, 0x603b8149ed16b1b0, 0xb1f1876c02cf61fb, 0x4a5e32af612f948b, 0xfc491aede69a8813, 0x1ad9379136e53aa5, 0x5da50db1d5e6c123, 0x2f4014f7fe2c12ca - , 0xe4f6791d7685c3f5, 0x4c218521c3745a9b, 0x0c0521af98555f97, 0x1462a12953cada7b, 0x0bb2ab63d6452c1b, 0x5783c531ec98bb87, 0x737def53605dbc9c, 0x49f982b930e86719, 0x75b16790cb5211e3, 0x45ad6574cdbae99e, 0x1062b72dfeec9851, 0x45029a09cc468c88 - , 0x532240de77f3a1f2, 0x17bd291eaa9ad0ea, 0xe0a2d7efc2f8a0a0, 0x3a7412052021778e, 0xb0dfb0976acc90df, 0x7fd603b689a7b1f3, 0x1152579ccb00d6c6, 0x6340743b631849a3, 0xebaa47290e0cda01, 0x143265a6d53fef0b, 0x45325d6fd981e75a, 0x0e9780cc39586f2a - , 0xa4f68d207a8628dd, 0x50d230b51893e841, 0xf3bd769a4bb504b6, 0x55975c063969292e, 0x07727ba25fb8756f, 0x07ff86cf8ed731fd, 0xef57fa40cc35a1f0, 0x70753a70874218fc, 0x615954e2342b973c, 0x5aa9d68f1a59df86, 0x3b8e9e9ff5e44468, 0x2e749114d60a3d23 - , 0x14a1b91ec176db4b, 0x55f91a63d69aae6d, 0xf42382327b1b6d27, 0x2acf1f475facaafd, 0xfd9069b479b58968, 0x3baaf4e5c4a45f77, 0xa2ac9ab98a7aaab6, 0x5466cb5018f50981, 0x3e6ba27771ba3205, 0x31ea90cdea1bbbe4, 0x0000416b5c557393, 0x464cb0415a510d7d - , 0xd02087d206ff2bbf, 0x2b9c8ecd7fabe736, 0xb2b56d3842caab0d, 0x046ea0b7767700a7, 0x113a7a889e317310, 0x5992a354bef7d0ca, 0x3edda94ed50388bd, 0x052661f767839154, 0x4c28edf6e19e28e0, 0x1d19c2f2d2f644e5, 0x5d732148db35ab3d, 0x680c4714b83580f5 - , 0xa374f282bb80ccec, 0x789e609bc77ae11c, 0x10d2577d599b45f2, 0x1c548b5b857721b1, 0x7baea726b4543fdf, 0x3c1562912d1b4ed2, 0xd6362203b7e82082, 0x1414e523d3c7a900, 0x7ca349951c1d23a9, 0x4da4265e3ce80fb4, 0x7981ebbcaca9ef36, 0x4ebac9e5b5bf980b - , 0xabd2c1dcf49cb5a4, 0x3f54acfc25c6340f, 
0x202eeffabbd11cbd, 0x67216b7cb3695e8c, 0xff7cbcf9b23fc9f1, 0x2eebebdff7fa7afb, 0x71156befa111f85e, 0x1b8fd98df522902c, 0x6b28ebad62519791, 0x6cf0ea960e01d8ed, 0xb4617bc2006967d5, 0x323da065cb3df0ad - , 0x31687d0741e24d9c, 0x02db8f2b509a7cc2, 0x9243f85924320527, 0x68c360f01d6e6d2b, 0x2351c5e877d5306a, 0x6f56ccfc85c5f3a9, 0x1b09652837c4928f, 0x0b3337554c83f971, 0xe2931be2ccc783ec, 0x46829694ba08c64f, 0x9f35e36358e2c6ac, 0x1474b333b000d170 - , 0x24d792756fc96640, 0x618fda9fef868c5e, 0xb7ff5b125afd9375, 0x778dd97e0440c258, 0xfbff314886219627, 0x3417e1e1e2a7e811, 0x21e959a88f7b7bdc, 0x3508c2eb8c3c8672, 0x827ecdde111c430f, 0x21bcb19fb07aa134, 0xe0c1fa50ab2f5746, 0x401e680b4e6658fa - , 0x2cc24bab313693cc, 0x20541c12b964447a, 0x374975b6fb81c3cc, 0x52905efb344e17f7, 0x79c5c9b56d8b5f9e, 0x3390bf75d2b9a3ec, 0x7ef3807d895bf4e4, 0x2814165a42046b51, 0x7f8cfd09326fe158, 0x3232fb4f4c9762ec, 0x5678d6dacc194d25, 0x6f7caffb0a7545e8 - , 0xbd981637b23e7963, 0x691d7b7cb88a0ef5, 0x10ba319ae2062914, 0x06fb144f8295a85b, 0x80e620976bf62f8f, 0x2a425971ec73d6b4, 0x800aa9e741d10b1c, 0x230d7d8bd1a0469b, 0x65aace37428dfe8c, 0x0fcab5297f58b667, 0xcf0e9526943af7b8, 0x7d90915b75d4dae7 - , 0x7455a46156259d6b, 0x29bcc06374cce1b5, 0xf2fb0ed3aa87aefd, 0x211a06af0e54dd58, 0x6c0c95c5723de9bc, 0x6299b6ed25008ca7, 0x7fd63e784d4dfb18, 0x2cc93b4d9bc1db30, 0xebc7e2d44c5d13ea, 0x3278e18d4d3d11a0, 0x349e3dd25a215f79, 0x7eb2a7150b30416d - , 0x05f3d7d5f6a094cb, 0x2a3771d48e331405, 0x08ef39e9dc96f009, 0x012248373a364992, 0xf758f92fc9fd4d33, 0x2339d8c6dfd3ca6c, 0x8b000965962673b4, 0x746ff43eb99d9054, 0x47ecdc054a422eff, 0x33d8f7c8267b7f0c, 0x22fe00ac921a42ae, 0x31e57f3d31fcd8e6 - , 0xbb912315a1c50869, 0x4ac8cdb0fa7ebbaf, 0x0541d74a60973edf, 0x7234900334b2c5d7, 0xf2e545f730adfa33, 0x224e44e63db5ac96, 0xfcba3d005c6fdeb9, 0x2c93a4e6559936b5, 0x7727a0d7ad88d758, 0x2e33100216719cdd, 0x7b2ef89aeb2c0254, 0x1f6de5b74758afb4 - , 0x6ae89047114fb321, 0x3d605e9a6ec6d80d, 0x18e915c727a874d8, 0x699088b5e9d0912f, 0xaf9344618e056f10, 0x1b9169df8245e0b3, 0x5eb8c33d70f4c891, 0x1609ddfb222b13c3, 0x8131c885d1b366ed, 0x7bc3cf9d9cb1a7b0, 0xd297478d2fc93968, 0x13cbb4573a4ea7f5 - , 0xdd37b5cc64d5986b, 0x7ed3d1d7d81ab5dc, 0xac53485f23973c9e, 0x0705675d333b91d7, 0xade5d213c43186c1, 0x6a8bdf57b4bfdf14, 0xa87f88a1de717963, 0x17f29220b519bce2, 0x7af2d7fb0f95c610, 0x28d1d3923b144a7c, 0x8e73c3d8972813e1, 0x00100b40c62e72c1 - , 0x84de7a81fa1f50da, 0x4fa391d6589d8244, 0xbcc3596f0834b285, 0x4d4acbd60a24e9ce, 0x97fa98b8c1835a0d, 0x33abcf8e29901d0b, 0x60a73d1975b3d082, 0x60666aa4325b948d, 0xad54adb769284a39, 0x227a98d113609b28, 0x4a1e1ffcae6a3872, 0x1e4ee44bd67f818c - , 0x5a74c6bb4387d315, 0x019428c0b1b18795, 0x5cc153e270bbb055, 0x2b3cabdf00dc4a61, 0x834110c026924b57, 0x2d30e985f2d9f217, 0x47116979333389f5, 0x53e3fd6a18202417, 0xb1393cd79c2e5864, 0x58d92935e4112e82, 0x86989a7ec8305b6d, 0x42a8fe4eee28f37a - , 0x74e212ef01591901, 0x3277917a0397b1b9, 0x7bbcbe6e3d687544, 0x0b8957701d09afb6, 0x6cfbc8ee74503668, 0x48a9925ada9f8348, 0x57045753ba2d0f4e, 0x7d69ca3866223d66, 0xc7054ce22917271f, 0x41bce1e1133b51de, 0x3a3ae42df81ec35e, 0x7eaada0f42d47cc3 - , 0x13b138f1048a57cc, 0x64f98abd7e915a8f, 0x7af195eb16a0c732, 0x11be81a791d634d2, 0x97d8df47430f61b8, 0x0767c7b381271004, 0x3e949136fb940aa6, 0x3bdee340cd956dba, 0xb250ec4ff91d2602, 0x4cde2454d47f59db, 0xaf5e749530d978cb, 0x5a8e2f2119d4d835 - , 0xdf1cb5425a0744df, 0x3d3b08a7bf35d055, 0xc6335e832de4719c, 0x6eb8d97e09154d42, 0x2f6a3f8de3d20dd9, 0x13f23cfd276233da, 0xb4a6b80dfc0fa41c, 0x58d876403acfd7d7, 0x2ad422078b8e139b, 
0x73dbee2abbaf494d, 0x09a2758891eca3c8, 0x6ef9a9f1178b0938 - , 0xfc7e9ecb90c637da, 0x3a04345fc10b1a7c, 0xc024e9cb62f9ff1f, 0x6c4f9c3aa4aa33d8, 0x049d6995b95ac1f0, 0x2243845195763a1b, 0xa1466a31700ac276, 0x600fb7123a325905, 0x9d391a64a0d35a24, 0x3b093b550641f108, 0x2275de5bfd2e221f, 0x25f5e7465963db1e - , 0x3e220107f7e7fb84, 0x6f06a23bc1b85a8e, 0xb4198d19f6eb0e48, 0x5dc11761dad45fda, 0xba303e492ab52a0d, 0x127c69c73da9f528, 0xd3a5b70cf6c790be, 0x0d72b0c50819da5c, 0x193f90d62ec2cdf7, 0x67f7d0cfc4f46daf, 0x7aec083d52f380ea, 0x7c0a1dda4a28bf4d - , 0x46fd20fe6008cba7, 0x7a588c914115d595, 0x8fb1d3daecf45f78, 0x0851dac094e7b036, 0xcae0a76e2a32a892, 0x104f861322dddb2f, 0xb79d81e46e1f9006, 0x1e4d28d7a2498912, 0xaf3175d3974b89bf, 0x613d00f9a69c55c2, 0x23f6883e8e65226f, 0x072f7ed65c6def05 - , 0x6690e643bb38e243, 0x1a81c4a7c9189b15, 0x1056d1669e4749ae, 0x0137f2a7418f190c, 0xed3192796e699d16, 0x3ed76db45c38a37c, 0x78e86d1475a88243, 0x45985aacc495b16e, 0x47d5c8208e8f1030, 0x6dbe5f68b4d0e782, 0x08d3d0182cf7f26b, 0x64c375ce172fadbd - , 0xba0f6db3a20c2875, 0x57e1d90a53241250, 0x0315433fddf8e63e, 0x33344750e37dad9b, 0x62cc0d28ae69b016, 0x435fe80f6100d547, 0x5874aea8669d3df5, 0x3b96913f8264d4a9, 0x738067d6bb1314b0, 0x48cccf24cc6f4ccf, 0x6f5e2bbd68b777af, 0x34c2c37ba9635d66 - , 0xd731534900fdbe5b, 0x4e4f9d97afe11d43, 0x81b41214351b73d7, 0x1d48d100ad11a5ae, 0x2a4ee76628e2b151, 0x34902e901877efb8, 0xb5a8561a0fd45394, 0x44317af6d5cd5ac0, 0x354c2469e9068bad, 0x0771fe2761cad022, 0xfda76ee8212d0f2b, 0x76cdeec6d4435495 - , 0x55c98575b3e825fd, 0x2983325ed5d73a1b, 0x563c4c4fb3f466e7, 0x731b0fa413338bb0, 0xdeb519ca57a05240, 0x7a7e909b5c4f7351, 0xefb7c153dd2ab28e, 0x11ca1c865dee30b3, 0x013ca8348d9d7de1, 0x575e0bdaeee8cf9a, 0x464c98a21083af7f, 0x683ddcd85c212ee3 - , 0x1171f0ab4cd02019, 0x22c7e01c7f4d64c8, 0x972ec0ef3f2e2ed3, 0x623f83c2611a476c, 0x99b3f16be9aa25a1, 0x2d3ebc5468990e0b, 0x5d5fba8546a4d5f2, 0x4716e6919d2986e3, 0x3ab2f2bc183f5d6c, 0x5f6257d3910cd4be, 0x341c6f2a78f94f2b, 0x6ee8390b8a5064f5 - , 0x9d8640b9b83ca8e7, 0x033c5ad24466be3d, 0x6f6cd68db30dfd59, 0x52aa6b1c0f90f3f6, 0xfe7bcd4c97403646, 0x11ab3fc960b05fb0, 0x24584b77575896da, 0x427f8deb932da137, 0x928a28cb505306f0, 0x04ae916fe863820e, 0xaabaa98911b9cd3f, 0x59e588ba994d9145 - , 0x9b8f1afabeee9e9f, 0x04ffc7ef3476ff8e, 0xe9cf53ce9937b146, 0x73fe42a801524448, 0x224bda3cf3bbaaad, 0x5fa85056d59884a4, 0x8e6eead48345726b, 0x09230936d41736d2, 0xe679eb58d1ad6be7, 0x08bb759b530b1eaf, 0x9688eb527860e24b, 0x13704d2daf9af278 - , 0xd9273ac71b906f14, 0x57ee05fbbd40deb5, 0xb7788e19ba9e61eb, 0x7967b6dc1c5d9699, 0x36e043fc230127c0, 0x2a716598bb2d519c, 0xc017b2840d4d1b07, 0x1d3bfa489f756a3f, 0x4ad73abf24318d36, 0x1915e6f53e12625d, 0xb219a7c941f89084, 0x2280087a8f4762fc - , 0x8eb280345fd1b4e7, 0x55b8d4ee5772fd79, 0xc9e63a787e2ce2e1, 0x685741adbda93885, 0xffb830ab11a3b491, 0x7e891121f9356428, 0xc03aea271a629078, 0x71c45932930a2639, 0xe7df192a6bf81795, 0x704aee8f183aadf1, 0x06ddb55a8a7a63d7, 0x52556d8763f3033c - , 0xb76b458c6f0c33a7, 0x28666b87c362b95a, 0x365ae575a4c27b9b, 0x36ef35110562adfd, 0x89955dd8d927f9c7, 0x526e787d6a586c9e, 0x762e0bc4eff988c1, 0x6c9523b4b5ae4946, 0xe90a909688cfe95f, 0x658a7dc8b3ffada3, 0xbee148ba7a58520f, 0x6819007d8573d1cf - , 0x75d3b5ec141be9c5, 0x4bc236ae634f3c27, 0x1192fa9b8b30e894, 0x4129d43e1d092cbf, 0xfcac068558bbea45, 0x513e8d87b8116534, 0x5377a179a155ecd4, 0x6c93531e5545572f, 0x727df81ba09aad91, 0x07527139dbc96250, 0x150320b1d8ba172a, 0x2281e85f60a1809b - , 0x7164b7d524eba6af, 0x50d387163fea4ca8, 0xe90de17d62aebe78, 0x6ab369ba28c0410d, 
0x17d07e315a95d138, 0x58b496352453fefd, 0xb87a04dbbc101b92, 0x40a8f0fb757e9b0e, 0x2148b48a696e64d1, 0x4e004a3a350c17d7, 0x17927e9f386b563e, 0x29da9cd441e3e3c5 - , 0x883d2dc357417213, 0x2e94653ff7862644, 0x53a37af548453df1, 0x04475db3c300b93b, 0x2d65fa4d815e7204, 0x231a2db74c2c3ccd, 0x1fd734c0cf4d97cd, 0x32d255c105f6d122, 0xbb74fd9201eb07b0, 0x12e33f1c81ac6f60, 0xfb9a6439bea97072, 0x52e14b7db9cdcbc1 - , 0x637ac1a91ae374cb, 0x1c8622c35adc8224, 0xeb786c50a64b7d33, 0x362823a7232a5893, 0xf22dafca688d472a, 0x18598f0e0237f7c4, 0x97b8497bfff4bcf1, 0x7abf4cb27a9c5b7f, 0xea47c44e3b3d95d3, 0x58728fe3e1827a43, 0x7fd3681a6df902c8, 0x6db1dbbdc413de79 - , 0xbc4effed1ac3007f, 0x7f31a54744887cab, 0xe6559b4f8bd2519a, 0x18a78ec5b0c241db, 0xf6e10285b15d2030, 0x5c1323ea219a8ff4, 0x134b6f20dd116b47, 0x5d0abddbc8998733, 0xa3c993938702e151, 0x0ab6aeb494f6ad5d, 0x8cf3b4beda1815e6, 0x546ce323008c2fdc - , 0xa10eb5a6a78dbe39, 0x26d2e8a8b8457da4, 0x026ccbe31517d806, 0x2a35174b812f562c, 0x57d70499dd7a374d, 0x3368f951acd3c5e5, 0x490b2515f901062c, 0x316109e7c315c377, 0x32e20eba569535cf, 0x496a8c39d667d709, 0x5578096dc44d5e0f, 0x608a162ce73903b0 - , 0x6b2e65852cb37cab, 0x75b09a2e6ed609a9, 0x7ac84b3082602455, 0x7690cbb594e84b94, 0xfc85dad9511973fb, 0x738a74b08c9006d0, 0x83233fc939d5883e, 0x7fbfc08b5db3c9f4, 0x81a0e493fb5f7749, 0x2c255ef7e69a77c1, 0x234f02e609cc656f, 0x5960cf0b961f3cec - , 0xac72940237b1f17a, 0x434e038a29d446ac, 0xca6a090e00d8b0c6, 0x1f1aad24001e473e, 0x6d64b6dc133399fe, 0x0899ba41e9dd4607, 0xca590b3f25bbf5df, 0x57217978b0d8ce11, 0xd6b4cb13da6de9ac, 0x3c88520cf564f75d, 0x649fbd5075a7757f, 0x3f2593b90fe72161 - , 0xe1bee53e91dcc9a8, 0x010069dce4c74a92, 0xef83968978aa855c, 0x6cd8848183b53d73, 0x0b3df59610e403eb, 0x713225d446180a7f, 0xcc23112cc59850e2, 0x105796b670a3730c, 0xa147f4ec7a2fa4cf, 0x32da1f072d75b253, 0x4e7007455e85f560, 0x76a5376a771fdd60 - , 0x47eb4fabdcc699f7, 0x4e45db6334c6ed96, 0x36066f2bab72546f, 0x04f48065593ecdec, 0x3fec02793fbb5601, 0x122f74626b64a526, 0x21d0f66ff83b4dbd, 0x1370610ede647f1c, 0x57b82242b88172c9, 0x527dcbadfdc65ade, 0x5e9c9a04385c93f5, 0x64d1cf9e52548a6c - , 0xba0073337865c994, 0x633ee14e50bcd615, 0xf840228ec4251095, 0x49bb96812a98f08d, 0x82f57d0422f96678, 0x06d7e43bffe7e0e1, 0x33910cca752ae863, 0x04d46e7c66087e38, 0xf14935c4167017c3, 0x3f22e2f44d03c9ac, 0xa6196244f2cd6164, 0x15a2b4ce514fa4db - , 0x5191a04c4abbd0c4, 0x0e763360ecc8a19d, 0xfef583c184a673c0, 0x75c2f30a7c7433e7, 0xe947a55547c7c099, 0x245c7ae44f6e7a83, 0x67a666f9e6bec2d4, 0x5de0b922fa645ac8, 0xdd9b3e4a5cb72e22, 0x0139c2c857adba8e, 0xa7feb68e863ac231, 0x501381ef88ec2da0 - , 0xb2b8c6a470f40b01, 0x051d65bdb8363062, 0x4ce90414a6d65714, 0x1e510b525d19df0c, 0x569e723f5d374cf6, 0x4bfe02fd38fde1f0, 0xae7459ebc50f9aa2, 0x0f7e2cb170dfde32, 0x3c3da2326a7407cb, 0x0cfc50a85ffd1842, 0x62ab34c85e85c3c8, 0x22b4d9644bb37333 - , 0x57d313b3d87c2d98, 0x4f432c1cba49133f, 0x6163d11fa4befc0c, 0x1ab94e122fddf12e, 0xfb7c9358aefc85a8, 0x5b20068f81d949b1, 0xcf8ed6ff2145c810, 0x5794afc021932d00, 0x5c8987ad9b6e35d5, 0x6bb1f4b836fda03e, 0x794f1fed4a3ea1d7, 0x0cf6d128deb0e7bf - , 0x54ec3e1c65878cf5, 0x002811763ba2200e, 0x382d917051e77b71, 0x49e00cbd013a9e7f, 0xccf576e9a4cf019c, 0x4b4a66287970333a, 0xf772168915edfc1f, 0x278eb5eca6479685, 0x8a95c8b9cf41cf06, 0x6e58c9c7826d39db, 0x478e119889f2fe75, 0x73ecd21991bd98d4 - , 0x26e751fe9fbb9502, 0x29825b71b0632e95, 0x21668f96ef8bb5c5, 0x2f2a899e53c9a004, 0x2803292ed4345ce8, 0x72731055c7c65dec, 0x3aaaca9c4b6fe9a5, 0x6228d3ceda8bd671, 0x773e2c5effc48eaf, 0x017ab19e0fea9ac9, 0x9609e10496c8d766, 
0x121e89f9b302c30f - , 0x4e87d00a0be96480, 0x09bd8d170ba9dbab, 0xc6756f947ecd4e52, 0x2c9e40bbbccd0f5b, 0x42a5b77669fd812e, 0x66aba9583b080d9e, 0xee55df99d16e77c1, 0x4cc00c5c5eff2509, 0x8c84d5e20ab7c16b, 0x00ae5c96184ffefb, 0xb295e90346dcef54, 0x5d1bda0a39dc3b72 - , 0x75f92d72a89b5ef2, 0x259d998c9ff9ac0e, 0x8a1cfb72a6c433c1, 0x23f5b71d49d67604, 0x478d8f30914f62ef, 0x08fe61135218eca9, 0x4da2ce9bc6488c4a, 0x15f1eafd35283e2e, 0xc2d2be3ebc42ea0f, 0x2a5216539d6ee902, 0xa1e99052e7bdeeb2, 0x3a8f2631ec78290c - , 0xb71518a82ebfbfe4, 0x24700671c46ebddc, 0x6ef52d591a221f75, 0x4794614db6a67d92, 0x761f5c8ee4bab607, 0x31d9dd8f2361b5d5, 0x1a45593be8db3b29, 0x7f06c365eb116260, 0x9d305a66e52eb65b, 0x5edcfcb5613eac18, 0xef34fd28154adb75, 0x790f805753b9d742 - , 0x6ecd5ac255dfb797, 0x0cbe14db5d9a88db, 0xc1c86c5efa815528, 0x2c636133ba59d887, 0xc75d42c2d9f52297, 0x4bd3540c21e2ebd3, 0x32e7cdf790de6903, 0x1aae3c9837d3e30a, 0xeed028e49d436f09, 0x779ae12351efed1c, 0x6e0145587d9797a5, 0x25156e4cee9a407b - , 0xac2fd82f2ac57119, 0x7f8c026f1d182ed2, 0xeacc0d8fb3241611, 0x5968db65d2d7545a, 0x7d525846b1121dbe, 0x57949fd7b80339cf, 0x471fe9bec9b66c01, 0x5c270057f1268efa, 0xce092463083f656e, 0x16e8241cdc862cf9, 0xb7cb2bbcaa06b312, 0x3c25936bd8863416 - , 0x19b8ca966c4a3827, 0x1ae43badfd21e63e, 0x1dfd002b95a6ac6a, 0x4708e27f6d98e997, 0xb5fd6322dc31ac7d, 0x53baf4d9a16dd550, 0x025aa2ea5463960c, 0x5b5b33c7a3cfa54f, 0xdba287866ee96b90, 0x4748c1f3f3a6dc4f, 0x2333ec05a80c154b, 0x4a47745d5b99fb96 - , 0x44955b062a6ecded, 0x7791feea9015f170, 0x736bf603d12fc35a, 0x2632adbca5388026, 0x956e4c48e1697c4f, 0x4ee9adfe8600e32d, 0xa584042a0da56406, 0x34a3d7f4bf457353, 0x8d4fd4fe00176fab, 0x15321ee855941f4e, 0x670701ef81f340a4, 0x0c7d7c618aed0ba8 - , 0x73283131d9bfd9d6, 0x34935a39e31bac65, 0x466cfbbcaae8b991, 0x250dd54e18478ac6, 0x659e46c51e40de4f, 0x618ea014fec50e04, 0xfe64d883080b877c, 0x572cabbb6688c4f7, 0xa2c817493a834146, 0x06cd734876378120, 0xe3de0b717336a849, 0x36942f5191db53c4 - , 0xa3f9adf66abf4d88, 0x2a9a144b8087fa96, 0xfe49fefcb78a5b4f, 0x1be40a8616928bab, 0x07a901975521f7aa, 0x1fc66ea683693510, 0x4dbf0084ba42380e, 0x1f374495b918c737, 0xb8346956a380a00a, 0x1346f4766fcdaa07, 0xb4db5689d46312c1, 0x775e7f3274dc1316 - , 0x07898828f32341c0, 0x144390a33b3e86df, 0x70bc604ce1e9c5e4, 0x127652de00220873, 0x2874bc669df50d45, 0x236f4585150161f4, 0x3bfa4ffd318214e2, 0x7cc92a6165059745, 0x2fae0e92090ef72a, 0x26676bd59c4fcc3b, 0x220c030974d1d447, 0x66455887e98686e7 - , 0x4164b8e4d8760ddc, 0x5517a86f840feb63, 0xd9b42c6c9371cade, 0x3a7f03ceecc160b9, 0xdd4086d64cae366c, 0x1b6290c327842533, 0x144efcd2a7a0e82b, 0x16621925ca10d31e, 0xa9dcd13118e208f1, 0x5a90f97edcb1c54e, 0x80c47331c8749d99, 0x6f061a3569a80b55 - , 0x0f6abf619e2a15c5, 0x29106c98122245f4, 0x5860b10985c9b47f, 0x4f379a379e15f410, 0x2dd6f45df68e1678, 0x2c475167ad9b283c, 0x23b7aa00952a6a3a, 0x5532bc26a40c5365, 0xa5c0a8be3596ce22, 0x4fa3127a9aefa56f, 0x944e843aa973e67f, 0x3c7727d45ae87854 - , 0x48fa2ce675117ea4, 0x7bca8e04ad3bbb9c, 0xd57439e4726f88e5, 0x3337d3a6a03b2286, 0xb0b6172902005953, 0x514bd76734e6c0a1, 0xf97f8934eed7c6b4, 0x0abe13cee7f1b75e, 0x6c88107a120e54a7, 0x634f966d7a6e11df, 0x5044c53109b94097, 0x68d49fc65522b73a - , 0x69e295cd8c444666, 0x542c4c5fd999a224, 0x13ff89418b5da76f, 0x7133fa786a87ecb4, 0x2f180926456402b4, 0x52ddada7931c4dcc, 0x6eaf0d2130c71590, 0x014ec2a2ec231826, 0xac05b61443b34dd6, 0x157acbfab118b219, 0xe4e2f4b84ad01099, 0x0abf4a4da29a0eb8 - , 0x5f852b85b59eab1f, 0x1bd259c4726869ed, 0xce565d9287790a15, 0x17a48442bcf58a00, 0x01e519522381363b, 0x2336d07a710da07a, 
0xcfebf2fbdc714cb2, 0x2f7a51474c23b8a9, 0x77db2a07d4e3716c, 0x40e8d8d2d0a09806, 0x644363ce6d401ae4, 0x53f9cae0470172fd - , 0x58d96ecd8ddadc53, 0x15028204f3d6d696, 0x6f40a09214439ce2, 0x738c5371236c3e56, 0x64f87ee7a28bf9fc, 0x4f1899449a810fee, 0xd0aa95f4bf21e376, 0x6170cc24283856bc, 0x9dfc4927d764ff75, 0x227ea1563fa2e012, 0xaddd3665622ce087, 0x473d3bea07a5285e - , 0xc0b986ee0d2b0eb2, 0x78e584c740dd18ed, 0xd5adbf30a04fd508, 0x1c6aed5ab59bedbb, 0x25d05fccbddb5ba1, 0x4a58fb6b3f896319, 0xdb2f6343fd8144fa, 0x46a445de6d5b07e5, 0xf67a06684fe9e1da, 0x57b2515923b15c9f, 0x50439940820a2a0c, 0x62f4b9b26f04dab5 - , 0xe79ea601d01b033d, 0x009bc6176f10fffb, 0x333bff2f907ed39a, 0x253d0a9e626dd400, 0x7a9bbedcfcbef06a, 0x2d1b6a7a5b39342d, 0xbadfb462a124cc9a, 0x2e8cde9d82c15cb0, 0x7c3f81bcd6f1b2a1, 0x04cb0b8fa4075294, 0xfa36d3db38cbd304, 0x59fef93442883553 - , 0x91982a741cb9342e, 0x7b9d63ac17b01982, 0x530b4ec25a293ece, 0x611069ad9fa0f0a4, 0x7a262a59b656a79d, 0x6fe6f8f4d6d015b0, 0x2c2fd7641a5d4e50, 0x24b0c507058c911c, 0x834882e492fe45ae, 0x68d0b01b13432761, 0x0eacaaaf94178b8c, 0x123e3a93006d7d01 - , 0xecf2fe69377ff33c, 0x4fc960ab4408584b, 0x2adc445b1ee45654, 0x4989681cd1d09a93, 0x79509599afe9e3b6, 0x7f6ffbbeee861c15, 0x2ed2859fd6391b25, 0x5e8bd52289b6ad27, 0xc949280adbce7c79, 0x510999e865f0cd54, 0x7f957314ce7d373b, 0x4b2c0ea4bab08ef2 - , 0x2d7cc08b5c05a8db, 0x4609a0ea23507697, 0xe204ba35182c55b8, 0x5e4d5903fdef61e6, 0xfe63842f2826598b, 0x782a3fd3ab62a179, 0xd2f01a1979e5a0f3, 0x0fb4c6bdd637fba2, 0xfbff4c192020c350, 0x14859008c3d223c0, 0x65ed7a889c1a2e55, 0x1d78daf483fa12cb - , 0x5b54d11b01bc09ca, 0x54fde75737306515, 0x89725231105b63a7, 0x712d1f394adcda99, 0xb554006ee9abefab, 0x04dd8f7bbd4c5381, 0x98d22b3a31995549, 0x637a53de6b57122f, 0x8367d69b4c92da63, 0x236f2a9514250df6, 0xb265509af63d7b7c, 0x08522e36bc4b65f8 - , 0xabae725012ce8301, 0x493b257197a98ce9, 0x33185838570e5f0a, 0x65f5477ac414eb6c, 0xd002a36854699753, 0x2be693b4d96efdb3, 0x3b32484119bdc53d, 0x55691ac09a8fae1e, 0x0249e394514c047f, 0x765674c90b78171f, 0x1166f64638d6ab37, 0x746adba4cb52d18f - , 0x93e293653dda6cda, 0x5d004ed52ebf0b68, 0x65c7c42d0ad96cc2, 0x3350dbe11cafca74, 0xc638cfa8942fef67, 0x0ff2dfffc5ac1164, 0x9e1b625e649aa471, 0x13a219d03d2eb86d, 0xdb92859ebaf9f7f9, 0x645c50918f7d5abc, 0x25c10cfe99f7e5c6, 0x13d858b53f90170d - , 0xddb258b13ab1e7a6, 0x4849ff49f4e13fc4, 0x9ef87fa85511cda8, 0x48c50d4d3b4d2f7a, 0x6c98422c8007c9ac, 0x3fdd72e65a3d3491, 0x56b18cb165b4ec3b, 0x6e2c6df9e3fc3daa, 0xf6db5aa98ddc97a4, 0x423fd4082f3fb795, 0x42f8f5edf424d0a0, 0x1a091c2696139936 - , 0x3161c2bbb3b2d58a, 0x2e8d339eb0fb9099, 0x45ef7d11f6fab685, 0x7f222a068db3da4b, 0x9af96f9742549a7c, 0x55370df31dcec81c, 0xde98e81b131af02e, 0x58bd0622a474acee, 0x8ab40fa7ca882e0d, 0x5b4db195655f2410, 0x4754eb479ada77fd, 0x67a8a437d6fc8a7d - , 0x9888254a4f0c9d58, 0x3232ba83bed0c618, 0x587b0de0207b57d9, 0x020df6becb096aa7, 0xef9e41052a29a8ab, 0x4ae671ee70a15a69, 0x167ce954923ee086, 0x6878c3996c1de887, 0xb29c711490ac097e, 0x1cf41a9c2577d144, 0x0590796ba46d8d29, 0x1c2e6dc8d4aebb65 - , 0xbfb904f8ac9b4cb9, 0x4ea1742c786469e7, 0x5a422f48401be57d, 0x0be0afdc77d6d32f, 0x5e8765cba2c738d3, 0x7dad0475059a089d, 0x9288ae0c40df7df6, 0x51c65f97715a16d5, 0xa9615d4c786ff9d4, 0x507ffe03ec0189ef, 0x1c1f46684604e41f, 0x282fe9d567db0efc - , 0xebee7f8381fb8178, 0x5bd4b6045c208d57, 0xf35694743439ed71, 0x7cddd5a373ebc5ec, 0xa58df33cc68e3b5f, 0x40e6714f5c5c8df3, 0xea881d4bfd489131, 0x6b36400b491c28c1, 0xd4475cf594b6303b, 0x5b630cddc72e654a, 0xa0b587ad34394ce3, 0x3ea3ba6014f86275 - , 0xc3deac125d20eeee, 
0x2ef3568410a2b3bb, 0xee6ba3fac5d7ec00, 0x5fabcb3337aaa23c, 0x6b1212e7b817889a, 0x0b37d285a9be51d1, 0x617ca543d762bf51, 0x0896b4ca694b01d0, 0xe3add9718277a1fb, 0x553dee7dd4784865, 0x904b8f7e936cf430, 0x5b6a78f20b244b90 - , 0xa2b876c2914b9bfa, 0x704de952e9d969f4, 0xb04ea1b54b7e7654, 0x5d307bb3949cf660, 0xcee4c23ebd049d17, 0x7a88293bb1031063, 0x00b8432b8286f656, 0x260a9c86a16216e5, 0xd140e6e6629d8686, 0x296011ff5601a000, 0x536f0f76cd9b2928, 0x267409c23a823dd4 - , 0x0f041043797f8423, 0x3da6102605962ca9, 0x2e69dfeea02098ea, 0x427e7eeeecd3a0c5, 0x75efa5e8a590793d, 0x1f5841df6dfdfc91, 0x1aa1e1b8b9f3c326, 0x07bd5b0983fcee91, 0xd169420be9c48939, 0x7940334f0bb9023d, 0x9bb330fff113764f, 0x674ff1b0cfe246c7 - , 0xe2083f8d7129cbab, 0x7e6223e3d9c04904, 0x9be411a7d5e883a3, 0x72642664e7c25590, 0xbb1f783b5c412322, 0x46716e8fd737280b, 0xfa363eeaeffde271, 0x6c256c131fc2c3b9, 0x13259abfcb2ce1d8, 0x53b96556e96aa708, 0xfaa7c8d25119da19, 0x05019f438e9f8995 - , 0x05e1d55a9424f1ee, 0x63e8e14e6c2f3f09, 0xe9d844e997a10158, 0x51904ed1e94a0ca5, 0xb09462d4df6bc6cc, 0x2ee5308e62172691, 0x3f8438484547187a, 0x62b92b8d9739ddd4, 0x3ca54ab5d39f083c, 0x25b3336048a288d4, 0x7cab0fd67e296979, 0x58ba2e783962cbb7 - , 0x77808f1a1b8f3515, 0x290c219ee7153bdd, 0x7584441f79128f01, 0x0442db406f5135e3, 0xe741de52ec030a9d, 0x37469756586776b2, 0xbd64c2a7173adde0, 0x2280b66d20888d0c, 0xdd1b53cb4adb0fb2, 0x3974964394c445be, 0x53b6a95e7c7fdd97, 0x6eacdc6f50496d95 - , 0x178d04c0578a5bb3, 0x0d171a5f5215c9c8, 0xfe0d0171c504962e, 0x04eece54b220495e, 0xac4d145001db67aa, 0x6577c466962160af, 0xcddae62d99686ad7, 0x7a053a048d230d89, 0x1ff09aa0e605a880, 0x5d260426f355232f, 0xfbdaf7b0b53aab89, 0x5eef31b9eb0df78c - , 0xfb787e56b7276288, 0x4dcccba87d630d06, 0x415e4a4bc0a44b01, 0x0f0a981f71d8ae33, 0xe0ebb786f98a1502, 0x0ea4aa3ce70dc628, 0x8d36240617ebe037, 0x2d20c0e1d2002b5b, 0x336f8aa411a30282, 0x1d87c67d8178ec4c, 0xe468dff8ac26b63b, 0x266086bd7f11c9bc - , 0x05cfeedc80d829f8, 0x146902a029dd3355, 0x413db9327c068394, 0x55fa413791f64c38, 0xe06395c10021bf9d, 0x18d66268cf79ce45, 0x9e7ae6858dcc21bf, 0x3ad51dbe97b558f7, 0x06792c747aeef43c, 0x27ec9b782170abb7, 0x6aafca394a23e935, 0x18f7cbd98db64112 - , 0x34146ce6b36edbfa, 0x1dcfb4eab7ccea23, 0x68498e1f45b35467, 0x1b20d71a3b71d412, 0x7a875fc94e602e3e, 0x78c15fa449576c2b, 0xb52326d01ccafe8a, 0x3f53f57324d70666, 0x3830836e39bcebaa, 0x27a30c73dd02c884, 0x5dfed73dedf2306f, 0x75ee4a8b6cf54f74 - , 0x97ecc9c5851a8e3e, 0x496b581690c3df2d, 0xf7bba1fe2d169e7d, 0x4b06184810a77bd3, 0x40e6d643b903c7bd, 0x3c90f63b5176906d, 0x92f47e1ac51f1ec6, 0x70c2454c53cc0dcf, 0xb5a75d246c653b4e, 0x7e5173a420a8b0df, 0xcafb44c471d0f4a3, 0x69a3a4e92bbe5977 - , 0x26e93183cdfeb424, 0x1e0489b56fa7e130, 0x669befa672fe9979, 0x0f8aea6a7ef65bf9, 0xff0b883ea96b51ff, 0x31a668763c3c8867, 0x6887a0029701c9be, 0x545644cd70c87d63, 0x537b6fb7db9410e0, 0x6ca227f10229b3b9, 0xc7d1b4d71ff22468, 0x522058d3b20569f9 - , 0x5f4bfd813a51fb62, 0x105b94a3a42424a1, 0x96dfdb685825857b, 0x14d98588154500bf, 0xb4db83514c7a9404, 0x67aaf998856faf37, 0x1229d7e95dbc821c, 0x7e617a17a2f72bd3, 0xe964cdba7222695a, 0x677619cc40a07eaf, 0x7f82c099a8df7538, 0x2a219175ec95a1ad - , 0x755ac147b51ff3dc, 0x4a87f652f86823ec, 0x6d8d4a923f50278d, 0x4bb952ac98c0120a, 0x968c57a6a31e482c, 0x0855a11481fd5653, 0x3f05db6ac608d16d, 0x33f9e5746e1079c6, 0x1f3458e3ec51f53a, 0x4ae3fc836ceccf81, 0x3c0b2e2db5875ddf, 0x42336a1262cbb5e0 - , 0xe3651453cadc3868, 0x25081cfd6e80a2de, 0xd4cb31092872e53a, 0x16ca9349a11a9c37, 0xb1d3ae440d1cb675, 0x41b2d6ecbccbd6a4, 0x475e6a844c3d0ca1, 0x2cd0e0dedbf07023, 
0x85ad446ddb002a6e, 0x72a06e5419a64609, 0x9e779387e9a3276c, 0x414a8163a9408b10 - , 0x25c7b53c1791333e, 0x3ea57190b42cd838, 0xbf20b346b094f121, 0x47570cba99b06c9d, 0xe6bd01c8746cb5f2, 0x3c0b0b8c4c0968ef, 0xb22009690e243975, 0x251737e4a5643da2, 0x3cdd49123ab89dea, 0x68748cd1e3cc45a6, 0x563746685effea7b, 0x4e4c5b1c86eb3a29 - , 0xe1ba017516d32070, 0x5cdd35a0c4ba93a3, 0xdbc66a0c7de30288, 0x22107156a0f700f1, 0x0fb69045aac0f647, 0x111dcb9763d08bc0, 0x266db39f6d78cced, 0x02a32587c7033892, 0x76fc94ce6a2a4b19, 0x474db0f12fcfa96f, 0x0c44584c08377ac7, 0x5f435bf43140f4c0 - , 0xb9741c3014eef7a3, 0x54596c23b536ff04, 0xeadf56bb6ea39450, 0x32f24f6e1a656b10, 0x21422e4dd5f54e3f, 0x0d6ad57853660607, 0xf6f62ffdd0bf9928, 0x72569c930015caa7, 0xf4293579931b9216, 0x049d6a4057e6827e, 0x6223e20060be0e05, 0x20d91ae969dfa9a4 - , 0x02611b345456d47a, 0x601dd413d1bdea0f, 0xe6b017b26bbc9bf8, 0x63399ff3d6542359, 0xdbdfe225045a9764, 0x10acd93346649beb, 0xc652d5a50e0535ce, 0x49efbd5639c4caf1, 0x65a5dbd8a304de65, 0x08ddebed0e865be8, 0x5db8337d5e715261, 0x34cf4c75496807e2 - , 0xd840c7416e44b56a, 0x10fd30d282d8b151, 0x36ffe6df2c1c9568, 0x66d8a38b6d31a2b1, 0x01fad3aa61984774, 0x412a9fd87b303d90, 0x2720945ee0f0ec9e, 0x0c91b4c7ea84cf37, 0x98462f25fd5832f0, 0x6f4cd578c490d842, 0xecc7d24c31ed3342, 0x580ab96994515fd8 - , 0x6d8a97ed98465b3e, 0x16995dc010908ae3, 0x50626a4e555b774a, 0x082636e5a8a9b568, 0xa99435cc4823b413, 0x41fc423d10eff4e7, 0x114236dce6f9f9dd, 0x6c3995c4bbe0aadc, 0xf3f22c975935753d, 0x6b1b3f27edec2a78, 0xdbadaac32ccc292e, 0x3856036f8a3795aa - , 0x947154caaec01d73, 0x0a22e573e3f0f49b, 0xc50c949f39c184a3, 0x2aadd0868535d0c8, 0x22bc5bbe5f992446, 0x15d36adfca3ace90, 0x038010e37a6308f9, 0x161b06d8d7180307, 0xcfbf4e3abef8d056, 0x2a1765fe9c7696ba, 0x6a15d44ce18ef392, 0x5405239c0369de64 - , 0x5fabda1210f58e29, 0x40cbb03974b37035, 0xa29fdf2875322520, 0x3b32ace85edac547, 0x0f0c92b41d679df8, 0x7f07ecd47a7d2f0c, 0xb5fc65c05accc95a, 0x0e8b1da70636f221, 0xb2ebd131f4e8a846, 0x7df51e4aba57f391, 0xaa2f3d40fef689ed, 0x0ee1e115fde5d582 - , 0xf7d025b42e240ae6, 0x29fc1befeb526af2, 0x7c5ffcaff205e565, 0x4cf4d0d8840e2e1e, 0xb8b00d1810ad0ff6, 0x44d3af686ba915ff, 0x86a8fd1eeea8d08c, 0x3eb300adcf6edc4f, 0x8db03c266b588186, 0x289d0fd301e96881, 0xba83ba260cccc170, 0x26ee69546ceb0c77 - , 0x1109d8bf92c4ea05, 0x033aa036671937d1, 0x4bd9902e5a664a0b, 0x42bd48ed44fdbb71, 0x7359e19357a9622d, 0x0d6ee92855dae22f, 0xc24debb323643859, 0x4c60fee1e191766e, 0x3beaec0e99faa328, 0x056c2ae1709c5b0a, 0x7fe89e0c62710909, 0x7e3b5cd3ac4e6ce1 - , 0xe9d06486ac7370a4, 0x4b1a8c62e99f9429, 0xb11a50e20bc3197f, 0x75ec513c25dac300, 0xfb9fd064b1466dca, 0x290379cfce59308c, 0xca3ee3fb7db99943, 0x2af7a3e930faea44, 0x0d294e6d1505e35b, 0x7d534585181e001f, 0x90285700831d4cfe, 0x419f25105d06c90e - , 0x5f71e79f5f828172, 0x02921e2a43326798, 0xa0981553e84d4a6a, 0x220c82041938573d, 0xfd2b5b78ef20c927, 0x3c99a2dc611caddb, 0xfb1247fd99ed2828, 0x4b3a3739f724890c, 0x7775ea2d7d2d1017, 0x3ab07cb5ba8ac987, 0x82e5123a20a6b5c3, 0x44965098aa82161f - , 0x20948c77e9ac4c0c, 0x521e934ab214157d, 0xc8f4f4052dffedab, 0x1da963c2ef46f27f, 0x3be7631e212fa2e0, 0x0d188e88d1a4184e, 0xb4483ed385de4bae, 0x4ffadfde83d2b0d9, 0xacebd9a51a938608, 0x40968c0c9302b0e8, 0x85704404d06f3a5d, 0x3e9f477a61a26d37 - , 0x1da1efc7cbd18d12, 0x4fb87a47b9f2cb04, 0x7556a45e8b5c8caf, 0x7f6991b7723b35cc, 0x3fa10a169532635f, 0x15e61b1cd72bd52f, 0xe6b45dc3b4667c21, 0x45cf3bd4bbf39baf, 0x7343b0636a9d63f9, 0x457551c49ac49567, 0x331e611a3fcec018, 0x7d19e2584756b92d - , 0x78951df174059655, 0x0573cd896a793337, 0xb3e37121fd458870, 
0x3cc032b1a1bebc3c, 0x2571dd06d24d5a41, 0x017382ec4aa29ffa, 0x6cda850c15a224ed, 0x6af59bee2d7586d4, 0x287d3c4027f80ee9, 0x6aa570b9e51d4f25, 0xf29f327c5e0490d5, 0x00fb62f93f43edfb - , 0x7b06e602dc313277, 0x5d8dc98e723b039e, 0x5bb61813041a589a, 0x2a4c9f13eef7f1ec, 0x9439edcb4bbaba6f, 0x027f4d494e7784ad, 0x087ae2a2fd6bbc8d, 0x230f37ba41aec2ff, 0x63876e43daaac09c, 0x28abd7ae6e17dbe3, 0xd354d50cf000982a, 0x1dd774a1273aea75 - , 0x243658930d4b0902, 0x0df50723a2da63d7, 0x22bc07b9ac9628c5, 0x134123d68aa939cc, 0x4e84ee2cf0d450e2, 0x53a8c6dbd4aa9ed1, 0xd06e741c45610565, 0x608da7f96f2f7e19, 0x59b7fc9fe6a0243c, 0x0da36bb46fd1eb3d, 0x09a11de836914182, 0x3becc1cc0b96f1e4 - , 0x820b8a4cad71c17f, 0x2a425dd0204a843c, 0xf6f7fdaae1523c28, 0x5fb74c0c961e6fb1, 0x0c76e0f72b7845a2, 0x273db117946ce778, 0x7a22d35cdea5934f, 0x73aeeb1b24265d5d, 0x938a618552e4392d, 0x6050215beb6c1923, 0xf32f6ab781efbf2f, 0x2e4ece5c476e1354 - , 0xf2a4a59613812356, 0x555185da018933fd, 0x2fffbf95863bce54, 0x72644f9c3181e7a6, 0x98c6b1d509e3d624, 0x5bddd5730939d7d0, 0xdd197613d550fbad, 0x7671fafa1facb923, 0x13dbb61148c5b802, 0x616bc5c73ccdc3bd, 0x0b175b4c46fd8871, 0x498a1eeb000ab870 - , 0xa49f1ca2d7802521, 0x6906346cce00be5a, 0xf1bc33c727dd52b0, 0x5d005ff3122fd749, 0x51318ad5d7c622e7, 0x50f93d6d15e46e82, 0x88dfa2123ffff3b9, 0x3848e6fce3cac6e5, 0x6cefc31a33ea4f5e, 0x0cc5e7dc4e5e144f, 0xee2009402e59a7e2, 0x257679fdb86f4712 - , 0x4cf68953d8b17e83, 0x710f970c16ce2070, 0x4000b8e9e51e6aad, 0x5af48dacd01f24f6, 0x209679d5d3fcc916, 0x0a3538dd7cbe8232, 0x2d6d7aba44d990d2, 0x46c718f2d4b2c1a6, 0x9953d799a378233c, 0x4f4e80f4a682e7a0, 0x9912f04acbb77eee, 0x317432079a195b2d - , 0xaccccda6a1c11e3b, 0x3fd895817d0f3be2, 0x016db17673f750ea, 0x635fc619a24009b6, 0xb8447ab3370da1e7, 0x6c893aa19abf4221, 0x5f35ac703d8508d0, 0x13533d324d4adcb5, 0x84610370dece8512, 0x2223f126f9a70f4b, 0x18f00d60f3bf6a04, 0x174bd78b20ef8543 - , 0xeb179bc6a1698189, 0x732bf44a62015302, 0x98352342bc0e4bc6, 0x053f6640c1549e85, 0x65eee8b0397c7ce8, 0x790451f39f2fa27b, 0x36ffa0cb286cdb97, 0x46d07cec4c967bf2, 0x7c849ace30868412, 0x6dee239d339ef499, 0x8ab78548f273e57f, 0x01c5bebd8b7f5ef0 - , 0xe440e5f042eae93b, 0x65583f57fe057db6, 0xe6d5d26c24a565c9, 0x6b3b87a0a6ad702f, 0xd3f5d533117b8e64, 0x4addb9d0da92df89, 0xf1bd51990e0f9bfa, 0x30c624ec1dbcd0a4, 0xafaf2f00da7023a0, 0x3086e132b54574e4, 0x93bdbd4bfd3dd8c7, 0x690976ee132c892e - , 0x86fc11c79524d198, 0x0f6b95662e02c734, 0x5b78bb385564f568, 0x55c9b3f55d7cd16b, 0xdf1316434ad1c07f, 0x093d67d3fdf312de, 0xa1fd2257ea57b3d6, 0x4b5b18abe4b54439, 0x66c28f5b59d796b2, 0x7baffe6e642fdea4, 0xb9d3753265e68ae4, 0x40903bd6dfb02d6f - , 0x357958d4d72d6bc8, 0x179330dea4659dd3, 0x5a9ca85bc8721aef, 0x0209f09e03c9b225, 0xc0bf2e9738933495, 0x5e0dde4d715e50c5, 0x2743c96b66a6b951, 0x6af96188a0d6d358, 0xb2f3c72820f2a709, 0x5e9b8fd43327d9a0, 0xf0b13f5324012177, 0x7abdeaf4f741bace - , 0x6f006249351471f7, 0x3204eb91cfe9ed6c, 0xe09af1c83c13afa2, 0x6d70ed88d5de535b, 0x2078873d1a2faa1f, 0x5c73bedb8d96f3da, 0x41bbb407a3a1ce1d, 0x7a40ec2fb54eea85, 0xd6d569cb9dd722e3, 0x10acf67805927b6a, 0x27c61d818cc0ea05, 0x57b175c9f59904e2 - , 0x4f7b40bc92b5a60d, 0x51431f647b46b89a, 0xcd84dd55cc2a720e, 0x6b36059700809a1c, 0x78e3e5dd060e9a0f, 0x630c0c1a146c77d4, 0xc9925b0dea8fee2b, 0x4728f0604b16a06d, 0xb4601050635b2318, 0x2484f7281864709b, 0xbe2ed2a2523211db, 0x6425d4ff23dd3a5b - , 0xf0868c09017aef5e, 0x2733d1e1adc6d5ee, 0xa631db49f17f87e9, 0x36d753ced54d5727, 0x451d17fb6c4af537, 0x1dcc4d611dd55b04, 0x0bb8de0c8d3e549b, 0x2fb2ca1271592c3d, 0xd877914ffbc31ced, 0x190809a196504d10, 
0x44bdd65a970277e3, 0x13195c678b4b01fa - , 0xe69a41a54f84d41f, 0x61c7c870565e4508, 0xeca2d2fc6f0e1c9b, 0x7f065480e257152a, 0xfaaa9f7c3a8873b0, 0x43fcdb8db58a324a, 0x969a79026e9da7a2, 0x4eab135af328b9d9, 0xb38aaafe87f85f7c, 0x69eba4fe1a6b6f32, 0x5607f6c6b4d27cbc, 0x273072bea774f9e7 - , 0x3c1149e3c8d51db0, 0x161f8cd433c28bfa, 0x765a61f218fe70da, 0x442b5d405f2036bb, 0x96f790271c564cc1, 0x3d5dbb33505cc956, 0x621a38b446af395c, 0x2da978b45bb70ce6, 0x755aca711da49388, 0x46f2e33e55e86df8, 0xfc5b454d5cb7be24, 0x67df47d68d8f6d12 - , 0x7a1e224893898aad, 0x0400219c89c2d13e, 0x6c969e4d63d460d9, 0x4df64d5df8b60ad2, 0x1feed05a45ff89ed, 0x290c4b59e684b4ef, 0x97ffbc3df096adb6, 0x4ac6037e76561c96, 0x1bc40299115e51b1, 0x7169e0a1d96aa1be, 0x43f55f8b6bac596c, 0x1cc6a0603081a178 - , 0x8e1d2db69bc925d0, 0x6ffb86eed51d2931, 0x3ad1eb242e0af1b5, 0x338198152fcd6d7c, 0xc1f381496df13943, 0x05d9242fe1c60b02, 0x39617510de7eec81, 0x24d8ba5ac76b12b8, 0x280eb2db9e548483, 0x6c51317b3a8a93f0, 0xb2a9f90939bd1235, 0x2da9de86c39f9aa6 - , 0x7f54917103127b97, 0x7be2be5ad3276169, 0xc969d703d31e9da7, 0x0500df3bbb1f8a4e, 0xea05c77685795917, 0x049575a992d09345, 0xd567f8de2daabe35, 0x383fad35a8e035cb, 0xb9353eb2bbd43d56, 0x52b3953221860c5a, 0xf9e4bcd46dbec03e, 0x4b0db0b4a7b3279c - , 0x8cc5f6b6e1ff80c0, 0x1bd2ce464b552215, 0xd008eb25b39c4236, 0x3b4ce5bb2f42a9fc, 0xe1f249681d153d9d, 0x3e022cb14bc4c5b9, 0x8a11d021c8ed5a53, 0x560d3fb258bec495, 0xf4405852705a6012, 0x5c8bccd2b1b3efd3, 0xd93c0f63ba7ce0c3, 0x337798cb3e93dbba - , 0x7a9f68cf800c8e88, 0x579afe689f3ebcce, 0x7dd41d6cdfbdb4a9, 0x3802410c4e1b274e, 0x64241d770cf0db02, 0x2f7c8133c74bde23, 0xf3c3fd835ed1952e, 0x741b1d88a3cee37b, 0x74e1ae644683c68f, 0x0c80dd9e0f7a91e1, 0x3984d741f3e47c24, 0x4b3eb97b6a39d252 - , 0x32e9b9410da9a195, 0x11d09fdc04ec3b41, 0xf92fd5e53cddea30, 0x296e095589e0ce05, 0x4e3200c3a283b696, 0x7e33fbba44ecb32c, 0xed3c039790ad0033, 0x5c8ebb260b5ec084, 0xa667455bb79d2e9d, 0x12fbec9d4f5bb155, 0x3aa5f6bb4d0d8d49, 0x0ca652ed7065d80b - , 0xb7938753d51c6f83, 0x41644ac1a602f9f2, 0x84223d4d63c38f7d, 0x71057b4b8b931282, 0xd39fa015165f47b5, 0x7536c8a19c33c201, 0xbe713ca4166c2dad, 0x456c98c2b4198511, 0x4793f25e1cb44658, 0x1d002f1cfe1a1ba7, 0x9f9ed6e1e1a27957, 0x095dece028426bdb - , 0xe57d3412fc1001d6, 0x481c63a0d9b25e99, 0xc756b6ba0dc02aa5, 0x24af047d79ed4683, 0xe37ac10133b68275, 0x418b45e570802012, 0x87578def0c3900ce, 0x7c5661923b8c9740, 0x5f4ab0a6fdda7366, 0x0ac6100825e4eb3c, 0x308528e42c9e4d32, 0x436e5979933ddde8 - , 0x0cd6ebe123352222, 0x63d1768a46f33dc7, 0x96cc55dff38c9273, 0x474438da7140411d, 0xa184b89b81cf6402, 0x6bf820a3aa675050, 0x3bd4720417391f0e, 0x3f2b8f859a8e0cba, 0xed952561b125da29, 0x07eb1ac74165097d, 0xc3f70d0c7db0a9fd, 0x5ab896a489294a6c - , 0xd4b608975c20018d, 0x6243b039f25d0456, 0xf766e98fc24c7464, 0x20035c09d2291e42, 0xcc0e5b5eeb462524, 0x24bcba5505f90657, 0x43a98d98e4fa9bf6, 0x3b621ec4188264d4, 0x633472fe235c812c, 0x31a20844a3316d23, 0x47b80db7d7f5d0bd, 0x22d482f5663780f9 - , 0x4df227dc52142020, 0x25076d0624bf137e, 0xcb4a6ee30a657645, 0x0ce469dbb5ada433, 0xfdb06251f65b9c5b, 0x44f82274a8e8f538, 0x98fa4c81cdec4b97, 0x0ccd61d1abb61d0d, 0xb9dc371344c5ab54, 0x35dcd9ccf8e5f919, 0x67fc81f369ba5722, 0x121b5aa1af6024da - , 0xe0b1b16b0fb1f1fa, 0x4dc688d6d3b1805b, 0x05c187cf10e40104, 0x71af39c743daacd9, 0xe691e97f82acf4b3, 0x0c46305b9243bf5b, 0xb063af137fde616b, 0x4e26e72a1de067f6, 0x61fe66d01a221004, 0x172fe9240cea50b1, 0x4ff50d37b2effefc, 0x06be02ab0b89aa5d - , 0xdd4aab96717af213, 0x32322555b58a7ffc, 0x7812aa965889326d, 0x1bd608f60d6457a4, 0x2c7b6b44e999e141, 
0x113a86a87856a8a8, 0xd95469fc33814855, 0x4a18dc36f6bfd586, 0x0706b60bdb854fd3, 0x4dc356685650fa90, 0x24ef7cfce41f8dcc, 0x19049c3e632deae8 - , 0x5c9a4e28b7138a89, 0x0f0b7dbc1e5087e2, 0xebf49cdc66a362d2, 0x19e4b815e6576c85, 0x1896051ee3b6063d, 0x09ecc741852a68e4, 0x4009034def986795, 0x36b440ff39b4b5e8, 0x9bc2647ee28af1cb, 0x62613c9dd152b3a8, 0xc2018ae5dfae5f2d, 0x29ce5ef30009c855 - , 0x0b653558b21d2b1c, 0x45e2c505d1f74936, 0x48304373240553d3, 0x0528569885a82310, 0xa90d402e33924181, 0x5e610edc23cb9555, 0x28890ae7e007d28a, 0x7e5132b6b1ebae37, 0x0d5252eb7c94cb1b, 0x308ddaea1fdbb672, 0x99fac0b431730534, 0x77d54ed63b9325b9 - , 0x4d647bcb76c6ec3f, 0x0e968b22ec2cad86, 0x4b22b5ec30b08a35, 0x3b31df3b52326b5c, 0xbe84f638dac3105d, 0x7db085f133ecbed3, 0x7a8b694596f2cf2a, 0x67b2e6c15d16e0aa, 0x4808b20bf173011d, 0x25d5fbbfbe66f864, 0xf67f3f3cd9743987, 0x654250e89617ddf3 - , 0xf5a1a7e0ba0a88c0, 0x3616c781799ab50a, 0x2669c27a2d256902, 0x3a8ec380e12fd7dd, 0xa25361f44a418e30, 0x2942f3001d233645, 0x60f1d3b7535a4133, 0x14deaaa12e5c7bdf, 0x0089fbece10c8d6f, 0x4bf7c313757c803d, 0x65aa30bfbb70567d, 0x4fed47af409a3fb3 - , 0x07557dd875d3daf5, 0x36c49c2380e3c9bb, 0xa21f643d329ae02f, 0x6cf6f7474338bcb0, 0xb5df78136a0f3012, 0x031fb2df2e00e9d4, 0x4d86fccbe75e79cd, 0x23f890e082d03b7d, 0x5716a1ffb50a8262, 0x0199b50aa6cf3302, 0x6a1be351f86090d5, 0x36095efc13349364 - , 0xffe752be8ce46920, 0x65047a340b652f65, 0x320ee55fd03156a6, 0x5af6aa45278409f6, 0xa6caf283b1cf3850, 0x4e3a988f61072f96, 0x750f67926b18f680, 0x09fc3f2927d21a4a, 0x914893c2f2ce1169, 0x4d15b367121b3e75, 0x6cb12559723774f2, 0x3ee5b8c2a70e054a - , 0x7dd9b3518d84d2d7, 0x147d5a5a53f57a58, 0xe1bd0904ad842a05, 0x3a0f3b029c9a5845, 0x7153c03261410074, 0x4e203d6737058c17, 0xebecf5cb79f28af9, 0x574b889870c279f4, 0x326317b005f444a4, 0x7480da44b34f4b1e, 0x7c5f21cdc46275b2, 0x210494b9ee24e4e0 - , 0x3cbf6ca1f4aa4ead, 0x6bf3872ccbfed940, 0x19e8a84673a566ca, 0x61a80e16990401a2, 0xea2e029e7f9b3824, 0x5762298465f0ebd3, 0xf60e36d4969f9af0, 0x00b826180531c799, 0x17120ec95cf3c61d, 0x47196cd6de85c7d0, 0xb0d47cff46a5cba3, 0x29271400d7ede26b - , 0x835908353516b894, 0x4bc57f8c1eedec8e, 0x2ec5deede5c0db5f, 0x7b9fc48ac4a689fb, 0xf82ce6de88fc10e5, 0x6c5d84a70e03a3d6, 0x88a211fc4ea531f9, 0x7d5583e5918aa03e, 0xbdf2d70766fb8f39, 0x5926497e734ab18a, 0xd6a9872b800cacb4, 0x757c1cd521fd22d6 - , 0x22d50b0c13ec4bc0, 0x288a77d34a15e99a, 0x95c8e78fced3d4eb, 0x45ece109c15be169, 0x878ef262d0132128, 0x48110e9fd98939d6, 0xe3fc5425d2e7741e, 0x050ca6e71f599c65, 0xe02f97605d9fe375, 0x2af48b9bfee410e4, 0xfd34a1c107229a54, 0x43dc6f0cdcbd41fe - , 0x15b4eb7d65cc562b, 0x369a7b0dd3e91248, 0x2b087611edd32810, 0x116b234ddce09d7f, 0xcdb03cae8e90d2b0, 0x4017d51587566038, 0x081793739242b600, 0x5086e8e633cd52a1, 0xf5ddaee155cb8087, 0x773311b60d59a7e9, 0x36e5aa0acadf2068, 0x7126a4281b192882 - , 0x54a10df54f7ecef8, 0x3cd7d2fbb6e33f67, 0xac31eb6c3e740c25, 0x517db54840feb2de, 0xf17cb269b3ce27a2, 0x04a8fecd1dcc99e7, 0xfc887c1f2f85a2da, 0x280da7425bb55b01, 0xa1af72f5256a5a53, 0x71da839fc459f465, 0xc203fe7ba6587f71, 0x08a4201f77a4f335 - , 0x6cb9ea5683014d96, 0x5da17076b6b51ae2, 0xb55ac168c3e3997f, 0x41b9a32373d78f7a, 0x96f58033b8600a50, 0x6ebfba3ec9d956cc, 0x0ff8883707d66d0c, 0x2f562b035445226f, 0x2388fc015bd368c7, 0x2b7d802ce27f627e, 0x301f0369c24083a6, 0x77e139f6da8d5aaa - , 0x9f78574697fce43c, 0x02726c94565421b6, 0x1ad6007338e26585, 0x6134cc5eb35c02ff, 0x77ae739c9cdcd1e1, 0x04e96543233c7a13, 0x97d3926dcded2e10, 0x6bcdff7e14cebb73, 0x9c46ae2b32489774, 0x04a97b9a0473af8d, 0xb0350bd910d9784e, 0x448212d3e2164ad7 - , 
0xf3464e0351f5e995, 0x68ab4d24b3ade8d6, 0x86854d534002af20, 0x613f7ffe5de92aeb, 0xb385b4f4608a370a, 0x220dccecbc6f2688, 0xc31ec5384abd3680, 0x25a82841a2000fd8, 0xd19e422504694236, 0x0bc1124d541781f5, 0x0808651edcd99176, 0x41b81f223d429c76 - , 0x1a6dcb2662cc80c6, 0x0b101fb0ef0d1f74, 0x6f02aed8f8327119, 0x5b4c5176ccc4a340, 0x8fcefd200d6ee8ed, 0x0548127287f44749, 0xe1efeca1fadd1341, 0x0e74bc189dc9016c, 0xe90470353f46cb12, 0x69513d3455bc890c, 0x9503686f1f2497d1, 0x280a0bb7733f1086 - , 0x14e5f99930a91dea, 0x7840ad84b03c3878, 0x46e32c654fdbceb1, 0x7e88d2822bb2cecf, 0x4d78a8aed7f8661d, 0x70eb17416ef40180, 0x97b6f1733c474a10, 0x3d0d27fc4c7084ef, 0x730f60f6a1ee0d71, 0x7bf6e3885d3d9302, 0xa1e8af33742f1611, 0x73b798ec129822ed - , 0x0f669bb094642a70, 0x142927de789fc4a4, 0x0db18e01fa98cbd7, 0x6ae4d37674be1451, 0x7175e98f178b4b74, 0x40534e319bc52c6c, 0xb7211d252c4db879, 0x1a7651f8f3ed1aae, 0x9c9a43932d50cc97, 0x630b232b7201c359, 0x327d77575f5b3839, 0x5f0e19e78431864a - , 0xbfbb00b6530a3bb6, 0x19ba9d60d97f7857, 0x759779de744bd764, 0x5facbe63177791e1, 0xc74ea511c56a3b61, 0x1d8909e84083c31d, 0xcd20094b507af492, 0x2ef1b9c07c92ab37, 0x8430ed9ef8494fc9, 0x3f9170e6df5b1fa1, 0x1fb8dbc837175d73, 0x65b961b58008d022 - , 0x7e1afb6816864b6f, 0x54c4b92c534871e9, 0xc0a1dcd60d61ef84, 0x4390f0e992c41298, 0x1e54e2c8b7c27348, 0x7a987e01a2ec308c, 0xee42fbd90c4a89fc, 0x1ed8c77f8d7c609d, 0x569dedaca99a3346, 0x0eb471e609fef4ed, 0xc915522a3b9fd03c, 0x726453b246746bfb - , 0x4ed3cae53dc5fa4b, 0x1bf1e4b34b9feef6, 0x0850df9f0401fac3, 0x0a58d33cb2422e2f, 0x3d197f9603ecfc29, 0x45e46edba1cc432e, 0x96c0c93310d9bcaf, 0x18de3a458be2c33f, 0xc9e65e5bcc12a49a, 0x71a5345f0239b187, 0x53b3b2f01c5710b3, 0x438350f57ce2ec4a - , 0xdbbd368a760391db, 0x4033638dfec29fe2, 0x297ad75ed73117fd, 0x269c08d54b106e8c, 0xa4e3e4fd238b4218, 0x1f48a1cb09208aaa, 0x9575153115cf5fa7, 0x59feeff0876fb74a, 0xfdedb4af6f368710, 0x79be1fe79fa674d4, 0x689d6bbb4c707c39, 0x394a451499057bb1 - , 0x5887d4fb21fc43b3, 0x37628dfc4b5c23bf, 0xc66b76944b34bd13, 0x6e97f0a8a45bcb36, 0x3ac6b10139edbbdd, 0x313f4846b6745833, 0xf8758d9777cd9037, 0x02fdc98f02692537, 0x9e79f381fff833a5, 0x25ac5d68c49b105c, 0x1e9f48a076d8c9ee, 0x788c85c9fe9543b3 - , 0x776ea51db3b3b778, 0x0007c44055b64db2, 0x3c392c2a82fddd25, 0x65000203be8ee976, 0xea119666ab7c50ab, 0x528b2700e8f82d39, 0xc4aaf797118b8282, 0x55e5a7d5382e0d3a, 0x15a80b22e89f1039, 0x199f68594b1247a0, 0x8d5630750d622435, 0x2687f48cc6def5b2 - , 0xa16b0c0259eafaee, 0x7aeb10834e93595a, 0xe31bcf34ce679d9f, 0x4e2c19829eee3c87, 0xa46869cb8ca35c9d, 0x3cd35313c08504eb, 0xa088eca66e98389c, 0x44c562f0f7262740, 0xd3eb8a28f447523a, 0x43a0e059bfe37576, 0x0312c5d6d0f2e0ad, 0x5f30aaf0d1614c61 - , 0x6f09a7a6e182b0aa, 0x575db3d21a82296b, 0x6599bb5eee7925e6, 0x093f89458dcc2fe3, 0x70c4af785151fc84, 0x1230c0c519de5480, 0x0e66f8f93075a4f6, 0x5de4a122633a5c6d, 0xdb99cf83f9ece1b6, 0x1c3acd4a13ded617, 0x4dfe69e68f59c447, 0x482ba1f7715a3c16 - , 0xefeed2a7c81ea8fd, 0x4e089eeb713a572f, 0x78bc74acfbdf322b, 0x4b4951ce8eb86fbf, 0x0eafb6b46ac6714d, 0x72913ed109f7d404, 0xb498bf6fcde9e3a2, 0x3c08a283ef5ded62, 0x9af09f593a48b346, 0x7ed52441d00d4980, 0xa78e843ee5df44ac, 0x25db12d420a86151 - , 0xec840e7e89d049e0, 0x5a34cbe928bf96cc, 0xd875dc5525da882c, 0x2af4442fc256827d, 0x089fb428c2ef5a5d, 0x0b573ace080a3d9c, 0x6f57282554c240da, 0x425ceda6707b6bc9, 0x94b5a8c3dde824fb, 0x264f6f6a445b5da9, 0xadf292191c5c1eb7, 0x5e302e82fa4e5533 - , 0xf51712fc44237f35, 0x2b0af62c42e56e66, 0x10392cb4d9c71b75, 0x4d7e08fe8457a95b, 0x210b9eceb04534bf, 0x73329d1c7d88e1e5, 0x667a43fdb4ba79e9, 
0x3435ec04276ede87, 0x38b8540a1a78b098, 0x4f6c266e6793bb78, 0x447ea35172754041, 0x109d7b742d8c3dac - , 0xe3ccab45d2a4f6f7, 0x59040bb73f3bbd2a, 0x730b39d65645bab5, 0x5c61aed2f83382aa, 0xa992143de3cf83e1, 0x13455cb889b700f9, 0x54648228b310e2f7, 0x5b837752ee0f733a, 0x3923a6c0e5ea0dd9, 0x5ebebd01fc9ca9a2, 0xa34c205b8fd94258, 0x7d1a10029a0b6cd5 - , 0x6c83c02241a46527, 0x4127c85d6be1fc62, 0x26f86ff5ca7240b6, 0x2167391e7dd95cd9, 0x79227506ac78caef, 0x1a2cf919b8832a0f, 0x07745266405cf574, 0x38095a07f5713ae1, 0xe5eeab985ca3e7e7, 0x6a5dd9eeb734d639, 0x991027ebe44a4822, 0x311085fb4de9c1f0 - , 0x33f361e21066c3b5, 0x550091d2dfc8688f, 0x376345c5532bac13, 0x0aa0898f990931b5, 0xea2f3346e5d3226e, 0x208790ab78776afc, 0xac7c2ae63433850c, 0x3c5c373ada10ef52, 0x96c1b4003f4cde6a, 0x4546a9c475c09781, 0x6c961fd3e8536294, 0x43f36e63fc0d5066 - , 0x296601d8c42167f4, 0x241c1fc38565471b, 0xdb00a27e11ce9617, 0x60381181b7e7e4ee, 0xc1076b7635ac4d52, 0x0166010ffb8dda38, 0x5238f69becc43e0b, 0x63303a2015708b17, 0xe8badb2e5bb22591, 0x3a10a4e218b6131d, 0x236ab01aabf1a7b3, 0x1ce8a51a68a4126f - , 0x59e775e2a2a87928, 0x770b48eb4b738301, 0x0b43c2be176bf79b, 0x1957850fb6424660, 0x44455ee1ecb0ab2a, 0x620ceaa116eef4f0, 0x0198f62cb6183f6b, 0x3274f78eaf2d55db, 0xd2ba4e460cf7ed5f, 0x19cfc17bc0b66f43, 0xcbae6f45b1942722, 0x5d93e44739147b58 - , 0xd07180b9d28fc597, 0x35372b21b2ea5a46, 0xed2673477f083464, 0x7a9ebeeecc57e6c2, 0xb51d991a81a6b314, 0x35e7d90f4ed6de58, 0x45f21e209510dd05, 0x446ffd2715c8d380, 0xe69b5c7a9b6d3e76, 0x1379e79fb96912e6, 0xc161c848bd508738, 0x22264a049d8cfff6 - , 0x32321a68ff7ef7b3, 0x57b0e50cc585b333, 0x1c08c65ba9d764e7, 0x5534c793f92f00f5, 0x7a1ced97eafe6fe4, 0x6b8933739202599c, 0x618c5f8fcadd3ff2, 0x2a8719b3e6548653, 0x346a9ec5c4200f0c, 0x7a36b8d00d0eda58, 0x844b22b75021accd, 0x769737059fc5e465 - , 0xdb1ba69b5019f266, 0x1777242305db9ac1, 0x491d11ad264b6ff3, 0x136198dfc57a3053, 0x4a6cc64741eb7176, 0x14e811c97fc97650, 0x6b64667f71be386d, 0x3286fcadf019eb5e, 0x3f2591f4498e10a0, 0x674fa7c32df7867b, 0xbae8ec7ee100dcf2, 0x03b2c0a20a6372a4 - , 0x4c8d76b471e24474, 0x421fb6a7b8a3216b, 0xc672bdb2fe8f514d, 0x202af653d9aff3f5, 0x05e5f80f9626953e, 0x7b721fa3ccd42ffc, 0x99d8e481c0f70479, 0x054c31746d23362b, 0xfbef2e20430e8025, 0x60e1e3f02e7720c2, 0x161701874eb347e3, 0x363924e90cbb77a6 - , 0x180f5ee1863a1a6a, 0x2f79c0046ff79fe2, 0x44679866e35447f0, 0x1c64c6dd73e0d636, 0x1d8175566341469d, 0x5ba634965b8b9e87, 0x8f48744f976952a5, 0x744f28d23db94c8a, 0xd15e84b1f232da34, 0x556f3d7aa38bee8c, 0x14693c56e866ef89, 0x1564fb9a0f81eb03 - , 0xe97eed56fa2b483f, 0x6d3f7e01aebd1957, 0xae8f128aca3b3e45, 0x3d41e85ba2afd3a9, 0xe4fe485e4b6d8328, 0x65c49b4c3e98098e, 0xe96a00e054d6e91a, 0x394a2122738cd006, 0x715cca3dffd90785, 0x7bc3dcde15890965, 0x6dcdc47a33a148ac, 0x435db9d6dbe1bd55 - , 0xd74d4d6e0fd89c27, 0x25e727f6a5380553, 0xbe54127ba6c5189a, 0x65c87d3c3e61939c, 0xc34a6d122a809e2e, 0x7de6b787f097eafa, 0xb8f8b6e701758661, 0x10705fbf97042046, 0x1591614e6da2d44f, 0x7c74f26ec6eb070f, 0x9ad98c1a50249c60, 0x6e1bbd44d64b2302 - , 0x937cee76047790f9, 0x5b4ccbc70beaf690, 0x332e79ae75ae0dae, 0x2e6394161d093556, 0x4b378bf68f6849f0, 0x6c419fa0cebba72d, 0x8bb431e1e273f2a4, 0x357cec80bbe024fd, 0x83a6e913962f11a9, 0x7808df02e2523718, 0xb6690b5dabc49e13, 0x6cef23259375972a - , 0xd18ac767b5e551fc, 0x5a0ba1dddb15bd36, 0x6f7923de219e3e1f, 0x3ec23588db9b5cfe, 0xa4fc23d42c83bbe0, 0x21581a00768658cd, 0xa295b6e57110218e, 0x3e7bbab1d15f477f, 0x2266c03d3f0d0635, 0x4174f08a95be03b5, 0xaa1a674abb8cbeb8, 0x6bdf6ba553ae3390 - , 0x8a31f824638545e2, 0x2a9e37a0f0eede53, 
0x148a53d8cba69f65, 0x64c587e816d96316, 0x777a028a47e97e93, 0x13728e46befb2e0e, 0x13138b44862fa665, 0x0fca8c38a87775f6, 0xcc44bd580dd067fa, 0x40f2f7642e22d02e, 0xab3ba6db80c2f728, 0x5068aa2e2d25b7f9 - , 0x5a8a842c0a2923ff, 0x67c39e8a1006c196, 0x8f5cb9ff55460a84, 0x2e735c20a419a518, 0x0c6ee3fcbfdc2da4, 0x5bf6ed60a87b92bd, 0x5e4ce130e8e1608f, 0x0932ceb3e50028e8, 0x793cf8a0538cbfb8, 0x4e89e2c018beb7bd, 0xcaaa79642f5060de, 0x542a38a4d13f0016 - , 0xa1b0fd9aac663e55, 0x5158bf1f7b33c0e4, 0x060e82f65a4119fe, 0x32347069a1529fc4, 0x5c96ef69127480d5, 0x409a902134df6ffe, 0xdbe8c392eb6c7013, 0x73f2c48b0e3b4a79, 0xddf5060b937e2dff, 0x1534f901278611d9, 0xf47fe29ae4fd49a7, 0x7a2c0bfe75539f29 - , 0x19e04d1b2b0fe7fb, 0x56381ebd8181b50e, 0x5c8970c249df4ac3, 0x08acaece8ede7685, 0xc44f1a71aca0d20b, 0x623edc8d92e4ac3a, 0x5496a7e5885a0c95, 0x20a9ba37315b116e, 0x3765873809f5b55d, 0x23c44c42ebef2ff5, 0x56a96d921f724573, 0x3217815b72b8a9eb - , 0x2cc1b42f5350a489, 0x31f0b36e85b8c70b, 0x504a5c8c4d2ce34d, 0x1af8ea26b3786eac, 0x69bc5e26d7afd62f, 0x21e399d04247bf9a, 0x6e6d6676a88efb27, 0x476212b9fe9a6fd4, 0x0740fb65284168de, 0x5f7570be65e69408, 0x0166c3279dd81c29, 0x6565489007c4ed6d - , 0xbafb5bd37b5219c9, 0x00251709f2e210f7, 0x0d22639b51c1198b, 0x0f3c0df3be3de811, 0x3552612be3374eef, 0x0834744318ffa0aa, 0xcb9f1c1e3557a00c, 0x20c359f5de8b6614, 0xd319482a34d05268, 0x42165771b46b75d7, 0xca336c22e8d911a6, 0x4d072f70067a47e1 - , 0x9022c6f101555e9e, 0x4c8c7eaf7cc2d697, 0x629810b2d8044817, 0x25110bc01b06c9c1, 0x1bf9c06bf39eaff7, 0x6cc36f151f52b4e8, 0x76b73a6a14b62068, 0x47dcb0dc89db3821, 0xfe9dfeac2f670f41, 0x625b5c93b973c417, 0x5f8c917930133c1a, 0x6bd35f3e0992bb2b - , 0x03b5391a85409e5e, 0x7981d8fd16362767, 0xdb45c80a32a23cb6, 0x67356a7ef48b2dc3, 0x6189236e9f01adaf, 0x07a1e954e5032bd6, 0x53d627199c69727e, 0x25d67e4163cec014, 0x18e7bb6a63a80738, 0x3112be4cb5dcbc74, 0xad9ad6d381643f04, 0x116112cbeabb734d - , 0x32623abe2d66ff07, 0x4d780300822436de, 0x9bed066c04497808, 0x40db29b39ce86700, 0x6e5e5eb3805602a5, 0x52f227f2b1b9b40d, 0x51c2c4c197a18394, 0x6d8bca423ee270bc, 0xd6e60cfe8fb07f72, 0x7dd66c3970f940c6, 0x66aea7b59a0b17cc, 0x75fcf8b00160d729 - , 0xbedc5ea39b2402b5, 0x0dc3600425feedd5, 0xadc1ddf2cb1b6631, 0x205ee93e3aae976a, 0x7a2cb4e333c98498, 0x7d12eb776d56872c, 0x8e339bc1b41599fe, 0x4600f0a53fac9427, 0x1049d3a372f14304, 0x7b54e020b22db742, 0xd567962272a35739, 0x27a1178b1115f0c4 - , 0x6cfb39d619c35e1b, 0x5cb96fd1a9d9d486, 0xaf45cef7fb4fffea, 0x4a73d7b2ba9321d1, 0x44b46b4a80be86ac, 0x2769b50579e8f734, 0xab5d109e7472f372, 0x2bccfba1cbe995b6, 0xc00026115332f6a3, 0x7acb287da1561c53, 0x21555c608cd90dd9, 0x7731d1b2878dae13 - , 0x32122bf5ec1a0649, 0x592b5fa180ec8467, 0x876be1b5ad9ce66f, 0x484c1cc5bb34819d, 0x08e4cc425b30b06c, 0x2766065f0e4d22ce, 0xd90825644987aeff, 0x3a835fcc7fc456a6, 0xf4d801d2cc806d69, 0x41d767ecca55f839, 0xf2dea9fd01f1e74f, 0x74d01b97462211cb - , 0xe43e280ad29f80cc, 0x5cdf66a69029b231, 0xe8d655a03c862cd9, 0x388e38b58d0e8c79, 0x5d9aaa4848ff83a2, 0x14d6fbee4d6cbe74, 0x0426dcda912109ea, 0x1bb7b9cd75d4b541, 0x3a3c0504b39b8505, 0x35a3c5882b31367a, 0x678793d635a6473a, 0x66abca7e20202034 - , 0x4a90ff1dad300021, 0x18f29036544d2684, 0x2036d39b8f69095d, 0x36490f5645d18cc8, 0x9414d7368ad3562e, 0x7f8108a04558487e, 0x93db0e56d653e40b, 0x03f413ea960537bb, 0x984717b77f7267ef, 0x6c5d9da4a5ee7305, 0x725318dc36060a49, 0x274397f8e79a239e - , 0xbda7965b4095bab0, 0x6292b2505c7866e3, 0x451fb6a0672d6733, 0x37c560f40242a859, 0x151e56eb818f1423, 0x63451986f0c22ee1, 0x9275ff873a5c75e1, 0x178cdc734a32b96a, 0xff7adbb24244aacc, 
0x76518aa0dfd96ddc, 0x161c1c8c81071219, 0x0584d44c10a3e6dc - , 0x2727282a09e9acab, 0x1298e49c34514ebd, 0x0323d059ca1c0e6d, 0x6072c8b87dd26bc6, 0x36eca2ab28d36f26, 0x2a977cb5aae4ea2a, 0xf157d43a0b9546a7, 0x04d60af0ed661d29, 0x34bc1080126e4402, 0x7677ef9a21589171, 0xbd13797278f07a40, 0x32c0daf0b57f20ac - , 0xbc83fd1b8366dc2e, 0x6cd07286c4e670ec, 0xf35485a3f339dc8a, 0x6e7e9285f2247e8b, 0xa9d19d3a09943bae, 0x43fa5197eed852a6, 0xf911398a043242fe, 0x4a100dcb1312cbe9, 0xbe2fd86be910a692, 0x614fd829368d7937, 0xdb5a98b1a92d578f, 0x46f1d23e1b0dca7e - , 0x8bf4c6725e813f36, 0x68bc89078129ce91, 0xff56503ae28f5c7f, 0x2b6e0f4e42178ce5, 0xa97cd947ec65895b, 0x7aa90b66280ff6c9, 0xebbaf32df158a0a0, 0x6a748d0ac02bb713, 0xdf79b5d619e83397, 0x16934947f6485b69, 0xe75185521ab32881, 0x20791e276a7460c9 - , 0xd25c403e22c70bc9, 0x0bf079518e66e1d3, 0x45dd5c971d3711de, 0x66bd2c6a30be232c, 0x607829e5b29e53ca, 0x30ed414e71dc08a2, 0x3fd38589ea0f1d39, 0x5a881a121f37fc5c, 0x27b9394368987a4f, 0x321fe45e13afae2d, 0xc6feb75080f33ea0, 0x02166d52f45eebbd - , 0x15026a1b0ccd2fc9, 0x1141be93d5bc3d6d, 0xfd20df606fc676c9, 0x4059e26b00ad78c4, 0x0709b409cec6b505, 0x68f020e8acf478e5, 0x875d77d1f5df0cfc, 0x66eb377735162ff1, 0x860482ab417a32ae, 0x21175f47da213935, 0xa07ff0cda099ecdb, 0x26ae5f177ae2b8e7 - , 0xa9a070ea5120eaf7, 0x2581feeba7383f81, 0x49e0f137f1fa2a7a, 0x7fe93c51cfd1ec62, 0x2d74dbdca7777f7e, 0x562da2ba74e823ff, 0x543b4f8609d77a2e, 0x3a0f65212f234ec8, 0xf842e3fea270ebc6, 0x4524322c6a289e11, 0x80815887aa6a8576, 0x46f49d53c3fe29a3 - , 0xbcc93cedfdb0d388, 0x4db312076ef0ad2b, 0x1f2cd56373654ad9, 0x4c6446970034d15f, 0x34d2cdbfd5d7130c, 0x6198950d03db2ae5, 0x736094b72faf1b1a, 0x1f6ca46a9f2588f7, 0xcba0b03d6259772a, 0x24e5a23d8d6be3a8, 0x7090e340c94f6d6f, 0x287ba27ee54e8466 - , 0x87320c8822d607f0, 0x44fd5802509df171, 0xf35c09860bf6ba4a, 0x6cf53130ef77cc0a, 0xaa81167a00b48ce4, 0x649f4c775b0d8b48, 0x59a25683ee98d33d, 0x651479007d1061a6, 0x155487411f6e16da, 0x411d036475404bf2, 0xc231f1344162458a, 0x4f36b7633f7dd368 - , 0xa98ddc0a4e7a89a4, 0x55d8a5da6eacd542, 0x5c3fb48b1001ed45, 0x5c7785ccafa702b9, 0xa64369fd216afb79, 0x1f405ef10e940669, 0x755f4831bc327b6f, 0x2bc1b67d71f1882d, 0x8eab15cfed7777d0, 0x517370d580d99326, 0x0811b75701c9db39, 0x234d84cb52f7b621 - , 0x970c4fbddddae49c, 0x3ba8d842475e41e1, 0xb0720f6ad75e7008, 0x275cd5c5184bf345, 0x5eb9833888d3796a, 0x1b3a42dfde11c2f3, 0x946548fe092b5f4d, 0x119917b50f263cc9, 0x622de955a20a3f82, 0x6a552ea3a60c7ff4, 0xc79230138150372a, 0x18083b9518de76a7 - , 0x55fb74dd7d3b5455, 0x523eea9a70ff8334, 0x5994a7335e356271, 0x3bb011f60430f1d2, 0x1ec434cba1d6ea7c, 0x69b632960feb5780, 0x46c50417541ebf07, 0x01470bfbf9d23830, 0xe9551f4c049bc5cc, 0x1c124638f35ee8ed, 0x09ca3a9141e83a38, 0x44daaf3e7411127b - , 0x0e54717b6c2fcd10, 0x518ab46b26d5914b, 0x528ac6c82341e833, 0x2247fa99d41f4672, 0xabe30c65c0f327a2, 0x3ac74e012b77e1b4, 0x35defd694c0e86b3, 0x7c382e10bfe60e4e, 0xf37e382996b8461c, 0x4d47481c53631e1a, 0xac8f167884f7b7b1, 0x5ae1bb6ab1a4c643 - , 0x63eb02590829df80, 0x623126862a793fa1, 0x6e1e242f1ce09807, 0x7bf96130aaecfd2b, 0xedc5e9ea10bff70a, 0x66b548233b94d26e, 0x70c70ee4594d30ab, 0x79b0006c8811353e, 0x4352792c91710c1f, 0x0c7bf15181a9f539, 0xfc995ee769e3779c, 0x44871c6cb9dcedcd - , 0x0d180bbf2c9a046b, 0x5445c598c45d0cd9, 0xdefb32386875fb94, 0x5b0d235355660f35, 0xbe1dea825b3a7973, 0x10658ec4e1bbe147, 0x48af5e87fad77504, 0x55f5d3c94a7dd694, 0xa9a3e7062cad6ba2, 0x36c0a7e3f9e0ea31, 0xc4bd65217010aebc, 0x1d031dfc8b9fb598 - , 0xe3621c104113889e, 0x774b77ee1e6a6477, 0x124c5b8a07785fd7, 0x5a6c0df18188cada, 
0xf4adcd545e72d7be, 0x38100fffb66ba966, 0x2100cbe35fe4a4d0, 0x4489be2df052c175, 0xa03a22403b26899f, 0x5ae4a0a0fec13928, 0x89dfbfb802795eaa, 0x34917e9c4ecf2532 - , 0x64b93674c60cbbb3, 0x25c098506334c71d, 0x8a723f66f1ee34e1, 0x3a960adf48f141e4, 0x659f386695e440bb, 0x577a0fbf6e8095e6, 0x8ef419b0f4b25496, 0x044176a30b9e465b, 0x7a98705df2013e6f, 0x77d0b2483aa95ce7, 0x309e917b978effd7, 0x08f1e55bfe942c7f - , 0xfc241629b8d613c8, 0x140f2e35cd68949f, 0x38899f6a3ee4f9fa, 0x7abc8ecdd300f3b5, 0xd3dad23505d23eaf, 0x75e73f09376b2c7c, 0x5644a663b60ec5c4, 0x511ade8afe1eaec9, 0xbb005fe4e1abca89, 0x2838de73b0ca1f6c, 0x800a6658b80d28c8, 0x48aaba61c91641ec - , 0x222759cab704d4e2, 0x106dd3c0ce85beca, 0xa1ce1ce341f69d03, 0x1651b210e8e4ee10, 0x47329a5e7133e136, 0x58c02f47dc9367b9, 0x09dcba56947b02af, 0x435c251178125b48, 0xd56979a3f0cd9315, 0x2f02b0a6422afddb, 0x23920f500731f32d, 0x0ab833238232cb5d - , 0xa7b3d1bfb0bb60db, 0x2342c2a03c6eaec2, 0xac5e6e5a14d5282e, 0x5b9a421ddc42a24b, 0x018506414543e056, 0x6d7c377c084954e6, 0x4f8bf71ed3db1ced, 0x5150dbc15ab10979, 0x00b50a1b373a7fbf, 0x140be5c3d3244705, 0x5005bfe96e5b7911, 0x77cea555bb133f3e - , 0x2ab1e1a9d7a973c6, 0x3897ac98314968d3, 0x9e0f74764b23c9c3, 0x2e5ecbbae41997cd, 0x43e2ea5648f12433, 0x3a515a0e4808e69c, 0x17d36c03c36bb343, 0x44cebd053481ce43, 0x89008656c21b0d76, 0x2f8513fcb9009be6, 0x2e223f90208a0e83, 0x3828c2d4efd36a73 - , 0xbf17d64f89a8527d, 0x59ebb42b9656151d, 0x7d7bc7245c7dc5ef, 0x191b682a0cb695ec, 0x8931172fad9f9add, 0x239b6cbbab2ebdcf, 0x76932f9ca7002dd1, 0x0c140548f858d8b5, 0x6c7adfddcf741ea5, 0x3b39c4b9e2e1a567, 0xc5135a25f87436fe, 0x690d8fecb7dd0ae0 - , 0xd782a618ecda10c2, 0x4f2a84b3134cf832, 0x35a81f71bbc955a4, 0x457f88ed64ae6398, 0xc27eb71c31479985, 0x4ae91808569aab32, 0xa5f2e9785a75eb11, 0x619cb199b837ed36, 0x0e7e5912b9484e40, 0x3b5831e87fdbcaf0, 0x49a2779c2d2b039d, 0x3d4b81e07f49061a - , 0xaa119b0fa222b55c, 0x265c1b11b42fd4e2, 0x6b4d28e519dd7637, 0x3d2da7900de5a4b2, 0x99b06586b5f21d63, 0x4ce62bd9e6a1ee18, 0xb671e753932f8c92, 0x390b7821d0987834, 0x1adf7c73c3f1fc2f, 0x78c636a8514a7af9, 0xaee3b35fe11e7533, 0x7fbd199278f6ffd7 - , 0x41aabbf4363d77de, 0x1b27fdf18b96bf6a, 0xda264a1dff9a981c, 0x36efc08530c0bf9a, 0x5bd8862a5d830854, 0x23d7c905e656e6cb, 0x4523324c5b64fdcf, 0x36627f376238665f, 0x564f53925c6d5ea4, 0x17c7cc86a1913022, 0xf90db52a543b009b, 0x15192dc91f8b994b - , 0x80bfa3c1a79ec6e2, 0x48fca8ea99772ecc, 0xfee6a3b98c0f1824, 0x46a8c75601b81e22, 0x2cb3c402a8895fcc, 0x1d1dff9c04305ce2, 0xc1aefe78e85971d7, 0x79c6a083ab5a80b2, 0x379c7bca5dbf2518, 0x2419358989b3ca02, 0xc9c42c9cfa5f470e, 0x4481c2ce91b14459 - , 0x6b04dea1ea26deca, 0x26ee3ceee0d0a101, 0xe36cc6bcd8fa4f26, 0x4d14709719764fbd, 0xe0572a706f1fef52, 0x0f75fb69a23f2ec1, 0x32ae4b04a864cf3b, 0x0b6373a91b944773, 0x1a8f2bc20bd088af, 0x586b0d5ace401747, 0xa0e6b094a3c51433, 0x1752a123c268c1c7 - , 0x643c2a93b5770ea1, 0x536cb9d1b71eeb43, 0x6bfb0525d0cc6b3f, 0x1f4dcfeec3adefc3, 0x28a0169dd0bf57f0, 0x1336c9aa20a35449, 0xbbcda068703ad7a1, 0x5e33478283c1e03d, 0xf1997733d18fdaf2, 0x789af507a17bb867, 0x79970c14d5695613, 0x79452342e845256f - , 0x6c12f9367a26a018, 0x11beda1c8f9cdfbe, 0x720e6ddf24b30929, 0x7706e91e3e544755, 0x4460381d3a6c9059, 0x7e01916c3678c424, 0x6024355a61d2bb07, 0x68bae01d79c869e2, 0xf21cbcff285df659, 0x02f7ee6aeb57c933, 0xce0f078c17266467, 0x039b4fdb5170a103 - , 0xd5de0fec61a4ae1b, 0x33d37a152a778695, 0xea64e40e6a10ded9, 0x1f1d394373bdb213, 0xf63598b6ef59fd14, 0x57922adc3ae52283, 0xe39a90e18b76f4a1, 0x27f3dbebd98a9dae, 0x18179dd9c03804b3, 0x511d72c1912e2d73, 0x88e1f6d24b2f3225, 
0x56009999cdc2997f - , 0xda6df977b7d82fe4, 0x76f746bba63da226, 0x0b5facfc3bf13bd7, 0x4a31eb04f66f0e18, 0x8ace73d5e7cfe28f, 0x19aa731bc30c20b1, 0xa91979fe73400317, 0x6795ce71a09c7c9f, 0x93d55501933700ba, 0x3850eaf08b1fd14d, 0x450c5abc89edca71, 0x1be5db848bdfa5ef - , 0x77667d3f4fcf082b, 0x673b6e6c4824bc45, 0x6f22c12a5fe0ed6d, 0x006ee6722b5dfed1, 0xb47a13c1468d0c62, 0x40564879a378e6e4, 0x0bc6b553a9d3ab58, 0x21761c79e44dfcfd, 0x66f36ed3eb1050fb, 0x2e67df1312dd01d3, 0x48744c4a68dbf2ad, 0x7844962b6d6e039c - , 0xe07b5675d378b65f, 0x336262aa3d2c1df0, 0x320a5667d78c2e2b, 0x4f668dd96dda5e2a, 0xe21556795c7b8470, 0x3061905b2ef82bb1, 0xaee53211472206b6, 0x1f87377fee0d7a39, 0xdac58c52a3b1a0c7, 0x6e3c4ce04f0d7ffd, 0xfdffec45d4a3990f, 0x4b5340f79e2ae2c2 - , 0x0537c8b7b3d1f332, 0x55292744ae35ee1a, 0x42336d0e6d057f1e, 0x5ac40e9e645cb3d7, 0x848f7b7f845e46c7, 0x74bda86736eff150, 0x891acf622baf4f35, 0x14bcef9cf39667bb, 0x9aa1354d9731b9b3, 0x27e855a19295e59f, 0x1a829a8e10662ed0, 0x3bbc43f9ec4437a7 - , 0x8bfa8b1cb1de5341, 0x3432778068d35549, 0xe3d807da41f25a48, 0x1bb6ee1ce2efe552, 0x08d9bded0bd3affc, 0x290f1c5299a917a8, 0xda8dfd79562f8939, 0x1bf7aae68686211f, 0x2ab6daf9bc860765, 0x7bef6e2f0eb58a0b, 0x8746faab7c439b94, 0x017ea87750bb8bda - , 0xf8dfeb22239c9a7c, 0x35cec0d2887b3a13, 0x68aa94ac601f1606, 0x7470553f8ba61767, 0x37894f91c9eac410, 0x55b22aeb8337f732, 0x53f8d90f29a2fe94, 0x0aec068aec69023a, 0x40506162ad6182ee, 0x6a327ff1ac1e5475, 0x968d7095492df3c8, 0x3f93f46195f67521 - , 0x4983bca28970d546, 0x2716b931296b53c2, 0xf42b013266b6f8b3, 0x76f29b084b6a369f, 0x8e28749222216249, 0x4f2fa1d3a6c1acfd, 0x0ee66697eab8f954, 0x37c33e28fec0cce5, 0x7d0419e2bafd1dd1, 0x01f04d4299b94daa, 0x5ec06abbc1e5c7e6, 0x3a24c66060ed72a9 - , 0x0db764e15f960f26, 0x1d5973d5d59f9c3a, 0xf3dc2608dc6d9149, 0x1d80e0461b72f518, 0x2264dccd49c8b09c, 0x1f03e7a246334d5e, 0x2d6e38871b1fc2ad, 0x418588ae4f284bd3, 0x3efb071bafe1afa2, 0x0799ba0c80bdd8dc, 0xa6b273222dcc4a76, 0x13859f08ac8a4b23 - , 0x0194acc2663c5acb, 0x459fa55bd0bbedf6, 0x1b055550f06f8cc1, 0x09e5fad46599ea75, 0x6b3916ef772958a3, 0x4aaaa5c18093a431, 0x8e1503e36610f594, 0x620ef55048a263b9, 0x5a28963c8cb8ecbc, 0x6aee46b1b740c15a, 0x67e39606f59cfea9, 0x13a579e3777ca8b1 - , 0x45ad92f61cbb8de3, 0x53068a1a42460eab, 0x9b163546de379578, 0x07bf38a7cecd4860, 0xf84c77031d282de1, 0x402aed6399f78ffc, 0xfb83dd20295f6d45, 0x3702e257340d2ecd, 0xb8db2d8b979b97c8, 0x617526d2a50b0c51, 0xd86f6278313017db, 0x2f35eedec55f9d92 - , 0xeecb69493517973b, 0x7a111a74e0baf09a, 0xb82c6da8ec39f63d, 0x4217076312833746, 0x5d36d11f3dda88d9, 0x7baebcb360f2a887, 0x9829b62d093d6cbb, 0x10f17a2f6edf28fd, 0xfe3efa4353f40626, 0x731ca3065c118e34, 0x6185678827960895, 0x07f906a4f4c6355c - , 0x361d9cd10e657142, 0x2b5f5d452dd861ce, 0xa3e01df05d04b69a, 0x533723bc4cfcc0db, 0x820384afa1bbccb5, 0x4e67e941595d8dfd, 0x0f8da50839e13646, 0x6887a0573a596968, 0xe93dd1df5ace7343, 0x0d4076f28ecf96c8, 0x0ba2f854988074c1, 0x5eb2a314a41a40b6 - , 0x49ff6d27a676b27e, 0x15f7ca40acd5114e, 0xc171f9a750d7da95, 0x3bedbe891f79eb5c, 0x5b643bceb83f74ff, 0x088b1af3aa331a4c, 0xde294c7e0a60c4a9, 0x0a0770fc8120b151, 0xf09b757a0c7c1937, 0x34b797c03efd9c88, 0x051e3edb2c28cc49, 0x66db34ec5ac5122c - , 0x95fde0d3d3dc8cbf, 0x797897c8121818cf, 0x1fd46d197710f89d, 0x533a505803f809c5, 0xb60f1c090c9fd211, 0x4a7c3479af5c9d82, 0x4bfc3ffa4c8cf5a5, 0x6949f4a61306821f, 0xd814c949c67abcdc, 0x419a5e33166863c4, 0x9de646f6bd0895e0, 0x497cc1449a54545a - , 0x69eb31247fe126f2, 0x323c83233967f477, 0x52e0db4d3d78127d, 0x42a0e188e7b9380c, 0x3a6b011c46e34e7e, 0x79f4168aa9a0b4aa, 
0x94270a25d708fa4d, 0x2bb28618cbc9cdc8, 0x741e46bb04606819, 0x02790c52fb2ce982, 0x6dbb92d0c6d0af10, 0x32aa96ae061e9412 - , 0x1376700c90d98eaa, 0x4d1dfe650c0a7136, 0xb397f8eef89aff20, 0x4836ac4a041bae37, 0xf37c1076a80a02b8, 0x0d063fa2467b3a37, 0x498f2617b56b7e7b, 0x65ef1194db859a5d, 0xd1fe25d5d28ffcb6, 0x228ee6f49459c083, 0x6b7e82b3b009b15b, 0x713b185ef1fccbfc - , 0x552468f1ff60c298, 0x2b7ba65d02519614, 0x8a86ad90ff0816c2, 0x7bf9249284bd02e5, 0x3008c56e474c2d10, 0x171473b77f804540, 0x15fb79d07bdea766, 0x66ac67c7b9b0951f, 0x34bca15bb6d2f652, 0x13c63dd2687d617b, 0xc515ae237715c19c, 0x0e543c6765fbfef2 - , 0x668c80faf156fb5e, 0x1e2e9e3b3d9962b8, 0x89ebaa264394e113, 0x322add21cf1659cf, 0xf9e6e26733619f8e, 0x723bfc8b792147f0, 0x79aef2837d7e092f, 0x1aa61c59290b5011, 0x9955ae576a499cd3, 0x2c3d6e6a5a1ce0da, 0xb864cfa199a8676b, 0x4961a21f1080285f - , 0x828e184adf9d997b, 0x0c84bda97e7ce725, 0xe6974677094cfcc5, 0x4ec8cd773946105b, 0xa48681bcc95fb5c6, 0x6ade87f8f7a5f269, 0x9b97628fdd39c03d, 0x3bde0ee1f19f1842, 0x4ef8c8fb117c0ca1, 0x769bf8f8d07de9bf, 0xc8f5f435b78a57e5, 0x79987aa861bbcf9c - , 0x7f6c557204b02022, 0x119bd819111c69d1, 0xf0c61ef00b3eb70b, 0x4317f0511bfb7b39, 0x36a2b944e84d608e, 0x1c1a3862da3369cb, 0x37dbf471085f1775, 0x3835751e107419ad, 0x04ab0c84bb07a3fe, 0x63758bfbc7df13a0, 0x15ffd20cb554f23e, 0x1ff11c442b1515b7 - , 0x171377f1bf937186, 0x615efe82b83538f8, 0x321e7cfae352a761, 0x7af02427d7241502, 0x86546e47f2cc559f, 0x65a1d8a017659d75, 0xc95d8aa5b8bfdac9, 0x01e887cb68990623, 0xf1f8ee8c466bcc3d, 0x40ce5e4f2ba3908f, 0xd2b81a3480c16b35, 0x51625d3eabf708cd - , 0x44d770a210105739, 0x7f1de74a022958a0, 0xfbe4c91bd1e8f732, 0x204fbacb13586460, 0x97d79097d62e3cf8, 0x541ad5591934b114, 0xfdfb47919c141909, 0x354926e5244fdecf, 0x6291b0a0e2e994b0, 0x2b9a9a69d3a6c3d1, 0x8189be54302371e7, 0x3645c65df1a881cd - , 0xdf0460f445e3877b, 0x7ea384dc52d0d26e, 0x0c2e5f768d46b6b0, 0x1f6e62daa7c5d4e6, 0xf8b026b33b2343ee, 0x2b7183c8767d372c, 0xbd45d1b6b6731517, 0x4ddb3d287c470d60, 0x1031dba40263ece2, 0x4e737fa0d659045f, 0x8cbc98d07d09b455, 0x34a35128a2bcb7f5 }; - - void mod1271(felm_t a) - { // Modular correction, a = a mod (2^127-1) - _subborrow_u64(_subborrow_u64(0, a[0], 0xFFFFFFFFFFFFFFFF, &a[0]), a[1], 0x7FFFFFFFFFFFFFFF, &a[1]); - unsigned long long mask = 0 - (a[1] >> 63); - _addcarry_u64(_addcarry_u64(0, a[0], mask, &a[0]), a[1], 0x7FFFFFFFFFFFFFFF & mask, &a[1]); - } - - void fpadd1271(felm_t a, felm_t b, felm_t c) - { // Field addition, c = a+b mod (2^127-1) - _addcarry_u64(_addcarry_u64(0, a[0], b[0], &c[0]), a[1], b[1], &c[1]); - _addcarry_u64(_addcarry_u64(0, c[0], c[1] >> 63, &c[0]), c[1] & 0x7FFFFFFFFFFFFFFF, 0, &c[1]); - } - - void fpsub1271(felm_t a, felm_t b, felm_t c) - { // Field subtraction, c = a-b mod (2^127-1) - _subborrow_u64(_subborrow_u64(0, a[0], b[0], &c[0]), a[1], b[1], &c[1]); - _subborrow_u64(_subborrow_u64(0, c[0], c[1] >> 63, &c[0]), c[1] & 0x7FFFFFFFFFFFFFFF, 0, &c[1]); - } - - void fpneg1271(felm_t a) - { // Field negation, a = -a mod (2^127-1) - a[0] = ~a[0]; - a[1] = 0x7FFFFFFFFFFFFFFF - a[1]; - } - - void fpmul1271(felm_t a, felm_t b, felm_t c) - { // Field multiplication, c = a*b mod (2^127-1) - unsigned long long tt1[2], tt2[2], tt3[2]; - - tt1[0] = _umul128(a[0], b[0], &tt3[0]); - tt2[0] = _umul128(a[0], b[1], &tt2[1]); - _addcarry_u64(_addcarry_u64(0, tt2[0], tt3[0], &tt2[0]), tt2[1], 0, &tt2[1]); - tt3[0] = _umul128(a[1], b[0], &tt3[1]); - _addcarry_u64(_addcarry_u64(0, tt2[0], tt3[0], &tt2[0]), tt2[1], tt3[1], &tt2[1]); - tt3[0] = _umul128(a[1], b[1], &tt3[1]); - tt3[1] = 
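
The removed mod1271 / fpadd1271 / fpsub1271 / fpneg1271 above are the GF(2^127 - 1) core that the Rust side has to reproduce without the MSVC _addcarry_u64 / _subborrow_u64 intrinsics. Below is a minimal sketch of the same lazy-reduction arithmetic, assuming a single-u128 representation kept in [0, p]; the Felm alias and the function names are illustrative only, not necessarily the API of crypto/src/fourq/ops.rs (which may well keep the two-limb [u64; 2] layout instead):

```rust
/// Field element of GF(2^127 - 1), lazily reduced: the value p itself may occur.
type Felm = u128;

const P1271: u128 = (1u128 << 127) - 1;

/// Full reduction, a mod (2^127 - 1): the analogue of the removed mod1271.
fn mod1271(a: Felm) -> Felm {
    let folded = (a & P1271) + (a >> 127); // 2^127 ≡ 1 (mod p), so fold the top bit back down
    if folded == P1271 { 0 } else { folded }
}

/// c = a + b mod (2^127 - 1), lazy reduction (fpadd1271).
fn fpadd1271(a: Felm, b: Felm) -> Felm {
    let s = a + b; // cannot overflow: both operands are at most p < 2^127
    (s & P1271) + (s >> 127)
}

/// c = a - b mod (2^127 - 1), lazy reduction (fpsub1271).
fn fpsub1271(a: Felm, b: Felm) -> Felm {
    fpadd1271(a, P1271 - b) // -b ≡ p - b while b <= p
}

/// -a mod (2^127 - 1) (fpneg1271); same value as the C code's ~a[0], 0x7FF..F - a[1] limb trick.
fn fpneg1271(a: Felm) -> Felm {
    P1271 - a
}
```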
__shiftleft128(tt3[0], tt3[1], 1); - _addcarry_u64(_addcarry_u64(0, __shiftright128(tt2[0], tt2[1], 63), tt3[0] << 1, &tt3[0]), tt2[1] >> 63, tt3[1], &tt3[1]); - _addcarry_u64(_addcarry_u64(0, tt1[0], tt3[0], &tt1[0]), tt2[0] & 0x7FFFFFFFFFFFFFFF, tt3[1], &tt1[1]); - _addcarry_u64(_addcarry_u64(0, tt1[0], tt1[1] >> 63, &c[0]), tt1[1] & 0x7FFFFFFFFFFFFFFF, 0, &c[1]); - } - - void fpsqr1271(felm_t a, felm_t c) - { // Field squaring, c = a^2 mod (2^127-1) - unsigned long long tt1[2], tt2[2], tt3[2]; - - tt1[0] = _umul128(a[0], a[0], &tt3[0]); - tt2[0] = _umul128(a[0], a[1], &tt2[1]); - _addcarry_u64(_addcarry_u64(0, tt2[0], tt3[0], &tt3[0]), tt2[1], 0, &tt3[1]); - _addcarry_u64(_addcarry_u64(0, tt2[0], tt3[0], &tt2[0]), tt2[1], tt3[1], &tt2[1]); - tt3[0] = _umul128(a[1], a[1], &tt3[1]); - tt3[1] = __shiftleft128(tt3[0], tt3[1], 1); - _addcarry_u64(_addcarry_u64(0, __shiftright128(tt2[0], tt2[1], 63), tt3[0] << 1, &tt3[0]), tt2[1] >> 63, tt3[1], &tt3[1]); - _addcarry_u64(_addcarry_u64(0, tt1[0], tt3[0], &tt1[0]), tt2[0] & 0x7FFFFFFFFFFFFFFF, tt3[1], &tt1[1]); - _addcarry_u64(_addcarry_u64(0, tt1[0], tt1[1] >> 63, &c[0]), tt1[1] & 0x7FFFFFFFFFFFFFFF, 0, &c[1]); - } - - void fpexp1251(felm_t a, felm_t af) - { // Exponentiation over GF(p), af = a^(125-1) - felm_t t1, t2, t3, t4, t5; - - fpsqr1271(a, t2); - fpmul1271(a, t2, t2); - fpsqr1271(t2, t3); - fpsqr1271(t3, t3); - fpmul1271(t2, t3, t3); - fpsqr1271(t3, t4); - fpsqr1271(t4, t4); - fpsqr1271(t4, t4); - fpsqr1271(t4, t4); - fpmul1271(t3, t4, t4); - fpsqr1271(t4, t5); - for (unsigned int i = 0; i < 7; i++) fpsqr1271(t5, t5); - fpmul1271(t4, t5, t5); - fpsqr1271(t5, t2); - for (unsigned int i = 0; i < 15; i++) fpsqr1271(t2, t2); - fpmul1271(t5, t2, t2); - fpsqr1271(t2, t1); - for (unsigned int i = 0; i < 31; i++) fpsqr1271(t1, t1); - fpmul1271(t2, t1, t1); - for (unsigned int i = 0; i < 32; i++) fpsqr1271(t1, t1); - fpmul1271(t1, t2, t1); - for (unsigned int i = 0; i < 16; i++) fpsqr1271(t1, t1); - fpmul1271(t5, t1, t1); - for (unsigned int i = 0; i < 8; i++) fpsqr1271(t1, t1); - fpmul1271(t4, t1, t1); - for (unsigned int i = 0; i < 4; i++) fpsqr1271(t1, t1); - fpmul1271(t3, t1, t1); - fpsqr1271(t1, t1); - fpmul1271(a, t1, af); - } - - void fp2div1271(f2elm_t a) - { // GF(p^2) division by two c = a/2 mod p - unsigned long long mask, temp[2]; - - mask = (0 - (1 & a[0][0])); - _addcarry_u64(_addcarry_u64(0, a[0][0], mask, &temp[0]), a[0][1], (mask >> 1), &temp[1]); - a[0][0] = __shiftright128(temp[0], temp[1], 1); - a[0][1] = (temp[1] >> 1); - - mask = (0 - (1 & a[1][0])); - _addcarry_u64(_addcarry_u64(0, a[1][0], mask, &temp[0]), a[1][1], (mask >> 1), &temp[1]); - a[1][0] = __shiftright128(temp[0], temp[1], 1); - a[1][1] = (temp[1] >> 1); - } - - void fp2neg1271(f2elm_t a) - { // GF(p^2) negation, a = -a in GF((2^127-1)^2) - fpneg1271(a[0]); - fpneg1271(a[1]); - } - - void fp2sqr1271(f2elm_t a, f2elm_t c) - { // GF(p^2) squaring, c = a^2 in GF((2^127-1)^2) - felm_t t1, t2, t3; - - fpadd1271(a[0], a[1], t1); // t1 = a0+a1 - fpsub1271(a[0], a[1], t2); // t2 = a0-a1 - fpmul1271(a[0], a[1], t3); // t3 = a0*a1 - fpmul1271(t1, t2, c[0]); // c0 = (a0+a1)(a0-a1) - fpadd1271(t3, t3, c[1]); // c1 = 2a0*a1 - } - - void fp2mul1271(f2elm_t a, f2elm_t b, f2elm_t c) - { // GF(p^2) multiplication, c = a*b in GF((2^127-1)^2) - felm_t t1, t2, t3, t4; - - fpmul1271(a[0], b[0], t1); // t1 = a0*b0 - fpmul1271(a[1], b[1], t2); // t2 = a1*b1 - fpadd1271(a[0], a[1], t3); // t3 = a0+a1 - fpadd1271(b[0], b[1], t4); // t4 = b0+b1 - fpsub1271(t1, t2, c[0]); // c[0] = a0*b0 
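
The multiplication and squaring just removed lean on _umul128, __shiftleft128 and __shiftright128, which have no portable equivalent; in Rust the same 127-bit reduction can be written with u128 partial products. A sketch continuing the Felm / P1271 definitions from the previous block (again illustrative names, not the port's actual code):

```rust
/// c = a * b mod (2^127 - 1): four 64x64 partial products, then fold using 2^128 ≡ 2 (mod p).
fn fpmul1271(a: Felm, b: Felm) -> Felm {
    let (a0, a1) = (a as u64 as u128, a >> 64);
    let (b0, b1) = (b as u64 as u128, b >> 64);

    // 256-bit product of the two 128-bit inputs, as (hi, lo) u128 halves
    let ll = a0 * b0;
    let lh = a0 * b1;
    let hl = a1 * b0;
    let hh = a1 * b1;
    let (mid, mid_c) = lh.overflowing_add(hl);
    let (lo, lo_c) = ll.overflowing_add((mid as u64 as u128) << 64);
    let hi = hh + (mid >> 64) + ((mid_c as u128) << 64) + lo_c as u128;

    // hi*2^128 + lo ≡ 2*hi + (lo >> 127) + (lo & p)  (mod p)
    let r = (lo & P1271) + ((hi << 1) | (lo >> 127));
    (r & P1271) + (r >> 127) // lazily reduced result in [0, p]
}

/// Field squaring; a dedicated routine can save one partial product, the sketch just reuses mul.
fn fpsqr1271(a: Felm) -> Felm {
    fpmul1271(a, a)
}
```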
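
fpexp1251 above is a fixed squaring/multiplication chain for a^(2^125 - 1) (the "a^(125-1)" in the removed comment is shorthand for that), and the removed eccnorm later turns it into a full Fermat inversion, a^(p-2) = (a^(2^125-1))^4 * a. The same chain expressed in Rust, reusing fpmul1271 / fpsqr1271 from the sketch above (fpinv1271 is an illustrative name, not taken from the source):

```rust
/// a^(2^125 - 1), mirroring the addition chain of the removed fpexp1251.
fn fpexp1251(a: Felm) -> Felm {
    fn sqr_n(mut x: Felm, n: u32) -> Felm {
        for _ in 0..n {
            x = fpsqr1271(x);
        }
        x
    }
    let t2 = fpmul1271(a, fpsqr1271(a));   // a^(2^2 - 1)
    let t3 = fpmul1271(t2, sqr_n(t2, 2));  // a^(2^4 - 1)
    let t4 = fpmul1271(t3, sqr_n(t3, 4));  // a^(2^8 - 1)
    let t5 = fpmul1271(t4, sqr_n(t4, 8));  // a^(2^16 - 1)
    let t2 = fpmul1271(t5, sqr_n(t5, 16)); // a^(2^32 - 1)
    let t1 = fpmul1271(t2, sqr_n(t2, 32)); // a^(2^64 - 1)
    let t1 = fpmul1271(t2, sqr_n(t1, 32)); // a^(2^96 - 1)
    let t1 = fpmul1271(t5, sqr_n(t1, 16)); // a^(2^112 - 1)
    let t1 = fpmul1271(t4, sqr_n(t1, 8));  // a^(2^120 - 1)
    let t1 = fpmul1271(t3, sqr_n(t1, 4));  // a^(2^124 - 1)
    fpmul1271(a, fpsqr1271(t1))            // a^(2^125 - 1)
}

/// a^(-1) = a^(p - 2) = (a^(2^125 - 1))^4 * a, the Fermat inversion pattern used by eccnorm.
fn fpinv1271(a: Felm) -> Felm {
    let t = fpexp1251(a);
    fpmul1271(a, fpsqr1271(fpsqr1271(t)))
}
```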
- a1*b1 - fpmul1271(t3, t4, t3); // t3 = (a0+a1)*(b0+b1) - fpsub1271(t3, t1, t3); // t3 = (a0+a1)*(b0+b1) - a0*b0 - fpsub1271(t3, t2, c[1]); // c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 - } - - void fp2add1271(f2elm_t a, f2elm_t b, f2elm_t c) - { // GF(p^2) addition, c = a+b in GF((2^127-1)^2) - fpadd1271(a[0], b[0], c[0]); - fpadd1271(a[1], b[1], c[1]); - } - - void fp2sub1271(f2elm_t a, f2elm_t b, f2elm_t c) - { // GF(p^2) subtraction, c = a-b in GF((2^127-1)^2) - fpsub1271(a[0], b[0], c[0]); - fpsub1271(a[1], b[1], c[1]); - } - - void fp2addsub1271(f2elm_t a, f2elm_t b, f2elm_t c) - { // GF(p^2) addition followed by subtraction, c = 2a-b in GF((2^127-1)^2) - - fp2add1271(a, a, a); - fp2sub1271(a, b, c); - } - - void table_lookup_fixed_base(point_precomp_t P, unsigned int digit, unsigned int sign) - { // Table lookup to extract a point represented as (x+y,y-x,2t) corresponding to extended twisted Edwards coordinates (X:Y:Z:T) with Z=1 - if (sign) - { - *((__m256i*)P->xy) = *((__m256i*)((point_precomp_t*)FIXED_BASE_TABLE)[digit]->yx); - *((__m256i*)P->yx) = *((__m256i*)((point_precomp_t*)FIXED_BASE_TABLE)[digit]->xy); - P->t2[0][0] = ~(((point_precomp_t*)FIXED_BASE_TABLE)[digit])->t2[0][0]; - P->t2[0][1] = 0x7FFFFFFFFFFFFFFF - (((point_precomp_t*)FIXED_BASE_TABLE)[digit])->t2[0][1]; - P->t2[1][0] = ~(((point_precomp_t*)FIXED_BASE_TABLE)[digit])->t2[1][0]; - P->t2[1][1] = 0x7FFFFFFFFFFFFFFF - (((point_precomp_t*)FIXED_BASE_TABLE)[digit])->t2[1][1]; - } - else - { - *((__m256i*)P->xy) = *((__m256i*)((point_precomp_t*)FIXED_BASE_TABLE)[digit]->xy); - *((__m256i*)P->yx) = *((__m256i*)((point_precomp_t*)FIXED_BASE_TABLE)[digit]->yx); - *((__m256i*)P->t2) = *((__m256i*)((point_precomp_t*)FIXED_BASE_TABLE)[digit]->t2); - } - } - - void multiply(const unsigned long long* a, const unsigned long long* b, unsigned long long* c) - { - unsigned long long u, v, uv; - - c[0] = _umul128(a[0], b[0], &u); - u = _addcarry_u64(0, _umul128(a[0], b[1], &uv), u, &c[1]) + uv; - u = _addcarry_u64(0, _umul128(a[0], b[2], &uv), u, &c[2]) + uv; - c[4] = _addcarry_u64(0, _umul128(a[0], b[3], &uv), u, &c[3]) + uv; - - u = _addcarry_u64(0, c[1], _umul128(a[1], b[0], &uv), &c[1]) + uv; - u = _addcarry_u64(0, _umul128(a[1], b[1], &uv), u, &v) + uv; - u = _addcarry_u64(_addcarry_u64(0, c[2], v, &c[2]), _umul128(a[1], b[2], &uv), u, &v) + uv; - c[5] = _addcarry_u64(_addcarry_u64(0, c[3], v, &c[3]), _umul128(a[1], b[3], &uv), u, &v) + uv + _addcarry_u64(0, c[4], v, &c[4]); - - u = _addcarry_u64(0, c[2], _umul128(a[2], b[0], &uv), &c[2]) + uv; - u = _addcarry_u64(0, _umul128(a[2], b[1], &uv), u, &v) + uv; - u = _addcarry_u64(_addcarry_u64(0, c[3], v, &c[3]), _umul128(a[2], b[2], &uv), u, &v) + uv; - c[6] = _addcarry_u64(_addcarry_u64(0, c[4], v, &c[4]), _umul128(a[2], b[3], &uv), u, &v) + uv + _addcarry_u64(0, c[5], v, &c[5]); - - u = _addcarry_u64(0, c[3], _umul128(a[3], b[0], &uv), &c[3]) + uv; - u = _addcarry_u64(0, _umul128(a[3], b[1], &uv), u, &v) + uv; - u = _addcarry_u64(_addcarry_u64(0, c[4], v, &c[4]), _umul128(a[3], b[2], &uv), u, &v) + uv; - c[7] = _addcarry_u64(_addcarry_u64(0, c[5], v, &c[5]), _umul128(a[3], b[3], &uv), u, &v) + uv + _addcarry_u64(0, c[6], v, &c[6]); - } - - void Montgomery_multiply_mod_order(const unsigned long long* ma, const unsigned long long* mb, unsigned long long* mc) - { // 256-bit Montgomery multiplication modulo the curve order, mc = ma*mb*r' mod order, where ma,mb,mc in [0, order-1] - // ma, mb and mc are assumed to be in Montgomery representation - // The Montgomery constant r' = 
-r^(-1) mod 2^(log_2(r)) is the global value "Montgomery_rprime", where r is the order - unsigned long long P[8], Q[4], temp[8]; - - if (mb[0] == 1 && !mb[1] && !mb[2] && !mb[3]) - { - *((__m256i*) & P[0]) = *((__m256i*)ma); - *((__m256i*) & P[4]) = ZERO; - } - else - { - multiply(ma, mb, P); // P = ma * mb - } - - unsigned long long u, v, uv; - Q[0] = _umul128(P[0], MONTGOMERY_SMALL_R_PRIME_0, &u); - u = _addcarry_u64(0, _umul128(P[0], MONTGOMERY_SMALL_R_PRIME_1, &uv), u, &Q[1]) + uv; - u = _addcarry_u64(0, _umul128(P[0], MONTGOMERY_SMALL_R_PRIME_2, &uv), u, &Q[2]) + uv; - _addcarry_u64(0, P[0] * MONTGOMERY_SMALL_R_PRIME_3, u, &Q[3]); - u = _addcarry_u64(0, Q[1], _umul128(P[1], MONTGOMERY_SMALL_R_PRIME_0, &uv), &Q[1]) + uv; - u = _addcarry_u64(0, _umul128(P[1], MONTGOMERY_SMALL_R_PRIME_1, &uv), u, &v) + uv; - _addcarry_u64(_addcarry_u64(0, Q[2], v, &Q[2]), P[1] * MONTGOMERY_SMALL_R_PRIME_2, u, &v); - _addcarry_u64(0, Q[3], v, &Q[3]); - u = _addcarry_u64(0, Q[2], _umul128(P[2], MONTGOMERY_SMALL_R_PRIME_0, &uv), &Q[2]) + uv; - _addcarry_u64(0, P[2] * MONTGOMERY_SMALL_R_PRIME_1, u, &v); - _addcarry_u64(0, Q[3], v, &Q[3]); - _addcarry_u64(0, Q[3], P[3] * MONTGOMERY_SMALL_R_PRIME_0, &Q[3]); - - multiply(Q, curve_order, temp); // temp = Q * r - - if (_addcarry_u64(_addcarry_u64(_addcarry_u64(_addcarry_u64(_addcarry_u64(_addcarry_u64(_addcarry_u64(_addcarry_u64(0, P[0], temp[0], &temp[0]), P[1], temp[1], &temp[1]), P[2], temp[2], &temp[2]), P[3], temp[3], &temp[3]), P[4], temp[4], &temp[4]), P[5], temp[5], &temp[5]), P[6], temp[6], &temp[6]), P[7], temp[7], &temp[7]) - - _subborrow_u64(_subborrow_u64(_subborrow_u64(_subborrow_u64(0, temp[4], CURVE_ORDER_0, &mc[0]), temp[5], CURVE_ORDER_1, &mc[1]), temp[6], CURVE_ORDER_2, &mc[2]), temp[7], CURVE_ORDER_3, &mc[3])) - { - _addcarry_u64(_addcarry_u64(_addcarry_u64(_addcarry_u64(0, mc[0], CURVE_ORDER_0, &mc[0]), mc[1], CURVE_ORDER_1, &mc[1]), mc[2], CURVE_ORDER_2, &mc[2]), mc[3], CURVE_ORDER_3, &mc[3]); - } - } - - void eccnorm(point_extproj_t P, point_t Q) - { // Normalize a projective point (X1:Y1:Z1), including full reduction - - // Z1 = Z1^-1 - f2elm_t t1; - fpsqr1271(P->z[0], t1[0]); - fpsqr1271(P->z[1], t1[1]); - fpadd1271(t1[0], t1[1], t1[0]); - fpexp1251(t1[0], t1[1]); - fpsqr1271(t1[1], t1[1]); - fpsqr1271(t1[1], t1[1]); - fpmul1271(t1[0], t1[1], t1[0]); - fpneg1271(P->z[1]); - fpmul1271(P->z[0], t1[0], P->z[0]); - fpmul1271(P->z[1], t1[0], P->z[1]); - - fp2mul1271(P->x, P->z, Q->x); // X1 = X1/Z1 - fp2mul1271(P->y, P->z, Q->y); // Y1 = Y1/Z1 - mod1271(Q->x[0]); - mod1271(Q->x[1]); - mod1271(Q->y[0]); - mod1271(Q->y[1]); - } - - void R1_to_R2(point_extproj_t P, point_extproj_precomp_t Q) - { // Conversion from representation (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT), where T = Ta*Tb - fp2add1271(P->ta, P->ta, Q->t2); // T = 2*Ta - fp2add1271(P->x, P->y, Q->xy); // QX = X+Y - fp2sub1271(P->y, P->x, Q->yx); // QY = Y-X - fp2mul1271(Q->t2, P->tb, Q->t2); // T = 2*T - fp2add1271(P->z, P->z, Q->z2); // QZ = 2*Z - fp2mul1271(Q->t2, (felm_t*)&PARAMETER_d, Q->t2); // QT = 2d*T - } - - void R1_to_R3(point_extproj_t P, point_extproj_precomp_t Q) - { // Conversion from representation (X,Y,Z,Ta,Tb) to (X+Y,Y-X,Z,T), where T = Ta*Tb - fp2add1271(P->x, P->y, Q->xy); // XQ = (X1+Y1) - fp2sub1271(P->y, P->x, Q->yx); // YQ = (Y1-X1) - fp2mul1271(P->ta, P->tb, Q->t2); // TQ = T1 - *((__m256i*) & Q->z2) = *((__m256i*) & P->z); // ZQ = Z1 - } - - void R2_to_R4(point_extproj_precomp_t P, point_extproj_t Q) - { // Conversion from representation (X+Y,Y-X,2Z,2dT) to 
(2X,2Y,2Z,2dT) - fp2sub1271(P->xy, P->yx, Q->x); // XQ = 2*X1 - fp2add1271(P->xy, P->yx, Q->y); // YQ = 2*Y1 - *((__m256i*) & Q->z) = *((__m256i*) & P->z2); // ZQ = 2*Z1 - } - - void eccdouble(point_extproj_t P) - { // Point doubling 2P - f2elm_t t1, t2; - - fp2sqr1271(P->x, t1); // t1 = X1^2 - fp2sqr1271(P->y, t2); // t2 = Y1^2 - fp2add1271(P->x, P->y, P->x); // t3 = X1+Y1 - fp2add1271(t1, t2, P->tb); // Tbfinal = X1^2+Y1^2 - fp2sub1271(t2, t1, t1); // t1 = Y1^2-X1^2 - fp2sqr1271(P->x, P->ta); // Ta = (X1+Y1)^2 - fp2sqr1271(P->z, t2); // t2 = Z1^2 - fp2sub1271(P->ta, P->tb, P->ta); // Tafinal = 2X1*Y1 = (X1+Y1)^2-(X1^2+Y1^2) - fp2addsub1271(t2, t1, t2); // t2 = 2Z1^2-(Y1^2-X1^2) - fp2mul1271(t1, P->tb, P->y); // Yfinal = (X1^2+Y1^2)(Y1^2-X1^2) - fp2mul1271(t2, P->ta, P->x); // Xfinal = 2X1*Y1*[2Z1^2-(Y1^2-X1^2)] - fp2mul1271(t1, t2, P->z); // Zfinal = (Y1^2-X1^2)[2Z1^2-(Y1^2-X1^2)] - } - - void eccadd_core(point_extproj_precomp_t P, point_extproj_precomp_t Q, point_extproj_t R) - { // Basic point addition R = P+Q or R = P+P - f2elm_t t1, t2; - - fp2mul1271(P->t2, Q->t2, R->z); // Z = 2dT1*T2 - fp2mul1271(P->z2, Q->z2, t1); // t1 = 2Z1*Z2 - fp2mul1271(P->xy, Q->xy, R->x); // X = (X1+Y1)(X2+Y2) - fp2mul1271(P->yx, Q->yx, R->y); // Y = (Y1-X1)(Y2-X2) - fp2sub1271(t1, R->z, t2); // t2 = theta - fp2add1271(t1, R->z, t1); // t1 = alpha - fp2sub1271(R->x, R->y, R->tb); // Tbfinal = beta - fp2add1271(R->x, R->y, R->ta); // Tafinal = omega - fp2mul1271(R->tb, t2, R->x); // Xfinal = beta*theta - fp2mul1271(t1, t2, R->z); // Zfinal = theta*alpha - fp2mul1271(R->ta, t1, R->y); // Yfinal = alpha*omega - } - - void eccadd(point_extproj_precomp_t Q, point_extproj_t P) - { // Complete point addition P = P+Q or P = P+P - point_extproj_precomp_t R; - - R1_to_R3(P, R); // R = (X1+Y1,Y1-Z1,Z1,T1) - eccadd_core(Q, R, P); // P = (X2+Y2,Y2-X2,2Z2,2dT2) + (X1+Y1,Y1-Z1,Z1,T1) - } - - void point_setup(point_t P, point_extproj_t Q) - { // Point conversion to representation (X,Y,Z,Ta,Tb) - *((__m256i*) & Q->x) = *((__m256i*) & P->x); - *((__m256i*) & Q->y) = *((__m256i*) & P->y); - *((__m256i*) & Q->ta) = *((__m256i*) & Q->x); // Ta = X1 - *((__m256i*) & Q->tb) = *((__m256i*) & Q->y); // Tb = Y1 - Q->z[0][0] = 1; Q->z[0][1] = 0; Q->z[1][0] = 0; Q->z[1][1] = 0; // Z1 = 1 - } - - bool ecc_point_validate(point_extproj_t P) - { // Point validation: check if point lies on the curve - f2elm_t t1, t2, t3; - - fp2sqr1271(P->y, t1); - fp2sqr1271(P->x, t2); - fp2sub1271(t1, t2, t3); // -x^2 + y^2 - fp2mul1271(t1, t2, t1); // x^2*y^2 - fp2mul1271(t1, (felm_t*)&PARAMETER_d, t2); // dx^2*y^2 - t1[0][0] = 1; t1[0][1] = 0; t1[1][0] = 0; t1[1][1] = 0; // t1 = 1 - fp2add1271(t2, t1, t2); // 1 + dx^2*y^2 - fp2sub1271(t3, t2, t1); // -x^2 + y^2 - 1 - dx^2*y^2 - - return ((!(t1[0][0] | t1[0][1]) || !((t1[0][0] + 1) | (t1[0][1] + 1))) - && (!(t1[1][0] | t1[1][1]) || !((t1[1][0] + 1) | (t1[1][1] + 1)))); - } - - void eccmadd(point_precomp_t Q, point_extproj_t P) - { // Mixed point addition P = P+Q or P = P+P - f2elm_t t1, t2; - - fp2mul1271(P->ta, P->tb, P->ta); // Ta = T1 - fp2add1271(P->z, P->z, t1); // t1 = 2Z1 - fp2mul1271(P->ta, Q->t2, P->ta); // Ta = 2dT1*t2 - fp2add1271(P->x, P->y, P->z); // Z = (X1+Y1) - fp2sub1271(P->y, P->x, P->tb); // Tb = (Y1-X1) - fp2sub1271(t1, P->ta, t2); // t2 = theta - fp2add1271(t1, P->ta, t1); // t1 = alpha - fp2mul1271(Q->xy, P->z, P->ta); // Ta = (X1+Y1)(x2+y2) - fp2mul1271(Q->yx, P->tb, P->x); // X = (Y1-X1)(y2-x2) - fp2mul1271(t1, t2, P->z); // Zfinal = theta*alpha - fp2sub1271(P->ta, P->x, P->tb); 
// Tbfinal = beta - fp2add1271(P->ta, P->x, P->ta); // Tafinal = omega - fp2mul1271(P->tb, t2, P->x); // Xfinal = beta*theta - fp2mul1271(P->ta, t1, P->y); // Yfinal = alpha*omega - } - - void ecc_mul_fixed(unsigned long long* k, point_t Q) - { // Fixed-base scalar multiplication Q = k*G, where G is the generator. FIXED_BASE_TABLE stores v*2^(w-1) = 80 multiples of G. - unsigned int digits[250]; - unsigned long long scalar[4]; - - Montgomery_multiply_mod_order(k, Montgomery_Rprime, scalar); - Montgomery_multiply_mod_order(scalar, ONE, scalar); - - - // Converting scalar to odd using the prime subgroup order - // If (k is odd) then k_odd = k else k_odd = k + r - if (!(scalar[0] & 1)) - { - unsigned char carry = _addcarry_u64(0, scalar[0], CURVE_ORDER_0, &scalar[0]); - carry = _addcarry_u64(carry, scalar[1], CURVE_ORDER_1, &scalar[1]); - carry = _addcarry_u64(carry, scalar[2], CURVE_ORDER_2, &scalar[2]); - _addcarry_u64(carry, scalar[3], CURVE_ORDER_3, &scalar[3]); - } - - // Shift scalar to the right by 1 - scalar[0] = __shiftright128(scalar[0], scalar[1], 1); - scalar[1] = __shiftright128(scalar[1], scalar[2], 1); - scalar[2] = __shiftright128(scalar[2], scalar[3], 1); - scalar[3] >>= 1; - - for (unsigned int i = 0; i < 49; i++) - { - digits[i] = (unsigned int)((scalar[0] & 1) - 1); // Convention for the "sign" row: if scalar_(i+1) = 0 then digit_i = -1 (negative), else if scalar_(i+1) = 1 then digit_i = 0 (positive) - - // Shift scalar to the right by 1 - scalar[0] = __shiftright128(scalar[0], scalar[1], 1); - scalar[1] = __shiftright128(scalar[1], scalar[2], 1); - scalar[2] = __shiftright128(scalar[2], scalar[3], 1); - scalar[3] >>= 1; - } - digits[49] = 0; - for (unsigned int i = 50; i < 250; i++) - { - digits[i] = (unsigned int)(scalar[0] & 1); // digits_i = k mod 2. Sign is determined by the "sign" row - - // Shift scalar to the right by 1 - scalar[0] = __shiftright128(scalar[0], scalar[1], 1); - scalar[1] = __shiftright128(scalar[1], scalar[2], 1); - scalar[2] = __shiftright128(scalar[2], scalar[3], 1); - scalar[3] >>= 1; - - const unsigned long long temp = (0 - digits[i - (i / 50) * 50]) & digits[i]; // if (digits_i=0 \/ 1) then temp = 0, else if (digits_i=-1) then temp = 1 - - // floor(scalar/2) + temp - scalar[0] += temp; - unsigned long long carry = scalar[0] ? 0 : (temp & 1); // carry = (scalar[0] < temp); - scalar[1] += carry; - carry = scalar[1] ? 0 : (carry & 1); // carry = (scalar[j] < temp); - scalar[2] += carry; - scalar[3] += (scalar[2] ? 
0 : (carry & 1)); // carry = (scalar[j] < temp); - } - - point_extproj_t R; - point_precomp_t S; - - table_lookup_fixed_base(S, 64 + (((((digits[249] << 1) + digits[199]) << 1) + digits[149]) << 1) + digits[99], 0); - // Conversion from representation (x+y,y-x,2dt) to (X,Y,Z,Ta,Tb) - fp2sub1271(S->xy, S->yx, R->x); // 2*x1 - fp2add1271(S->xy, S->yx, R->y); // 2*y1 - fp2div1271(R->x); // XQ = x1 - fp2div1271(R->y); // YQ = y1 - R->z[0][0] = 1; R->z[0][1] = 0; R->z[1][0] = 0; R->z[1][1] = 0; // ZQ = 1 - *((__m256i*) & R->ta) = *((__m256i*) & R->x); // TaQ = x1 - *((__m256i*) & R->tb) = *((__m256i*) & R->y); // TbQ = y1 - - table_lookup_fixed_base(S, 48 + (((((digits[239] << 1) + digits[189]) << 1) + digits[139]) << 1) + digits[89], digits[39]); - eccmadd(S, R); - table_lookup_fixed_base(S, 32 + (((((digits[229] << 1) + digits[179]) << 1) + digits[129]) << 1) + digits[79], digits[29]); - eccmadd(S, R); - table_lookup_fixed_base(S, 16 + (((((digits[219] << 1) + digits[169]) << 1) + digits[119]) << 1) + digits[69], digits[19]); - eccmadd(S, R); - table_lookup_fixed_base(S, 00 + (((((digits[209] << 1) + digits[159]) << 1) + digits[109]) << 1) + digits[59], digits[9]); - eccmadd(S, R); - - eccdouble(R); - table_lookup_fixed_base(S, 64 + (((((digits[248] << 1) + digits[198]) << 1) + digits[148]) << 1) + digits[98], digits[48]); - eccmadd(S, R); - table_lookup_fixed_base(S, 48 + (((((digits[238] << 1) + digits[188]) << 1) + digits[138]) << 1) + digits[88], digits[38]); - eccmadd(S, R); - table_lookup_fixed_base(S, 32 + (((((digits[228] << 1) + digits[178]) << 1) + digits[128]) << 1) + digits[78], digits[28]); - eccmadd(S, R); - table_lookup_fixed_base(S, 16 + (((((digits[218] << 1) + digits[168]) << 1) + digits[118]) << 1) + digits[68], digits[18]); - eccmadd(S, R); - table_lookup_fixed_base(S, 00 + (((((digits[208] << 1) + digits[158]) << 1) + digits[108]) << 1) + digits[58], digits[8]); - eccmadd(S, R); - - eccdouble(R); - table_lookup_fixed_base(S, 64 + (((((digits[247] << 1) + digits[197]) << 1) + digits[147]) << 1) + digits[97], digits[47]); - eccmadd(S, R); - table_lookup_fixed_base(S, 48 + (((((digits[237] << 1) + digits[187]) << 1) + digits[137]) << 1) + digits[87], digits[37]); - eccmadd(S, R); - table_lookup_fixed_base(S, 32 + (((((digits[227] << 1) + digits[177]) << 1) + digits[127]) << 1) + digits[77], digits[27]); - eccmadd(S, R); - table_lookup_fixed_base(S, 16 + (((((digits[217] << 1) + digits[167]) << 1) + digits[117]) << 1) + digits[67], digits[17]); - eccmadd(S, R); - table_lookup_fixed_base(S, 00 + (((((digits[207] << 1) + digits[157]) << 1) + digits[107]) << 1) + digits[57], digits[7]); - eccmadd(S, R); - - eccdouble(R); - table_lookup_fixed_base(S, 64 + (((((digits[246] << 1) + digits[196]) << 1) + digits[146]) << 1) + digits[96], digits[46]); - eccmadd(S, R); - table_lookup_fixed_base(S, 48 + (((((digits[236] << 1) + digits[186]) << 1) + digits[136]) << 1) + digits[86], digits[36]); - eccmadd(S, R); - table_lookup_fixed_base(S, 32 + (((((digits[226] << 1) + digits[176]) << 1) + digits[126]) << 1) + digits[76], digits[26]); - eccmadd(S, R); - table_lookup_fixed_base(S, 16 + (((((digits[216] << 1) + digits[166]) << 1) + digits[116]) << 1) + digits[66], digits[16]); - eccmadd(S, R); - table_lookup_fixed_base(S, 00 + (((((digits[206] << 1) + digits[156]) << 1) + digits[106]) << 1) + digits[56], digits[6]); - eccmadd(S, R); - - eccdouble(R); - table_lookup_fixed_base(S, 64 + (((((digits[245] << 1) + digits[195]) << 1) + digits[145]) << 1) + digits[95], digits[45]); - eccmadd(S, R); 
- table_lookup_fixed_base(S, 48 + (((((digits[235] << 1) + digits[185]) << 1) + digits[135]) << 1) + digits[85], digits[35]); - eccmadd(S, R); - table_lookup_fixed_base(S, 32 + (((((digits[225] << 1) + digits[175]) << 1) + digits[125]) << 1) + digits[75], digits[25]); - eccmadd(S, R); - table_lookup_fixed_base(S, 16 + (((((digits[215] << 1) + digits[165]) << 1) + digits[115]) << 1) + digits[65], digits[15]); - eccmadd(S, R); - table_lookup_fixed_base(S, 00 + (((((digits[205] << 1) + digits[155]) << 1) + digits[105]) << 1) + digits[55], digits[5]); - eccmadd(S, R); - - eccdouble(R); - table_lookup_fixed_base(S, 64 + (((((digits[244] << 1) + digits[194]) << 1) + digits[144]) << 1) + digits[94], digits[44]); - eccmadd(S, R); - table_lookup_fixed_base(S, 48 + (((((digits[234] << 1) + digits[184]) << 1) + digits[134]) << 1) + digits[84], digits[34]); - eccmadd(S, R); - table_lookup_fixed_base(S, 32 + (((((digits[224] << 1) + digits[174]) << 1) + digits[124]) << 1) + digits[74], digits[24]); - eccmadd(S, R); - table_lookup_fixed_base(S, 16 + (((((digits[214] << 1) + digits[164]) << 1) + digits[114]) << 1) + digits[64], digits[14]); - eccmadd(S, R); - table_lookup_fixed_base(S, 00 + (((((digits[204] << 1) + digits[154]) << 1) + digits[104]) << 1) + digits[54], digits[4]); - eccmadd(S, R); - - eccdouble(R); - table_lookup_fixed_base(S, 64 + (((((digits[243] << 1) + digits[193]) << 1) + digits[143]) << 1) + digits[93], digits[43]); - eccmadd(S, R); - table_lookup_fixed_base(S, 48 + (((((digits[233] << 1) + digits[183]) << 1) + digits[133]) << 1) + digits[83], digits[33]); - eccmadd(S, R); - table_lookup_fixed_base(S, 32 + (((((digits[223] << 1) + digits[173]) << 1) + digits[123]) << 1) + digits[73], digits[23]); - eccmadd(S, R); - table_lookup_fixed_base(S, 16 + (((((digits[213] << 1) + digits[163]) << 1) + digits[113]) << 1) + digits[63], digits[13]); - eccmadd(S, R); - table_lookup_fixed_base(S, 00 + (((((digits[203] << 1) + digits[153]) << 1) + digits[103]) << 1) + digits[53], digits[3]); - eccmadd(S, R); - - eccdouble(R); - table_lookup_fixed_base(S, 64 + (((((digits[242] << 1) + digits[192]) << 1) + digits[142]) << 1) + digits[92], digits[42]); - eccmadd(S, R); - table_lookup_fixed_base(S, 48 + (((((digits[232] << 1) + digits[182]) << 1) + digits[132]) << 1) + digits[82], digits[32]); - eccmadd(S, R); - table_lookup_fixed_base(S, 32 + (((((digits[222] << 1) + digits[172]) << 1) + digits[122]) << 1) + digits[72], digits[22]); - eccmadd(S, R); - table_lookup_fixed_base(S, 16 + (((((digits[212] << 1) + digits[162]) << 1) + digits[112]) << 1) + digits[62], digits[12]); - eccmadd(S, R); - table_lookup_fixed_base(S, 00 + (((((digits[202] << 1) + digits[152]) << 1) + digits[102]) << 1) + digits[52], digits[2]); - eccmadd(S, R); - - eccdouble(R); - table_lookup_fixed_base(S, 64 + (((((digits[241] << 1) + digits[191]) << 1) + digits[141]) << 1) + digits[91], digits[41]); - eccmadd(S, R); - table_lookup_fixed_base(S, 48 + (((((digits[231] << 1) + digits[181]) << 1) + digits[131]) << 1) + digits[81], digits[31]); - eccmadd(S, R); - table_lookup_fixed_base(S, 32 + (((((digits[221] << 1) + digits[171]) << 1) + digits[121]) << 1) + digits[71], digits[21]); - eccmadd(S, R); - table_lookup_fixed_base(S, 16 + (((((digits[211] << 1) + digits[161]) << 1) + digits[111]) << 1) + digits[61], digits[11]); - eccmadd(S, R); - table_lookup_fixed_base(S, 00 + (((((digits[201] << 1) + digits[151]) << 1) + digits[101]) << 1) + digits[51], digits[1]); - eccmadd(S, R); - - eccdouble(R); - table_lookup_fixed_base(S, 64 + 
(((((digits[240] << 1) + digits[190]) << 1) + digits[140]) << 1) + digits[90], digits[40]); - eccmadd(S, R); - table_lookup_fixed_base(S, 48 + (((((digits[230] << 1) + digits[180]) << 1) + digits[130]) << 1) + digits[80], digits[30]); - eccmadd(S, R); - table_lookup_fixed_base(S, 32 + (((((digits[220] << 1) + digits[170]) << 1) + digits[120]) << 1) + digits[70], digits[20]); - eccmadd(S, R); - table_lookup_fixed_base(S, 16 + (((((digits[210] << 1) + digits[160]) << 1) + digits[110]) << 1) + digits[60], digits[10]); - eccmadd(S, R); - table_lookup_fixed_base(S, 00 + (((((digits[200] << 1) + digits[150]) << 1) + digits[100]) << 1) + digits[50], digits[0]); - eccmadd(S, R); - - eccnorm(R, Q); - } - - void ecc_tau(point_extproj_t P) - { // Apply tau mapping to a point, P = tau(P) - f2elm_t t0, t1; - - fp2sqr1271(P->x, t0); // t0 = X1^2 - fp2sqr1271(P->y, t1); // t1 = Y1^2 - fp2mul1271(P->x, P->y, P->x); // X = X1*Y1 - fp2sqr1271(P->z, P->y); // Y = Z1^2 - fp2add1271(t0, t1, P->z); // Z = X1^2+Y1^2 - fp2sub1271(t1, t0, t0); // t0 = Y1^2-X1^2 - fp2add1271(P->y, P->y, P->y); // Y = 2*Z1^2 - fp2mul1271(P->x, t0, P->x); // X = X1*Y1*(Y1^2-X1^2) - fp2sub1271(P->y, t0, P->y); // Y = 2*Z1^2-(Y1^2-X1^2) - fp2mul1271(P->x, (felm_t*)&ctau1, P->x); // Xfinal = X*ctau1 - fp2mul1271(P->y, P->z, P->y); // Yfinal = Y*Z - fp2mul1271(P->z, t0, P->z); // Zfinal = t0*Z - } - - void ecc_tau_dual(point_extproj_t P) - { // Apply tau_dual mapping to a point, P = tau_dual(P) - f2elm_t t0, t1; - - fp2sqr1271(P->x, t0); // t0 = X1^2 - fp2sqr1271(P->z, P->ta); // Ta = Z1^2 - fp2sqr1271(P->y, t1); // t1 = Y1^2 - fp2add1271(P->ta, P->ta, P->z); // Z = 2*Z1^2 - fp2sub1271(t1, t0, P->ta); // Tafinal = Y1^2-X1^2 - fp2add1271(t0, t1, t0); // t0 = X1^2+Y1^2 - fp2mul1271(P->x, P->y, P->x); // X = X1*Y1 - fp2sub1271(P->z, P->ta, P->z); // Z = 2*Z1^2-(Y1^2-X1^2) - fp2mul1271(P->x, (felm_t*)&ctaudual1, P->tb); // Tbfinal = ctaudual1*X1*X1 - fp2mul1271(P->z, P->ta, P->y); // Yfinal = Z*Tafinal - fp2mul1271(P->tb, t0, P->x); // Xfinal = Tbfinal*t0 - fp2mul1271(P->z, t0, P->z); // Zfinal = Z*t0 - } - - void ecc_delphidel(point_extproj_t P) - { // Apply delta_phi_delta mapping to a point, P = delta(phi_W(delta_inv(P))), - // where phi_W is the endomorphism on the Weierstrass form - f2elm_t t0, t1, t2, t3, t4, t5, t6; - - fp2sqr1271(P->z, t4); // t4 = Z1^2 - fp2mul1271(P->y, P->z, t3); // t3 = Y1*Z1 - fp2mul1271(t4, (felm_t*)&cphi4, t0); // t0 = cphi4*t4 - fp2sqr1271(P->y, t2); // t2 = Y1^2 - fp2add1271(t0, t2, t0); // t0 = t0+t2 - fp2mul1271(t3, (felm_t*)&cphi3, t1); // t1 = cphi3*t3 - fp2sub1271(t0, t1, t5); // t5 = t0-t1 - fp2add1271(t0, t1, t0); // t0 = t0+t1 - fp2mul1271(t0, P->z, t0); // t0 = t0*Z1 - fp2mul1271(t3, (felm_t*)&cphi1, t1); // t1 = cphi1*t3 - fp2mul1271(t0, t5, t0); // t0 = t0*t5 - fp2mul1271(t4, (felm_t*)&cphi2, t5); // t5 = cphi2*t4 - fp2add1271(t2, t5, t5); // t5 = t2+t5 - fp2sub1271(t1, t5, t6); // t6 = t1-t5 - fp2add1271(t1, t5, t1); // t1 = t1+t5 - fp2mul1271(t6, t1, t6); // t6 = t1*t6 - fp2mul1271(t6, (felm_t*)&cphi0, t6); // t6 = cphi0*t6 - fp2mul1271(P->x, t6, P->x); // X = X1*t6 - fp2sqr1271(t2, t6); // t6 = t2^2 - fp2sqr1271(t3, t2); // t2 = t3^2 - fp2sqr1271(t4, t3); // t3 = t4^2 - fp2mul1271(t2, (felm_t*)&cphi8, t1); // t1 = cphi8*t2 - fp2mul1271(t3, (felm_t*)&cphi9, t5); // t5 = cphi9*t3 - fp2add1271(t1, t6, t1); // t1 = t1+t6 - fp2mul1271(t2, (felm_t*)&cphi6, t2); // t2 = cphi6*t2 - fp2mul1271(t3, (felm_t*)&cphi7, t3); // t3 = cphi7*t3 - fp2add1271(t1, t5, t1); // t1 = t1+t5 - fp2add1271(t2, t3, t2); // 
t2 = t2+t3 - fp2mul1271(t1, P->y, t1); // t1 = Y1*t1 - fp2add1271(t6, t2, P->y); // Y = t6+t2 - fp2mul1271(P->x, t1, P->x); // X = X*t1 - fp2mul1271(P->y, (felm_t*)&cphi5, P->y); // Y = cphi5*Y - fpneg1271(P->x[1]); // Xfinal = X^p - fp2mul1271(P->y, P->z, P->y); // Y = Y*Z1 - fp2mul1271(t0, t1, P->z); // Z = t0*t1 - fp2mul1271(P->y, t0, P->y); // Y = Y*t0 - fpneg1271(P->z[1]); // Zfinal = Z^p - fpneg1271(P->y[1]); // Yfinal = Y^p - } - - void ecc_delpsidel(point_extproj_t P) - { // Apply delta_psi_delta mapping to a point, P = delta(psi_W(delta_inv(P))), - // where psi_W is the endomorphism on the Weierstrass form - f2elm_t t0, t1, t2; - - fpneg1271(P->x[1]); // X = X1^p - fpneg1271(P->z[1]); // Z = Z1^p - fpneg1271(P->y[1]); // Y = Y1^p - fp2sqr1271(P->z, t2); // t2 = Z1^p^2 - fp2sqr1271(P->x, t0); // t0 = X1^p^2 - fp2mul1271(P->x, t2, P->x); // X = X1^p*Z1^p^2 - fp2mul1271(t2, (felm_t*)&cpsi2, P->z); // Z = cpsi2*Z1^p^2 - fp2mul1271(t2, (felm_t*)&cpsi3, t1); // t1 = cpsi3*Z1^p^2 - fp2mul1271(t2, (felm_t*)&cpsi4, t2); // t2 = cpsi4*Z1^p^2 - fp2add1271(t0, P->z, P->z); // Z = X1^p^2 + cpsi2*Z1^p^2 - fp2add1271(t0, t2, t2); // t2 = X1^p^2 + cpsi4*Z1^p^2 - fp2add1271(t0, t1, t1); // t1 = X1^p^2 + cpsi3*Z1^p^2 - fp2neg1271(t2); // t2 = -(X1^p^2 + cpsi4*Z1^p^2) - fp2mul1271(P->z, P->y, P->z); // Z = Y1^p*(X1^p^2 + cpsi2*Z1^p^2) - fp2mul1271(P->x, t2, P->x); // X = -X1^p*Z1^p^2*(X1^p^2 + cpsi4*Z1^p^2) - fp2mul1271(t1, P->z, P->y); // Yfinal = t1*Z - fp2mul1271(P->x, (felm_t*)&cpsi1, P->x); // Xfinal = cpsi1*X - fp2mul1271(P->z, t2, P->z); // Zfinal = Z*t2 - } - - void ecc_psi(point_extproj_t P) - { // Apply psi mapping to a point, P = psi(P) - ecc_tau(P); - ecc_delpsidel(P); - ecc_tau_dual(P); - } - - void ecc_phi(point_extproj_t P) - { // Apply phi mapping to a point, P = phi(P) - ecc_tau(P); - ecc_delphidel(P); - ecc_tau_dual(P); - } - - void eccneg_extproj_precomp(point_extproj_precomp_t P, point_extproj_precomp_t Q) - { // Point negation - *((__m256i*) & Q->t2) = *((__m256i*) & P->t2); - *((__m256i*) & Q->yx) = *((__m256i*) & P->xy); - *((__m256i*) & Q->xy) = *((__m256i*) & P->yx); - *((__m256i*) & Q->z2) = *((__m256i*) & P->z2); - fp2neg1271(Q->t2); - } - - void eccneg_precomp(point_precomp_t P, point_precomp_t Q) - { // Point negation - *((__m256i*) & Q->t2) = *((__m256i*) & P->t2); - *((__m256i*) & Q->yx) = *((__m256i*) & P->xy); - *((__m256i*) & Q->xy) = *((__m256i*) & P->yx); - fp2neg1271(Q->t2); - } - - unsigned long long mul_truncate(unsigned long long* s, unsigned long long* C) - { - unsigned long long t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15, t16; - unsigned long long high00, low10, high10, low01, high01, low20, high20, low02, high02, low11, high11, low03, high03, low30, high30, low12, high12, high21; - - _umul128(s[0], C[0], &high00); - low10 = _umul128(s[1], C[0], &high10); - _addcarry_u64(_addcarry_u64(0, high00, low10, &t0), high10, 0, &t1); - low01 = _umul128(s[0], C[1], &high01); - t2 = _addcarry_u64(_addcarry_u64(0, t0, low01, &t0), t1, high01, &t3); - low20 = _umul128(s[2], C[0], &high20); - _addcarry_u64(_addcarry_u64(0, t3, low20, &t4), t2, high20, &t5); - low02 = _umul128(s[0], C[2], &high02); - t6 = _addcarry_u64(_addcarry_u64(0, t4, low02, &t7), t5, high02, &t8); - low11 = _umul128(s[1], C[1], &high11); - t9 = _addcarry_u64(_addcarry_u64(0, t7, low11, &t0), t8, high11, &t10); - low03 = _umul128(s[0], C[3], &high03); - _addcarry_u64(_addcarry_u64(0, t10, low03, &t11), t6 + t9, high03, &t12); - low30 = _umul128(s[3], C[0], &high30); - 
_addcarry_u64(_addcarry_u64(0, t11, low30, &t13), t12, high30, &t14); - low12 = _umul128(s[1], C[2], &high12); - _addcarry_u64(_addcarry_u64(0, t13, low12, &t15), t14, high12, &t16); - - return _addcarry_u64(0, t15, _umul128(s[2], C[1], &high21), &t0) + t16 + high21 + s[1] * C[3] + s[2] * C[2] + s[3] * C[1]; - } - - void decompose(unsigned long long* k, unsigned long long* scalars) - { // Scalar decomposition for the variable-base scalar multiplication - const unsigned long long a1 = mul_truncate(k, ell1); - const unsigned long long a2 = mul_truncate(k, ell2); - const unsigned long long a3 = mul_truncate(k, ell3); - const unsigned long long a4 = mul_truncate(k, ell4); - - scalars[0] = a1 * B11 + a2 * B21 + a3 * B31 + a4 * B41 + C1 + k[0]; - scalars[1] = a1 * B12 + a2 * B22 + a3 * B32 + a4 * B42 + C2; - scalars[2] = a1 * B13 + a2 * B23 + a3 * B33 + a4 * B43 + C3; - scalars[3] = a1 * B14 + a2 * B24 + a3 * B34 + a4 * B44 + C4; - if (!(scalars[0] & 1)) - { - scalars[0] -= B41; - scalars[1] -= B42; - scalars[2] -= B43; - scalars[3] -= B44; - } - } - - void wNAF_recode(unsigned long long scalar, unsigned int w, char* digits) - { // Computes wNAF recoding of a scalar, where digits are in set {0,+-1,+-3,...,+-(2^(w-1)-1)} - const int val1 = (int)(1 << (w - 1)) - 1; // 2^(w-1) - 1 - const int val2 = (int)(1 << w); // 2^w; - const unsigned long long mask = (unsigned long long)val2 - 1; // 2^w - 1 - int index = 0; - - while (scalar) - { - int digit = (int)(scalar & 1); - if (!digit) - { - scalar >>= 1; // Shift scalar to the right by 1 - digits[index] = 0; - } - else - { - digit = (int)(scalar & mask); - scalar >>= w; // Shift scalar to the right by w - - if (digit > val1) - { - digit -= val2; - } - if (digit < 0) - { - scalar++; // scalar + 1 - } - digits[index] = digit; - - if (scalar) // Check if scalar != 0 - { - for (unsigned int i = 0; i < (w - 1); i++) - { - digits[++index] = 0; - } - } - } - index++; - } - - ZeroMemory(&digits[index], 65 - index); - } - - void ecc_precomp_double(point_extproj_t P, point_extproj_precomp_t* Table) - { // Generation of the precomputation table used internally by the double scalar multiplication function ecc_mul_double() - point_extproj_t Q; - point_extproj_precomp_t PP; - - R1_to_R2(P, Table[0]); // Precomputed point Table[0] = P in coordinates (X+Y,Y-X,2Z,2dT) - eccdouble(P); // A = 2*P in (X,Y,Z,Ta,Tb) - R1_to_R3(P, PP); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,Z,T) - - eccadd_core(Table[0], PP, Q); // Table[i] = Table[i-1]+2P using the representations (X,Y,Z,Ta,Tb) <- (X+Y,Y-X,2Z,2dT) + (X+Y,Y-X,Z,T) - R1_to_R2(Q, Table[1]); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT) - - eccadd_core(Table[1], PP, Q); // Table[i] = Table[i-1]+2P using the representations (X,Y,Z,Ta,Tb) <- (X+Y,Y-X,2Z,2dT) + (X+Y,Y-X,Z,T) - R1_to_R2(Q, Table[2]); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT) - - eccadd_core(Table[2], PP, Q); // Table[i] = Table[i-1]+2P using the representations (X,Y,Z,Ta,Tb) <- (X+Y,Y-X,2Z,2dT) + (X+Y,Y-X,Z,T) - R1_to_R2(Q, Table[3]); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT) - } - - bool ecc_mul_double(unsigned long long* k, unsigned long long* l, point_t Q) - { // Double scalar multiplication R = k*G + l*Q, where the G is the generator - // Uses DOUBLE_SCALAR_TABLE, which contains multiples of G, Phi(G), Psi(G) and Phi(Psi(G)) - // The function uses wNAF with interleaving. 
- char digits_k1[65], digits_k2[65], digits_k3[65], digits_k4[65]; - char digits_l1[65], digits_l2[65], digits_l3[65], digits_l4[65]; - point_precomp_t V; - point_extproj_t Q1, Q2, Q3, Q4, T; - point_extproj_precomp_t U, Q_table1[4], Q_table2[4], Q_table3[4], Q_table4[4]; - unsigned long long k_scalars[4], l_scalars[4]; - - point_setup(Q, Q1); // Convert to representation (X,Y,1,Ta,Tb) - - if (!ecc_point_validate(Q1)) // Check if point lies on the curve - { - return false; - } - - // Computing endomorphisms over point Q - *((__m256i*) & Q2->x) = *((__m256i*) & Q1->x); - *((__m256i*) & Q2->y) = *((__m256i*) & Q1->y); - *((__m256i*) & Q2->z) = *((__m256i*) & Q1->z); - *((__m256i*) & Q2->ta) = *((__m256i*) & Q1->ta); - *((__m256i*) & Q2->tb) = *((__m256i*) & Q1->tb); - ecc_phi(Q2); - *((__m256i*) & Q3->x) = *((__m256i*) & Q1->x); - *((__m256i*) & Q3->y) = *((__m256i*) & Q1->y); - *((__m256i*) & Q3->z) = *((__m256i*) & Q1->z); - *((__m256i*) & Q3->ta) = *((__m256i*) & Q1->ta); - *((__m256i*) & Q3->tb) = *((__m256i*) & Q1->tb); - ecc_psi(Q3); - *((__m256i*) & Q4->x) = *((__m256i*) & Q2->x); - *((__m256i*) & Q4->y) = *((__m256i*) & Q2->y); - *((__m256i*) & Q4->z) = *((__m256i*) & Q2->z); - *((__m256i*) & Q4->ta) = *((__m256i*) & Q2->ta); - *((__m256i*) & Q4->tb) = *((__m256i*) & Q2->tb); - ecc_psi(Q4); - - decompose((unsigned long long*)k, k_scalars); // Scalar decomposition - decompose((unsigned long long*)l, l_scalars); - wNAF_recode(k_scalars[0], 8, digits_k1); // Scalar recoding - wNAF_recode(k_scalars[1], 8, digits_k2); - wNAF_recode(k_scalars[2], 8, digits_k3); - wNAF_recode(k_scalars[3], 8, digits_k4); - wNAF_recode(l_scalars[0], 4, digits_l1); - wNAF_recode(l_scalars[1], 4, digits_l2); - wNAF_recode(l_scalars[2], 4, digits_l3); - wNAF_recode(l_scalars[3], 4, digits_l4); - ecc_precomp_double(Q1, Q_table1); - ecc_precomp_double(Q2, Q_table2); - ecc_precomp_double(Q3, Q_table3); - ecc_precomp_double(Q4, Q_table4); - - T->x[0][0] = 0; T->x[0][1] = 0; T->x[1][0] = 0; T->x[1][1] = 0; // Initialize T as the neutral point (0:1:1) - T->y[0][0] = 1; T->y[0][1] = 0; T->y[1][0] = 0; T->y[1][1] = 0; - T->z[0][0] = 1; T->z[0][1] = 0; T->z[1][0] = 0; T->z[1][1] = 0; - - for (unsigned int i = 65; i--; ) - { - eccdouble(T); - - if (digits_l1[i] < 0) - { - eccneg_extproj_precomp(Q_table1[(-digits_l1[i]) >> 1], U); - eccadd(U, T); - } - else if (digits_l1[i] > 0) - { - eccadd(Q_table1[(digits_l1[i]) >> 1], T); - } - - if (digits_l2[i] < 0) - { - eccneg_extproj_precomp(Q_table2[(-digits_l2[i]) >> 1], U); - eccadd(U, T); - } - else if (digits_l2[i] > 0) - { - eccadd(Q_table2[(digits_l2[i]) >> 1], T); - } - - if (digits_l3[i] < 0) - { - eccneg_extproj_precomp(Q_table3[(-digits_l3[i]) >> 1], U); - eccadd(U, T); - } - else if (digits_l3[i] > 0) - { - eccadd(Q_table3[(digits_l3[i]) >> 1], T); - } - - if (digits_l4[i] < 0) - { - eccneg_extproj_precomp(Q_table4[(-digits_l4[i]) >> 1], U); - eccadd(U, T); - } - else if (digits_l4[i] > 0) - { - eccadd(Q_table4[(digits_l4[i]) >> 1], T); - } - - if (digits_k1[i] < 0) - { - eccneg_precomp(((point_precomp_t*)&DOUBLE_SCALAR_TABLE)[(-digits_k1[i]) >> 1], V); - eccmadd(V, T); - } - else if (digits_k1[i] > 0) - { - eccmadd(((point_precomp_t*)&DOUBLE_SCALAR_TABLE)[(digits_k1[i]) >> 1], T); - } - - if (digits_k2[i] < 0) - { - eccneg_precomp(((point_precomp_t*)&DOUBLE_SCALAR_TABLE)[64 + ((-digits_k2[i]) >> 1)], V); - eccmadd(V, T); - } - else if (digits_k2[i] > 0) - { - eccmadd(((point_precomp_t*)&DOUBLE_SCALAR_TABLE)[64 + ((digits_k2[i]) >> 1)], T); - } - - if (digits_k3[i] < 
0) - { - eccneg_precomp(((point_precomp_t*)&DOUBLE_SCALAR_TABLE)[2 * 64 + ((-digits_k3[i]) >> 1)], V); - eccmadd(V, T); - } - else if (digits_k3[i] > 0) - { - eccmadd(((point_precomp_t*)&DOUBLE_SCALAR_TABLE)[2 * 64 + ((digits_k3[i]) >> 1)], T); - } - - if (digits_k4[i] < 0) - { - eccneg_precomp(((point_precomp_t*)&DOUBLE_SCALAR_TABLE)[3 * 64 + ((-digits_k4[i]) >> 1)], V); - eccmadd(V, T); - } - else if (digits_k4[i] > 0) - { - eccmadd(((point_precomp_t*)&DOUBLE_SCALAR_TABLE)[3 * 64 + ((digits_k4[i]) >> 1)], T); - } - } - - eccnorm(T, Q); - - return true; - } - - void ecc_precomp(point_extproj_t P, point_extproj_precomp_t* T) - { // Generation of the precomputation table used by the variable-base scalar multiplication ecc_mul() - point_extproj_precomp_t Q, R, S; - point_extproj_t PP; - - // Generating Q = phi(P) = (XQ+YQ,YQ-XQ,ZQ,TQ) - *((__m256i*) & PP->x) = *((__m256i*) & P->x); - *((__m256i*) & PP->y) = *((__m256i*) & P->y); - *((__m256i*) & PP->z) = *((__m256i*) & P->z); - *((__m256i*) & PP->ta) = *((__m256i*) & P->ta); - *((__m256i*) & PP->tb) = *((__m256i*) & P->tb); - ecc_phi(PP); - R1_to_R3(PP, Q); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,Z,T) - - // Generating S = psi(Q) = (XS+YS,YS-XS,ZS,TS) - ecc_psi(PP); - R1_to_R3(PP, S); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,Z,T) - - // Generating T[0] = P = (XP+YP,YP-XP,2ZP,2dTP) - R1_to_R2(P, T[0]); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT) - - // Generating R = psi(P) = (XR+YR,YR-XR,ZR,TR) - ecc_psi(P); - R1_to_R3(P, R); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,Z,T) - - eccadd_core(T[0], Q, PP); // T[1] = P+Q using the representations (X,Y,Z,Ta,Tb) <- (X+Y,Y-X,2Z,2dT) + (X+Y,Y-X,Z,T) - R1_to_R2(PP, T[1]); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT) - eccadd_core(T[0], R, PP); // T[2] = P+R - R1_to_R2(PP, T[2]); - eccadd_core(T[1], R, PP); // T[3] = P+Q+R - R1_to_R2(PP, T[3]); - eccadd_core(T[0], S, PP); // T[4] = P+S - R1_to_R2(PP, T[4]); - eccadd_core(T[1], S, PP); // T[5] = P+Q+S - R1_to_R2(PP, T[5]); - eccadd_core(T[2], S, PP); // T[6] = P+R+S - R1_to_R2(PP, T[6]); - eccadd_core(T[3], S, PP); // T[7] = P+Q+R+S - R1_to_R2(PP, T[7]); - } - - void cofactor_clearing(point_extproj_t R) - { // Co-factor clearing - point_extproj_precomp_t Q; - - R1_to_R2(R, Q); // Converting from (X,Y,Z,Ta,Tb) to (X+Y,Y-X,2Z,2dT) - eccdouble(R); // P = 2*P using representations (X,Y,Z,Ta,Tb) <- 2*(X,Y,Z) - eccadd(Q, R); // P = P+Q using representations (X,Y,Z,Ta,Tb) <- (X,Y,Z,Ta,Tb) + (X+Y,Y-X,2Z,2dT) - eccdouble(R); - eccdouble(R); - eccdouble(R); - eccdouble(R); - eccadd(Q, R); - eccdouble(R); - eccdouble(R); - eccdouble(R); - } - - bool ecc_mul(point_t P, unsigned long long* k, point_t Q) - { // Variable-base scalar multiplication Q = k*P using a 4-dimensional decomposition - // This function performs point validation and (if selected) cofactor clearing - point_extproj_t R; - point_extproj_precomp_t Table[2][8]; - unsigned long long scalars[4]; - unsigned int digits[64], sign_masks[64]; - - point_setup(P, R); // Convert to representation (X,Y,1,Ta,Tb) - - if (!ecc_point_validate(R)) // Check if point lies on the curve - { - return false; - } - - decompose((unsigned long long*)k, scalars); // Scalar decomposition - - cofactor_clearing(R); - - // Recoding sub-scalars for use in the variable-base scalar multiplication - for (unsigned int i = 0; i < 64; i++) - { - scalars[0] >>= 1; - const unsigned int bit0 = scalars[0] & 1; - sign_masks[i] = bit0; - - digits[i] = scalars[1] & 1; - scalars[1] = (scalars[1] >> 1) + ((bit0 | 
digits[i]) ^ bit0); - - unsigned int bit = scalars[2] & 1; - scalars[2] = (scalars[2] >> 1) + ((bit0 | bit) ^ bit0); - digits[i] += (bit << 1); - - bit = scalars[3] & 1; - scalars[3] = (scalars[3] >> 1) + ((bit0 | bit) ^ bit0); - digits[i] += (bit << 2); - } - - ecc_precomp(R, Table[1]); // Precomputation - for (unsigned int i = 0; i < 8; i++) - { - *((__m256i*)Table[0][i]->xy) = *((__m256i*)Table[1][i]->yx); - *((__m256i*)Table[0][i]->yx) = *((__m256i*)Table[1][i]->xy); - *((__m256i*)Table[0][i]->t2) = *((__m256i*)Table[1][i]->t2); - *((__m256i*)Table[0][i]->z2) = *((__m256i*)Table[1][i]->z2); - fp2neg1271(Table[0][i]->t2); - } - R2_to_R4(Table[1][scalars[1] + (scalars[2] << 1) + (scalars[3] << 2)], R); - - for (unsigned int i = 64; i--; ) - { - eccdouble(R); // P = 2*P using representations (X,Y,Z,Ta,Tb) <- 2*(X,Y,Z) - eccadd(Table[sign_masks[i]][digits[i]], R); // P = P+S using representations (X,Y,Z,Ta,Tb) <- (X,Y,Z,Ta,Tb) + (X+Y,Y-X,2Z,2dT) - } - eccnorm(R, Q); // Conversion to affine coordinates (x,y) and modular correction. - - return true; - } - - void encode(point_t P, unsigned char* Pencoded) - { // Encode point P - const unsigned long long temp1 = (P->x[1][1] & 0x4000000000000000) << 1; - const unsigned long long temp2 = (P->x[0][1] & 0x4000000000000000) << 1; - - *((__m256i*)Pencoded) = *((__m256i*)P->y); - if (!P->x[0][0] && !P->x[0][1]) - { - ((unsigned long long*)Pencoded)[3] |= temp1; - } - else - { - ((unsigned long long*)Pencoded)[3] |= temp2; - } - } - - bool decode(const unsigned char* Pencoded, point_t P) - { // Decode point P - felm_t r, t, t0, t1, t2, t3, t4; - f2elm_t u, v; - point_extproj_t R; - unsigned int i; - - *((__m256i*)P->y) = *((__m256i*)Pencoded); // Decoding y-coordinate and sign - P->y[1][1] &= 0x7FFFFFFFFFFFFFFF; - - fp2sqr1271(P->y, u); - fp2mul1271(u, (felm_t*)&PARAMETER_d, v); - fp2sub1271(u, (felm_t*)&ONE, u); - fp2add1271(v, (felm_t*)&ONE, v); - - fpsqr1271(v[0], t0); // t0 = v0^2 - fpsqr1271(v[1], t1); // t1 = v1^2 - fpadd1271(t0, t1, t0); // t0 = t0+t1 - fpmul1271(u[0], v[0], t1); // t1 = u0*v0 - fpmul1271(u[1], v[1], t2); // t2 = u1*v1 - fpadd1271(t1, t2, t1); // t1 = t1+t2 - fpmul1271(u[1], v[0], t2); // t2 = u1*v0 - fpmul1271(u[0], v[1], t3); // t3 = u0*v1 - fpsub1271(t2, t3, t2); // t2 = t2-t3 - fpsqr1271(t1, t3); // t3 = t1^2 - fpsqr1271(t2, t4); // t4 = t2^2 - fpadd1271(t3, t4, t3); // t3 = t3+t4 - for (i = 0; i < 125; i++) fpsqr1271(t3, t3); // t3 = t3^(2^125) - - fpadd1271(t1, t3, t); // t = t1+t3 - mod1271(t); - if (!t[0] && !t[1]) - { - fpsub1271(t1, t3, t); // t = t1-t3 - } - fpadd1271(t, t, t); // t = 2*t - fpsqr1271(t0, t3); // t3 = t0^2 - fpmul1271(t0, t3, t3); // t3 = t3*t0 - fpmul1271(t, t3, t3); // t3 = t3*t - fpexp1251(t3, r); // r = t3^(2^125-1) - fpmul1271(t0, r, t3); // t3 = t0*r - fpmul1271(t, t3, P->x[0]); // x0 = t*t3 - fpsqr1271(P->x[0], t1); - fpmul1271(t0, t1, t1); // t1 = t0*x0^2 - - // x0 = x0/2 - unsigned long long mask, temp[2]; - mask = (0 - (1 & P->x[0][0])); - _addcarry_u64(_addcarry_u64(0, P->x[0][0], mask, &temp[0]), P->x[0][1], (mask >> 1), &temp[1]); - P->x[0][0] = __shiftright128(temp[0], temp[1], 1); - P->x[0][1] = (temp[1] >> 1); - - fpmul1271(t2, t3, P->x[1]); // x1 = t3*t2 - - fpsub1271(t, t1, t); - mod1271(t); - if (t[0] || t[1]) // If t != t1 then swap x0 and x1 - { - t0[0] = P->x[0][0]; - t0[1] = P->x[0][1]; - P->x[0][0] = P->x[1][0]; - P->x[0][1] = P->x[1][1]; - P->x[1][0] = t0[0]; - P->x[1][1] = t0[1]; - } - - mod1271(P->x[0]); - if (((unsigned int)(Pencoded[31] >> 7)) - != (unsigned 
int)(P->x[(!P->x[0][0] && !P->x[0][1]) ? 1 : 0][1] >> 62)) // If sign of x-coordinate decoded != input sign bit, then negate x-coordinate - { - fp2neg1271(P->x); - } - - point_setup(P, R); - if (!ecc_point_validate(R)) - { - fpneg1271(R->x[1]); - P->x[1][0] = R->x[1][0]; - P->x[1][1] = R->x[1][1]; - if (!ecc_point_validate(R)) // Final point validation - { - return false; - } - } - - return true; - } - - bool getSubseed(const unsigned char* seed, unsigned char* subseed) - { - unsigned char seedBytes[55]; - for (int i = 0; i < 55; i++) - { - if (seed[i] < 'a' || seed[i] > 'z') - { - return false; - } - seedBytes[i] = seed[i] - 'a'; - } - KangarooTwelve(seedBytes, sizeof(seedBytes), subseed, 32); - - return true; - } - - void getPrivateKey(unsigned char* subseed, unsigned char* privateKey) - { - KangarooTwelve(subseed, 32, privateKey, 32); - } - - void getPublicKey(const unsigned char* privateKey, unsigned char* publicKey) - { // SchnorrQ public key generation - // It produces a public key publicKey, which is the encoding of P = s*G, where G is the generator and - // s is the output of hashing publicKey and taking the least significant 32 bytes of the result - // Input: 32-byte privateKey - // Output: 32-byte publicKey - point_t P; - - ecc_mul_fixed((unsigned long long*)privateKey, P); // Compute public key - encode(P, publicKey); // Encode public key - } - - bool getPublicKeyFromIdentity(const unsigned char* identity, unsigned char* publicKey) - { - unsigned char publicKeyBuffer[32]; - for (int i = 0; i < 4; i++) - { - *((unsigned long long*) & publicKeyBuffer[i << 3]) = 0; - for (int j = 14; j-- > 0; ) - { - if (identity[i * 14 + j] < 'A' || identity[i * 14 + j] > 'Z') - { - return false; - } - - *((unsigned long long*) & publicKeyBuffer[i << 3]) = *((unsigned long long*) & publicKeyBuffer[i << 3]) * 26 + (identity[i * 14 + j] - 'A'); - } - } - unsigned int identityBytesChecksum; - KangarooTwelve(publicKeyBuffer, 32, (unsigned char*)&identityBytesChecksum, 3); - identityBytesChecksum &= 0x3FFFF; - for (int i = 0; i < 4; i++) - { - if (identityBytesChecksum % 26 + 'A' != identity[56 + i]) - { - return false; - } - identityBytesChecksum /= 26; - } - *((__m256i*)publicKey) = *((__m256i*)publicKeyBuffer); - - return true; - } - - bool getSharedKey(const unsigned char* privateKey, const unsigned char* publicKey, unsigned char* sharedKey) - { // Secret agreement computation for key exchange using a compressed, 32-byte public key - // The output is the y-coordinate of privateKey*A, where A is the decoding of the public key publicKey - // Inputs: 32-byte privateKey and 32-byte publicKey - // Output: 32-byte sharedKey - point_t A; - - if (publicKey[15] & 0x80) // Is bit128(PublicKey) = 0? - { - return false; - } - - if (!decode(publicKey, A)) // Also verifies that A is on the curve, if it is not it fails - { - return false; - } - - if (!ecc_mul(A, (unsigned long long*)privateKey, A)) - { - return false; - } - - if (!A->x[0][0] && !A->x[0][1] && !A->x[1][0] && !A->x[1][1] - && A->y[0][0] == 1 && !A->y[0][1] && !A->y[1][0] && !A->y[1][1]) // Is output = neutral point (0,1)? - { - return false; - } - - *((__m256i*)sharedKey) = *((__m256i*)A->y); - - return true; - } - - void getIdentity(unsigned char* publicKey, char* identity, bool isLowerCase) - { - for (int i = 0; i < 4; i++) - { - unsigned long long publicKeyFragment = *((unsigned long long*) & publicKey[i << 3]); - for (int j = 0; j < 14; j++) - { - identity[i * 14 + j] = publicKeyFragment % 26 + (isLowerCase ? 
'a' : 'A'); - publicKeyFragment /= 26; - } - } - unsigned int identityBytesChecksum; - KangarooTwelve(publicKey, 32, (unsigned char*)&identityBytesChecksum, 3); - identityBytesChecksum &= 0x3FFFF; - for (int i = 0; i < 4; i++) - { - identity[56 + i] = identityBytesChecksum % 26 + (isLowerCase ? 'a' : 'A'); - identityBytesChecksum /= 26; - } - identity[60] = 0; - } - - void sign(const unsigned char* subseed, const unsigned char* publicKey, const unsigned char* messageDigest, unsigned char* signature) - { // SchnorrQ signature generation - // It produces the signature signature of a message messageDigest of size 32 in bytes - // Inputs: 32-byte subseed, 32-byte publicKey, and messageDigest of size 32 in bytes - // Output: 64-byte signature - point_t R; - unsigned char k[64], h[64], temp[32 + 64]; - unsigned long long r[8]; - - KangarooTwelve((unsigned char*)subseed, 32, k, 64); - - *((__m256i*)(temp + 32)) = *((__m256i*)(k + 32)); - *((__m256i*)(temp + 64)) = *((__m256i*)messageDigest); - - KangarooTwelve(temp + 32, 32 + 32, (unsigned char*)r, 64); - - ecc_mul_fixed(r, R); - encode(R, signature); // Encode lowest 32 bytes of signature - *((__m256i*)temp) = *((__m256i*)signature); - *((__m256i*)(temp + 32)) = *((__m256i*)publicKey); - - KangarooTwelve(temp, 32 + 64, h, 64); - Montgomery_multiply_mod_order(r, Montgomery_Rprime, r); - Montgomery_multiply_mod_order(r, ONE, r); - Montgomery_multiply_mod_order((unsigned long long*)h, Montgomery_Rprime, (unsigned long long*)h); - Montgomery_multiply_mod_order((unsigned long long*)h, ONE, (unsigned long long*)h); - Montgomery_multiply_mod_order((unsigned long long*)k, Montgomery_Rprime, (unsigned long long*)(signature + 32)); - Montgomery_multiply_mod_order((unsigned long long*)h, Montgomery_Rprime, (unsigned long long*)h); - Montgomery_multiply_mod_order((unsigned long long*)(signature + 32), (unsigned long long*)h, (unsigned long long*)(signature + 32)); - Montgomery_multiply_mod_order((unsigned long long*)(signature + 32), ONE, (unsigned long long*)(signature + 32)); - if (_subborrow_u64(_subborrow_u64(_subborrow_u64(_subborrow_u64(0, r[0], ((unsigned long long*)signature)[4], &((unsigned long long*)signature)[4]), r[1], ((unsigned long long*)signature)[5], &((unsigned long long*)signature)[5]), r[2], ((unsigned long long*)signature)[6], &((unsigned long long*)signature)[6]), r[3], ((unsigned long long*)signature)[7], &((unsigned long long*)signature)[7])) - { - _addcarry_u64(_addcarry_u64(_addcarry_u64(_addcarry_u64(0, ((unsigned long long*)signature)[4], CURVE_ORDER_0, &((unsigned long long*)signature)[4]), ((unsigned long long*)signature)[5], CURVE_ORDER_1, &((unsigned long long*)signature)[5]), ((unsigned long long*)signature)[6], CURVE_ORDER_2, &((unsigned long long*)signature)[6]), ((unsigned long long*)signature)[7], CURVE_ORDER_3, &((unsigned long long*)signature)[7]); - } - } - - bool verify(const unsigned char* publicKey, const unsigned char* messageDigest, const unsigned char* signature) - { // SchnorrQ signature verification - // It verifies the signature Signature of a message MessageDigest of size 32 in bytes - // Inputs: 32-byte PublicKey, 64-byte Signature, and MessageDigest of size 32 in bytes - // Output: TRUE (valid signature) or FALSE (invalid signature) - point_t A; - unsigned char temp[32 + 64], h[64]; - - if ((publicKey[15] & 0x80) || (signature[15] & 0x80) || (signature[62] & 0xC0) || signature[63]) - { // Are bit128(PublicKey) = bit128(Signature) = 0 and Signature+32 < 2^246? 
- return false; - } - - if (!decode(publicKey, A)) // Also verifies that A is on the curve, if it is not it fails - { - return false; - } - - *((__m256i*)temp) = *((__m256i*)signature); - *((__m256i*)(temp + 32)) = *((__m256i*)publicKey); - *((__m256i*)(temp + 64)) = *((__m256i*)messageDigest); - - KangarooTwelve(temp, 32 + 64, h, 64); - - if (!ecc_mul_double((unsigned long long*)(signature + 32), (unsigned long long*)h, A)) - { - return false; - } - - encode(A, (unsigned char*)A); - - return EQUAL(*((__m256i*)A), *((__m256i*)signature)); - } -} \ No newline at end of file diff --git a/ffi-deps/simde/CONTRIBUTING.md b/ffi-deps/simde/CONTRIBUTING.md deleted file mode 100644 index 7bf1292..0000000 --- a/ffi-deps/simde/CONTRIBUTING.md +++ /dev/null @@ -1,85 +0,0 @@ -# Contributing to SIMDe - -First off, if you're even reading this, thank you! There is a lot of -work to do, and any help is appreciated. - -If you haven't already, please read the -[README](https://github.com/simd-everywhere/simde/blob/master/README.md). The -[wiki](https://github.com/simd-everywhere/simde/wiki) also has some good -information, especially the -[FAQ](https://github.com/simd-everywhere/simde/wiki/FAQ) and a guide on how to -[implement a new -function](https://github.com/simd-everywhere/simde/wiki/Implementing-a-New-Function). - -For information on developing for architectures you don't have access -to, please see the [Development -Environment](https://github.com/simd-everywhere/simde/wiki/Development-Environment) -page on the wiki. - -If you still have questions, or if anything below doesn't make sense -to you, please feel free to use the [issue -tracker](https://github.com/simd-everywhere/simde/issues) or the [mailing -list](https://groups.google.com/forum/#!forum/simde) to ask. I know -the SIMDe documentation needs a lot of improvement, and asking -questions will help us understand what is missing, so please don't be -shy! - -## Building the Tests - -SIMDe contains an extensive test suite used for development. Most -users will never need to build the suite, but if you're contributing -code to SIMDe you'll need to build them. - -Here is the basic procedure for compiling and running the tests: - -### On Unix -```bash -mkdir -p build -cd build -CFLAGS="-march=native" CXXFLAGS="-march=native" meson setup .. -ninja test -``` - -Note that `-march=native` may not be the right flag for your compiler. -That should work for most compilers on x86/x86_64, though MSVC is an -exception (try `/arch:AVX2` instead of `-march=native`). On other -architectures please consult your compiler documentation to find out -what flags you should use to enable the SIMD extension for your target -platform. Here are a few to try: - - * ARM: - * `-march=armv8-a+simd` (for AArch64) - * `-march=armv8-a+simd -mfpu=auto` (for ARMv8) - * `-march=armv7-a -mfpu=neon` (for ARMv7) - * POWER - * `-mcpu=native` - -If you need a flag not listed above, please let us know so we can add -it to the list. - -### On Windows: -```bash -"C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Auxiliary\Build\vcvars64.bat" arm64 -mkdir build -cd build -set CFLAGS="/std:c11" -set CXXFLAGS="/Zc:preprocessor" -meson setup .. -ninja test -``` -Note change arm64 to x64 on x86_64 CPU. - -You may also want to take a look at the -[Docker container](https://github.com/simd-everywhere/simde/tree/master/docker) -which has many builds pre-configured, including cross-compilers and emulators. 
- -## Coding Style - -SIMDe has an [EditorConfig](https://editorconfig.org/) file to -configure your editor for things like tabs vs. spaces, how many spaces, -etc. If you use an editor which doesn't support it out of the box then -odds are good there is a plugin you can download; please do so. - -For other coding style information, please see the -[Coding Style](https://github.com/simd-everywhere/simde/wiki/Coding-Style) -document in the Wiki. diff --git a/ffi-deps/simde/COPYING b/ffi-deps/simde/COPYING deleted file mode 100644 index c7f6b6d..0000000 --- a/ffi-deps/simde/COPYING +++ /dev/null @@ -1,20 +0,0 @@ -Copyright (c) 2017 Evan Nemerson - -Permission is hereby granted, free of charge, to any person obtaining -a copy of this software and associated documentation files (the -"Software"), to deal in the Software without restriction, including -without limitation the rights to use, copy, modify, merge, publish, -distribute, sublicense, and/or sell copies of the Software, and to -permit persons to whom the Software is furnished to do so, subject to -the following conditions: - -The above copyright notice and this permission notice shall be -included in all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND -NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE -LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION -OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION -WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/ffi-deps/simde/README.md b/ffi-deps/simde/README.md deleted file mode 100644 index 7195d64..0000000 --- a/ffi-deps/simde/README.md +++ /dev/null @@ -1,496 +0,0 @@ -# SIMD Everywhere - -[![All Contributors](https://img.shields.io/badge/all_contributors-70-orange.svg?style=flat-square)](#contributors-) - -[![Chat](https://badges.gitter.im/gitterHQ/gitter.png)](https://matrix.to/#/#simd-everywhere_community:gitter.im) -[![codecov](https://codecov.io/gh/simd-everywhere/simde/branch/master/graph/badge.svg?token=jcMBoRk0ui)](https://codecov.io/gh/simd-everywhere/simde) - -The SIMDe header-only library provides fast, portable implementations of -[SIMD intrinsics](https://en.wikipedia.org/wiki/SIMD) on hardware which -doesn't natively support them, such as calling [SSE](https://en.wikipedia.org/wiki/Streaming_SIMD_Extensions) -functions on ARM. There is no performance penalty if the hardware -supports the native implementation (*e.g.*, SSE/[AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) -runs at full speed on [x86](https://en.wikipedia.org/wiki/X86), -[NEON](https://en.wikipedia.org/wiki/ARM_architecture#Advanced_SIMD_(Neon)) on [ARM](https://en.wikipedia.org/wiki/ARM_architecture), -*etc.*). - -This makes porting code to other architectures much easier in a few -key ways: - -First, instead of forcing you to rewrite everything for each -architecture, SIMDe lets you get a port up and running almost -effortlessly. You can then start working on switching the most -performance-critical sections to native intrinsics, improving -performance gradually. SIMDe lets (for example) SSE/AVX and NEON code -exist side-by-side, in the same implementation. - -Second, SIMDe makes it easier to write code targeting [ISA](https://en.wikipedia.org/wiki/Instruction_set_architecture) -extensions you don't have convenient access to. 
You can run NEON code on your -x86 machine *without an emulator*. Obviously you'll eventually want -to test on the actual hardware you're targeting, but for most -development, SIMDe can provide a much easier path. - -SIMDe takes a very different approach from most other SIMD abstraction -layers in that it aims to expose the entire functionality of the -underlying instruction set. Instead of limiting functionality to the -lowest common denominator, SIMDe tries to minimize the amount of -effort required to port while still allowing you the space to optimize -as needed. - -The current focus is on writing complete portable implementations, -though a large number of functions already have accelerated -implementations using one (or more) of the following: - - * SIMD intrinsics from other ISA extensions (e.g., using NEON to - implement SSE). - * Compiler-specific vector extensions and built-ins such as - [`__builtin_shufflevector`](http://clang.llvm.org/docs/LanguageExtensions.html#langext-builtin-shufflevector) - and - [`__builtin_convertvector`](http://clang.llvm.org/docs/LanguageExtensions.html#langext-builtin-convertvector) - * Compiler auto-vectorization hints, using: - * [OpenMP 4 SIMD](http://www.openmp.org/) - * [Cilk Plus](https://www.cilkplus.org/) - * [GCC loop-specific pragmas](https://gcc.gnu.org/onlinedocs/gcc/Loop-Specific-Pragmas.html) - * [clang pragma loop hint directives](http://llvm.org/docs/Vectorizers.html#pragma-loop-hint-directives) - -You can [try SIMDe online](https://simde.netlify.app/godbolt/demo) -using Compiler Explorer and an amalgamated SIMDe header. - -If you have any questions, please feel free to use the -[issue tracker](https://github.com/simd-everywhere/simde/issues) or the -[mailing list](https://groups.google.com/forum/#!forum/simde). 
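As a minimal sketch of the idea described above (not taken from the removed README itself; the data, file name, and `main` wrapper are purely illustrative), the snippet below uses SIMDe's documented prefixed API from `simde/x86/sse2.h`. The same translation unit should build on x86, ARM, or any other supported target, with SIMDe mapping to native instructions where they exist and portable fallbacks where they do not.

```c
/* Illustrative sketch only: SSE2-style code through SIMDe's prefixed API.
 * On x86/x86_64 these calls map to the native intrinsics; on other targets
 * SIMDe supplies portable (or NEON/AltiVec/etc.) implementations. */
#include <stdio.h>
#include <stdint.h>
#include "simde/x86/sse2.h"

int main(void) {
    int32_t a[4] = {1, 2, 3, 4};
    int32_t b[4] = {10, 20, 30, 40};
    int32_t r[4];

    simde__m128i va = simde_mm_loadu_si128((const simde__m128i *) a);
    simde__m128i vb = simde_mm_loadu_si128((const simde__m128i *) b);
    simde__m128i vr = simde_mm_add_epi32(va, vb);       /* lane-wise 32-bit add */
    simde_mm_storeu_si128((simde__m128i *) r, vr);

    printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]);     /* prints: 11 22 33 44 */
    return 0;
}
```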
- -## Current Status - -There are currently complete implementations of the following instruction -set extensions: - -* ARM - * [NEON](https://en.wikipedia.org/wiki/ARM_architecture_family#Advanced_SIMD_(Neon)) [List](https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]) -* x86 / x86_64 - * [MMX](https://en.wikipedia.org/wiki/MMX_(instruction_set)) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#techs=MMX) - * [SSE](https://en.wikipedia.org/wiki/Streaming_SIMD_Extensions) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ssetechs=SSE) - * [SSE2](https://en.wikipedia.org/wiki/SSE2) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ssetechs=SSE2) - * [SSE3](https://en.wikipedia.org/wiki/SSE3) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ssetechs=SSE3) - * [SSSE3](https://en.wikipedia.org/wiki/SSSE3) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ssetechs=SSSE3) - * [SSE4.1](https://en.wikipedia.org/wiki/SSE4#SSE4.1) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ssetechs=SSE4_1) - * [AVX](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=AVX) - * [AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions#Advanced_Vector_Extensions_2) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=AVX2) - * [F16C](https://en.wikipedia.org/wiki/F16C) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=F16C) - * [FMA](https://en.wikipedia.org/wiki/FMA_instruction_set) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#avxnewtechs=FMA) - * [GFNI](https://en.wikipedia.org/wiki/AVX-512#GFNI) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#othertechs=GFNI) - * [CLMUL](https://en.wikipedia.org/wiki/CLMUL_instruction_set) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=clmul&ig_expand=770&othertechs=PCLMULQDQ,VPCLMULQDQ) - * [XOP](https://en.wikipedia.org/wiki/XOP_instruction_set) - * [SVML](https://software.intel.com/content/www/us/en/develop/documentation/cpp-compiler-developer-guide-and-reference/top/compiler-reference/intrinsics/intrinsics-for-intel-advanced-vector-extensions-512-intel-avx-512-instructions/intrinsics-for-arithmetic-operations-1/intrinsics-for-short-vector-math-library-svml-operations.html) [List](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#ig_expand=770&techs=SVML) -* WebAssembly - * [SIMD128](https://github.com/WebAssembly/simd) - -As well as partial support for many others, including NEON and SVE in -addition to several AVX-512 extensions. See the -[instruction-set-support](https://github.com/simd-everywhere/simde/issues?q=is%3Aissue+is%3Aopen+label%3Ainstruction-set-support+sort%3Aupdated-desc) -label in the issue tracker for details on progress. If you'd like to -be notified when an instruction set is available you may subscribe to -the relevant issue. - -If you have a project you're interested in using with SIMDe but we -don't yet support all the functions you need, please file an issue -with a list of what's missing so we know what to prioritize. 
-
-The default branch is protected so commits never reach it unless
-they have passed extensive CI checks. Status badges don't really
-make sense since they will always be green, but here are the links:
-
-* [GitHub Actions](https://github.com/simd-everywhere/simde/actions)
-* [Cirrus CI](https://cirrus-ci.com/github/simd-everywhere/simde)
-* [Semaphore CI](https://nemequ.semaphoreci.com/projects/simde)
-* [Circle CI](https://app.circleci.com/pipelines/github/simd-everywhere/simde)
-* [AppVeyor](https://ci.appveyor.com/project/nemequ/simde)
-* [Azure Pipelines](https://dev.azure.com/simd-everywhere/SIMDe/_build)
-* [Drone CI](https://cloud.drone.io/simd-everywhere/simde/)
-* [Travis CI](https://app.travis-ci.com/github/simd-everywhere/simde/)
-* [Packit CI](https://dashboard.packit.dev/projects/github.com/simd-everywhere/simde)
-
-If you're adding a new build I suggest Cirrus CI, which is where we
-currently have the most room given the number of builds currently on
-the platform and the quotas for free/open-source usage. Alternatively,
-feel free to set up another provider (such as
-[Codefresh](https://codefresh.io/),
-[Shippable](https://www.shippable.com/),
-[Bitrise](https://www.bitrise.io/),
-[Wercker](https://app.wercker.com/), etc.).
-
-*Notice*: we plan on changing the name of the default branch from
-"master" to something else soon; we are just trying to wait to see what
-name git settles on so we can be consistent.
-
-## Contributing
-
-First off, if you're reading this: thank you! Even considering
-contributing to SIMDe is very much appreciated!
-
-SIMDe is a fairly large undertaking; there are a *lot* of functions to
-get through and a lot of opportunities for optimization on different
-platforms, so we're very happy for any help you can provide.
-
-Programmers of all skill levels are welcome; there are lots of tasks
-which are pretty straightforward and don't require any special
-expertise.
-
-If you're not sure how you'd like to contribute, please consider taking
-a look at [the issue tracker](https://github.com/simd-everywhere/simde/issues).
-There is a [good first issue](https://github.com/simd-everywhere/simde/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22)
-tag if you want to ease into your first contributions, but if you're
-interested in something else please get in touch via the issue tracker;
-we're happy to help you get a handle on whatever you are interested in.
-
-If you're interested in implementing currently unimplemented functions,
-there is [a
-guide](https://github.com/simd-everywhere/simde/wiki/Implementing-a-New-Function)
-explaining how to add new functions and how to quickly and easily get
-a test case in place. It's a bit rough right now, but if anything is
-unclear please feel free to use the issue tracker to ask about
-anything you're not clear on.
-
-## Usage
-
-First, it is important to note that *you do not need two separate
-versions* (one using SIMDe, the other native). If the native functions
-are available SIMDe will use them, and compilers easily optimize away
-any overhead from SIMDe; all they have to do is some basic inlining.
-`-O2` should be enough, but we strongly recommend `-O3` (or whatever
-flag instructs your compiler to aggressively optimize) since many of
-the portable fallbacks are substantially faster with aggressive
-auto-vectorization that isn't enabled at lower optimization levels.
-
-Each instruction set has a separate file; `x86/mmx.h` for MMX,
-`x86/sse.h` for SSE, `x86/sse2.h` for SSE2, and so on.
Just include -the header for whichever instruction set(s) you want *instead of the -native version* (if you include the native version after SIMDe it will -result in compile-time errors if native aliases are enabled). SIMDe -will provide the fastest implementation it can given which extensions -you've enabled in your compiler (i.e., if you want to use NEON to -implement SSE, you may need to pass something like `-mfpu=neon` -or `-march=armv8-a+simd`. See -[GCC ARM-Options](https://gcc.gnu.org/onlinedocs/gcc/ARM-Options.html) -for more information). - -If you define `SIMDE_ENABLE_NATIVE_ALIASES` before including SIMDe -you can use the same names as the native functions. Unfortunately, -this is somewhat error-prone due to portability issues in the APIs, so -it's recommended to only do this for testing. When -`SIMDE_ENABLE_NATIVE_ALIASES` is undefined only the versions prefixed -with `simde_` will be available; for example, the MMX `_mm_add_pi8` -intrinsic becomes `simde_mm_add_pi8`, and `__m64` becomes `simde__m64`. - -Since SIMDe is meant to be portable, many functions which assume types -are of a specific size have been altered to use fixed-width types -instead. For example, Intel's APIs use `char` for signed 8-bit -integers, but `char` on ARM is generally unsigned. SIMDe uses `int8_t` -to make the API portable, but that means your code may require some -minor changes (such as using `int8_t` instead of `char`) to work on -other platforms. - -That said, the changes are usually quite minor. It's often enough to -just use search and replace, manual changes are required pretty -infrequently. - -### OpenMP 4 SIMD - -SIMDe makes extensive use of annotations to help the compiler vectorize -code. By far the best annotations use the SIMD support built in to -OpenMP 4, so if your compiler supports these annotations we strongly -recommend you enable them. - -If you are already using OpenMP, SIMDe will automatically detect it -using the `_OPENMP` macro and no further action is required. - -Some compilers allow you to enable OpenMP SIMD *without* enabling the -full OpenMP. In such cases there is no runtime dependency on OpenMP -and no runtime overhead; SIMDe will just be faster. Unfortunately, -SIMDe has no way to detect such situations (the `_OPENMP` macro is not -defined), so after enabling it in your compiler you'll need to define -`SIMDE_ENABLE_OPENMP` (e.g., by passing `-DSIMDE_ENABLE_OPENMP`) to get -SIMDe to output the relevant pragmas. - -Enabling OpenMP SIMD support varies by compiler: - - * GCC 4.9+ and clang 6+ support a `-fopenmp-simd` command line flag. - * ICC supports a `-qopenmp-simd` command line flag. - * MCST's LCC enables OpenMP SIMD by default, so no flags are needed - (technically you don't even need to pass `-DSIMDE_ENABLE_OPENMP`). - -We are not currently aware of any other compilers which allow you to -enable OpenMP SIMD support without enabling full OpenMP (if you are -please file an issue to let us know). You should determine whether you -wish to enable full OpenMP support on a case-by-case basis, but it is -likely that the overhead of linking to (but not using) the OpenMP -runtime library will be dwarfed by the performance improvements from -using the OpenMP SIMD annotations in SIMDe. 
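-
-As a concrete sketch of the points above (the include path, and the choice
-of MMX as the example instruction set, are assumptions made here for
-illustration; adjust them for whichever extension you actually target):
-
-```c
-/* Illustrative only. SIMDE_ENABLE_OPENMP is needed only when OpenMP SIMD is
- * enabled in the compiler without full OpenMP (so _OPENMP is not defined),
- * and native aliases are optional and mainly recommended for testing. */
-#define SIMDE_ENABLE_OPENMP
-#define SIMDE_ENABLE_NATIVE_ALIASES
-#include "simde/x86/mmx.h"   /* included instead of <mmintrin.h> */
-
-simde__m64 add_bytes(simde__m64 a, simde__m64 b) {
-  return simde_mm_add_pi8(a, b);   /* also reachable as _mm_add_pi8() here */
-}
-```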
- -If you choose not to use OpenMP SIMD, SIMDe also supports -using [Cilk Plus](https://www.cilkplus.org/), [GCC loop-specific -pragmas](https://gcc.gnu.org/onlinedocs/gcc/Loop-Specific-Pragmas.html), -or [clang pragma loop hint -directives](http://llvm.org/docs/Vectorizers.html#pragma-loop-hint-directives), -though these are not nearly as effective as OpenMP SIMD and depending -on them will likely result in less efficient code. All of these are -detected automatically by SIMDe, so if they are enabled in your -compiler nothing more is required. - -If for some reason you do not wish to enable OpenMP 4 SIMD support even -though SIMDe detects it, you should define `SIMDE_DISABLE_OPENMP` prior -to including SIMDe. - -## Portability - -### Compilers - -SIMDe does depend on some C99 features, though the subset supported by -MSVC also works. While we do our best to make sure we provide optimized -implementations where they are supported, SIMDe does contain portable -fallbacks which are designed to work on any C99 compiler. - -Every commit is tested in CI on multiple compilers, platforms, and -configurations, and our test coverage is extremely extensive. -Currently tested compilers include: - - * GCC versions back to 4.8 - * Clang versions back to 3.8 - * Microsoft Visual Studio back to 12 (2013) - * IBM XL C/C++ - * Intel C/C++ Compiler (ICC) - -I'm generally willing to accept patches to add support for other -compilers, as long as they're not too disruptive, *especially* if we -can get CI support going. If using one of our existing CI providers -isn't an option then other CI platforms can be added. - -### Hardware - -The following architectures are tested in CI for every commit: - - * x86_64 - * x86 - * AArch64 - * ARMv8 - * ARMv7 - * PPC64 - * MIPS Loongson - -We would love to add more, so patches are extremely welcome! - -## Related Projects - - * The "builtins" module in - [portable-snippets](https://github.com/nemequ/portable-snippets) - does much the same thing, but for compiler-specific intrinsics - (think `__builtin_clz` and `_BitScanForward`), **not** SIMD - intrinsics. - * Intel offers an emulator, the [Intel® Software Development - Emulator](https://software.intel.com/en-us/articles/intel-software-development-emulator/) - which can be used to develop software which uses Intel intrinsics - without having to own hardware which supports them, though it - doesn't help for deployment. - * [Iris](https://github.com/AlexYaruki/iris) is the only other project - I'm aware of which is attempting to create portable implementations - like SIMDe. SIMDe is much further along on the Intel side, but Iris - looks to be in better shape on ARM. C++-only, Apache 2.0 license. - AFAICT there are no accelerated fallbacks, nor is there a good way to - add them since it relies extensively on templates. - * There are a few projects trying to implement one set with another: - * [ARM_NEON_2_x86_SSE](https://github.com/intel/ARM_NEON_2_x86_SSE) - — implementing NEON using SSE. Quite extensive, Apache 2.0 - license. - * [sse2neon](https://github.com/jratcliff63367/sse2neon) — - implementing SSE using NEON. This code has already been merged - into SIMDe. - * [veclib](https://github.com/IvantheDugtrio/veclib) — implementing - SSE2 using AltiVec/VMX, using a non-free IBM library called - [powerveclib](https://www.ibm.com/developerworks/community/groups/community/powerveclib/) - * [SSE-to-NEON](https://github.com/otim/SSE-to-NEON) — implementing - SSE with NEON. Non-free, C++. 
- * [AvxToNeon](https://github.com/kunpengcompute/AvxToNeon) — Popular
-   AVX+ intrinsics implemented in NEON. C, Apache 2.0 license.
- * [arm-neon-tests](https://github.com/christophe-lyon/arm-neon-tests)
-   contains tests to verify NEON implementations.
-
-If you know of any other related projects, please [let us
-know](https://github.com/simd-everywhere/simde/issues/new)!
-
-## Caveats
-
-Sometimes features can't be emulated. If SIMDe is operating in native
-mode the functions will work as expected, but if there is no native
-support some caveats apply:
-
- * Many functions require <math.h> and/or <fenv.h>. SIMDe will still
-   work without those headers, but the results of those functions are
-   undefined.
- * x86 / x86_64
-   * SSE
-     * `SIMDE_MM_SET_ROUNDING_MODE()` will use `fesetround()`, altering
-       the global rounding mode.
-     * `simde_mm_getcsr` and `simde_mm_setcsr` only implement bits 13
-       and 14 (rounding mode).
-   * AVX
-     * `simde_mm256_test*` do not set the CF/ZF registers as there is
-       no portable way to implement that functionality.
-     * `simde_mm256_zeroall` and `simde_mm256_zeroupper` are not
-       implemented as there is no portable way to implement that
-       functionality.
-
-Additionally, there are some known limitations which apply when using
-native aliases (`SIMDE_ENABLE_NATIVE_ALIASES`):
-
-* On Windows x86 (but not x86_64), some MMX functions and SSE/SSE2
-  functions which use MMX types (__m64) other than for pointers may
-  return incorrect results.
-
-Also, as mentioned earlier, while some APIs make assumptions about
-basic types (*e.g.*, `int` is 32 bits), SIMDe does not, so many types
-have been altered to use portable fixed-width versions such as
-`int32_t`.
-
-If you find any other differences, please file an issue so we can either fix
-it or add it to the list above.
-
-## Benefactors
-
-SIMDe uses resources provided for free by a number of organizations.
-While this shouldn't be taken to imply endorsement of SIMDe, we're
-tremendously grateful for their support:
-
- * [IntegriCloud](https://integricloud.com/) — provides access to a very
   fast POWER9 server for developing AltiVec/VMX support.
 * [GCC Compile Farm](https://gcc.gnu.org/wiki/CompileFarm) — provides
   access to a wide range of machines with different architectures for
   developing support for various ISA extensions.
 * [CodeCov.io](https://codecov.io/) — provides code coverage analysis
   for our test cases.
 * [Google](https://www.google.com/) — financing
   [Summer of Code](https://summerofcode.withgoogle.com/), substantial
   amounts of code (Sean Maher's contributions), and an [Open Source Peer
   Bonus](https://opensource.google/docs/growing/peer-bonus/).
-
-Without such organizations donating resources, SIMDe wouldn't be nearly
-as useful or usable as it is today.
-
-We would also like to thank anyone who has helped develop the myriad
-of software on which SIMDe relies, including compilers and analysis
-tools.
-
-Finally, a special thank you to
-[anyone who has contributed](https://github.com/simd-everywhere/simde/graphs/contributors)
-to SIMDe, filed bugs, provided suggestions, or helped with SIMDe
-development in any way.
-
-## License
-
-SIMDe is distributed under an MIT-style license; see COPYING for
-details.
-
-## Contributors ✨
-
-Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
-
-* Evan Nemerson: 💻 🖋 📖 💡 🤔 💬 👀 ⚠️ 📢 🐛 🚇 🚧 📆
-* Michael R. Crusoe: 🐛 💻 📋 🔍 🤔 🚇 📦 ⚠️ 🚧 📆 👀
-* HIMANSHI MATHUR: 💻 ⚠️
-* Hidayat Khan: 💻 ⚠️
-* rosbif: 💻 ⚠️ 🐛 🤔 📖
-* Jun Aruga: 💻 🤔 📦 🚇 🚧 ⚠️ 🐛
-* Élie ROUDNINSKI: 💻 ⚠️
-* Jesper Storm Bache: 💻
-* Jeff Daily: 💻 🚇
-* Pavel: 💻
-* Sabarish Bollapragada: 💻
-* Gavin Li: 💻
-* Yining Karl Li: 💻
-* Anirban Dey: 📖
-* Darren Ng: 📖
-* FaresSalem: 📖
-* Pradnyesh Gore: 💻
-* Sean Maher: 💻
-* Mingye Wang: 📖
-* Ng Zhi An: 💻 📖
-* Atharva Nimbalkar: 💻 ⚠️
-* simba611: 💻 ⚠️
-* Ashleigh Newman-Jones: 💻 ⚠️
-* Willy R. Vasquez: 💻 🚧 ⚠️
-* Keith Winstein: 💻 🚧 ⚠️
-* David Seifert: 🚧
-* Milot Mirdita: 💻 🚧 ⚠️
-* aqrit: 💻 🚧
-* Décio Luiz Gazzoni Filho: 💻 🚧 ⚠️
-* Igor Molchanov: 💻 🚧 📦
-* Andrew Rodriguez: 💻 🚧 ⚠️
-* Changqing Jing: 🚧
-* JP Cimalando: 💻 🚇
-* Jiaxun Yang: 💻 📦
-* Masahiro Kitagawa: 💻 ⚠️
-* Pavel Iatchenii: 💻 ⚠️
-* Tommy Vercetti: 🚧
-* Robert Cohn: 🚧
-* Adam Novak: 📖
-* boris-kuz: 🚧
-* Dimo Markov: 🚧
-* dblue: 🚧
-* zekehul: 💻 🚧
-* Laurent Thomas: 💻
-* Max Bachmann: 📖
-* psaab: 🚧
-* Sam Clegg: 🚧
-* Thomas Lively: 🐛 🤔 🚧
-* coderzh: 💻 ⚠️
-* Dominik Kutra: 💻 ⚠️
-* Lithrein: 🚧
-* Nick: 🚧
-* thomasdwu: 🚧
-* Stephen: 🐛
-* John Platts: 🐛
-* Steven Noonan: 🐛
-* p0nce: 🐛
-* Paul Wise: 🐛
-* easyaspi314 (Devin): 🐛 💻
-* JonLiu1993: 📦
-* Cheney Wang: 📦
-* myd7349: 📦
-* chausner: 📦
-* Yi-Yen Chung: 💻 ⚠️
-* Chi-Wei Chu: 💻 ⚠️
-* M-HT: 💻
-* Simon Gene Gottlieb: 💻
-* Chris Bielow: 💻
-* gu xiwei: 📦 ⚠️
-* George Vinokhodov: 💻
-
-This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind are welcome!
diff --git a/ffi-deps/simde/simde/check.h b/ffi-deps/simde/simde/check.h
deleted file mode 100644
index 7d17d29..0000000
--- a/ffi-deps/simde/simde/check.h
+++ /dev/null
@@ -1,276 +0,0 @@
-/* Check (assertions)
- * Portable Snippets - https://github.com/nemequ/portable-snippets
- * Created by Evan Nemerson
- *
- * To the extent possible under law, the authors have waived all
- * copyright and related or neighboring rights to this code. For
- * details, see the Creative Commons Zero 1.0 Universal license at
- * https://creativecommons.org/publicdomain/zero/1.0/
- *
- * SPDX-License-Identifier: CC0-1.0
- */
-
-#if !defined(SIMDE_CHECK_H)
-#define SIMDE_CHECK_H
-
-#if !defined(SIMDE_NDEBUG) && !defined(SIMDE_DEBUG)
-# define SIMDE_NDEBUG 1
-#endif
-
-#include "hedley.h"
-#include "simde-diagnostic.h"
-#include <stdint.h>
-
-#if !defined(_WIN32)
-# define SIMDE_SIZE_MODIFIER "z"
-# define SIMDE_CHAR_MODIFIER "hh"
-# define SIMDE_SHORT_MODIFIER "h"
-#else
-# if defined(_M_X64) || defined(__amd64__)
-# define SIMDE_SIZE_MODIFIER "I64"
-# else
-# define SIMDE_SIZE_MODIFIER ""
-# endif
-# define SIMDE_CHAR_MODIFIER ""
-# define SIMDE_SHORT_MODIFIER ""
-#endif
-
-#if defined(_MSC_VER) && (_MSC_VER >= 1500)
-# define SIMDE_PUSH_DISABLE_MSVC_C4127_ __pragma(warning(push)) __pragma(warning(disable:4127))
-# define SIMDE_POP_DISABLE_MSVC_C4127_ __pragma(warning(pop))
-#else
-# define SIMDE_PUSH_DISABLE_MSVC_C4127_
-# define SIMDE_POP_DISABLE_MSVC_C4127_
-#endif
-
-#if !defined(simde_errorf)
-# if defined(__has_include)
-# if __has_include(<stdio.h>)
-# include <stdio.h>
-# endif
-# elif defined(SIMDE_STDC_HOSTED)
-# if SIMDE_STDC_HOSTED == 1
-# include <stdio.h>
-# endif
-# elif defined(__STDC_HOSTED__)
-# if __STDC_HOSTED__ == 1
-# include <stdio.h>
-# endif
-# endif
-
-# include "debug-trap.h"
-
-  HEDLEY_DIAGNOSTIC_PUSH
-  SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_
-# if defined(EOF)
-# define simde_errorf(format, ...) (fprintf(stderr, format, __VA_ARGS__), abort())
-# else
-# define simde_errorf(format, ...) (simde_trap())
-# endif
-  HEDLEY_DIAGNOSTIC_POP
-#endif
-
-#define simde_error(msg) simde_errorf("%s", msg)
-
-#if defined(SIMDE_NDEBUG) || \
-    (defined(__cplusplus) && (__cplusplus < 201103L)) || \
-    (defined(__STDC__) && (__STDC__ < 199901L))
-# if defined(SIMDE_CHECK_FAIL_DEFINED)
-# define simde_assert(expr)
-# else
-# if defined(HEDLEY_ASSUME)
-# define simde_assert(expr) HEDLEY_ASSUME(expr)
-# elif HEDLEY_GCC_VERSION_CHECK(4,5,0)
-# define simde_assert(expr) ((void) (!!(expr) ?
1 : (__builtin_unreachable(), 1))) -# elif HEDLEY_MSVC_VERSION_CHECK(13,10,0) -# define simde_assert(expr) __assume(expr) -# else -# define simde_assert(expr) -# endif -# endif -# define simde_assert_true(expr) simde_assert(expr) -# define simde_assert_false(expr) simde_assert(!(expr)) -# define simde_assert_type_full(prefix, suffix, T, fmt, a, op, b) simde_assert(((a) op (b))) -# define simde_assert_double_equal(a, b, precision) -# define simde_assert_string_equal(a, b) -# define simde_assert_string_not_equal(a, b) -# define simde_assert_memory_equal(size, a, b) -# define simde_assert_memory_not_equal(size, a, b) -#else -# define simde_assert(expr) \ - do { \ - if (!HEDLEY_LIKELY(expr)) { \ - simde_error("assertion failed: " #expr "\n"); \ - } \ - SIMDE_PUSH_DISABLE_MSVC_C4127_ \ - } while (0) \ - SIMDE_POP_DISABLE_MSVC_C4127_ - -# define simde_assert_true(expr) \ - do { \ - if (!HEDLEY_LIKELY(expr)) { \ - simde_error("assertion failed: " #expr " is not true\n"); \ - } \ - SIMDE_PUSH_DISABLE_MSVC_C4127_ \ - } while (0) \ - SIMDE_POP_DISABLE_MSVC_C4127_ - -# define simde_assert_false(expr) \ - do { \ - if (!HEDLEY_LIKELY(!(expr))) { \ - simde_error("assertion failed: " #expr " is not false\n"); \ - } \ - SIMDE_PUSH_DISABLE_MSVC_C4127_ \ - } while (0) \ - SIMDE_POP_DISABLE_MSVC_C4127_ - -# define simde_assert_type_full(prefix, suffix, T, fmt, a, op, b) \ - do { \ - T simde_tmp_a_ = (a); \ - T simde_tmp_b_ = (b); \ - if (!(simde_tmp_a_ op simde_tmp_b_)) { \ - simde_errorf("assertion failed: %s %s %s (" prefix "%" fmt suffix " %s " prefix "%" fmt suffix ")\n", \ - #a, #op, #b, simde_tmp_a_, #op, simde_tmp_b_); \ - } \ - SIMDE_PUSH_DISABLE_MSVC_C4127_ \ - } while (0) \ - SIMDE_POP_DISABLE_MSVC_C4127_ - -# define simde_assert_double_equal(a, b, precision) \ - do { \ - const double simde_tmp_a_ = (a); \ - const double simde_tmp_b_ = (b); \ - const double simde_tmp_diff_ = ((simde_tmp_a_ - simde_tmp_b_) < 0) ? \ - -(simde_tmp_a_ - simde_tmp_b_) : \ - (simde_tmp_a_ - simde_tmp_b_); \ - if (HEDLEY_UNLIKELY(simde_tmp_diff_ > 1e-##precision)) { \ - simde_errorf("assertion failed: %s == %s (%0." #precision "g == %0." 
#precision "g)\n", \ - #a, #b, simde_tmp_a_, simde_tmp_b_); \ - } \ - SIMDE_PUSH_DISABLE_MSVC_C4127_ \ - } while (0) \ - SIMDE_POP_DISABLE_MSVC_C4127_ - -# include -# define simde_assert_string_equal(a, b) \ - do { \ - const char* simde_tmp_a_ = a; \ - const char* simde_tmp_b_ = b; \ - if (HEDLEY_UNLIKELY(strcmp(simde_tmp_a_, simde_tmp_b_) != 0)) { \ - simde_errorf("assertion failed: string %s == %s (\"%s\" == \"%s\")\n", \ - #a, #b, simde_tmp_a_, simde_tmp_b_); \ - } \ - SIMDE_PUSH_DISABLE_MSVC_C4127_ \ - } while (0) \ - SIMDE_POP_DISABLE_MSVC_C4127_ - -# define simde_assert_string_not_equal(a, b) \ - do { \ - const char* simde_tmp_a_ = a; \ - const char* simde_tmp_b_ = b; \ - if (HEDLEY_UNLIKELY(strcmp(simde_tmp_a_, simde_tmp_b_) == 0)) { \ - simde_errorf("assertion failed: string %s != %s (\"%s\" == \"%s\")\n", \ - #a, #b, simde_tmp_a_, simde_tmp_b_); \ - } \ - SIMDE_PUSH_DISABLE_MSVC_C4127_ \ - } while (0) \ - SIMDE_POP_DISABLE_MSVC_C4127_ - -# define simde_assert_memory_equal(size, a, b) \ - do { \ - const unsigned char* simde_tmp_a_ = (const unsigned char*) (a); \ - const unsigned char* simde_tmp_b_ = (const unsigned char*) (b); \ - const size_t simde_tmp_size_ = (size); \ - if (HEDLEY_UNLIKELY(memcmp(simde_tmp_a_, simde_tmp_b_, simde_tmp_size_)) != 0) { \ - size_t simde_tmp_pos_; \ - for (simde_tmp_pos_ = 0 ; simde_tmp_pos_ < simde_tmp_size_ ; simde_tmp_pos_++) { \ - if (simde_tmp_a_[simde_tmp_pos_] != simde_tmp_b_[simde_tmp_pos_]) { \ - simde_errorf("assertion failed: memory %s == %s, at offset %" SIMDE_SIZE_MODIFIER "u\n", \ - #a, #b, simde_tmp_pos_); \ - break; \ - } \ - } \ - } \ - SIMDE_PUSH_DISABLE_MSVC_C4127_ \ - } while (0) \ - SIMDE_POP_DISABLE_MSVC_C4127_ - -# define simde_assert_memory_not_equal(size, a, b) \ - do { \ - const unsigned char* simde_tmp_a_ = (const unsigned char*) (a); \ - const unsigned char* simde_tmp_b_ = (const unsigned char*) (b); \ - const size_t simde_tmp_size_ = (size); \ - if (HEDLEY_UNLIKELY(memcmp(simde_tmp_a_, simde_tmp_b_, simde_tmp_size_)) == 0) { \ - simde_errorf("assertion failed: memory %s != %s (%" SIMDE_SIZE_MODIFIER "u bytes)\n", \ - #a, #b, simde_tmp_size_); \ - } \ - SIMDE_PUSH_DISABLE_MSVC_C4127_ \ - } while (0) \ - SIMDE_POP_DISABLE_MSVC_C4127_ -#endif - -#define simde_assert_type(T, fmt, a, op, b) \ - simde_assert_type_full("", "", T, fmt, a, op, b) - -#define simde_assert_char(a, op, b) \ - simde_assert_type_full("'\\x", "'", char, "02" SIMDE_CHAR_MODIFIER "x", a, op, b) -#define simde_assert_uchar(a, op, b) \ - simde_assert_type_full("'\\x", "'", unsigned char, "02" SIMDE_CHAR_MODIFIER "x", a, op, b) -#define simde_assert_short(a, op, b) \ - simde_assert_type(short, SIMDE_SHORT_MODIFIER "d", a, op, b) -#define simde_assert_ushort(a, op, b) \ - simde_assert_type(unsigned short, SIMDE_SHORT_MODIFIER "u", a, op, b) -#define simde_assert_int(a, op, b) \ - simde_assert_type(int, "d", a, op, b) -#define simde_assert_uint(a, op, b) \ - simde_assert_type(unsigned int, "u", a, op, b) -#define simde_assert_long(a, op, b) \ - simde_assert_type(long int, "ld", a, op, b) -#define simde_assert_ulong(a, op, b) \ - simde_assert_type(unsigned long int, "lu", a, op, b) -#define simde_assert_llong(a, op, b) \ - simde_assert_type(long long int, "lld", a, op, b) -#define simde_assert_ullong(a, op, b) \ - simde_assert_type(unsigned long long int, "llu", a, op, b) - -#define simde_assert_size(a, op, b) \ - simde_assert_type(size_t, SIMDE_SIZE_MODIFIER "u", a, op, b) - -#define simde_assert_float(a, op, b) \ - simde_assert_type(float, "f", a, op, b) 
-#define simde_assert_double(a, op, b) \ - simde_assert_type(double, "g", a, op, b) -#define simde_assert_ptr(a, op, b) \ - simde_assert_type(const void*, "p", a, op, b) - -#define simde_assert_int8(a, op, b) \ - simde_assert_type(int8_t, PRIi8, a, op, b) -#define simde_assert_uint8(a, op, b) \ - simde_assert_type(uint8_t, PRIu8, a, op, b) -#define simde_assert_int16(a, op, b) \ - simde_assert_type(int16_t, PRIi16, a, op, b) -#define simde_assert_uint16(a, op, b) \ - simde_assert_type(uint16_t, PRIu16, a, op, b) -#define simde_assert_int32(a, op, b) \ - simde_assert_type(int32_t, PRIi32, a, op, b) -#define simde_assert_uint32(a, op, b) \ - simde_assert_type(uint32_t, PRIu32, a, op, b) -#define simde_assert_int64(a, op, b) \ - simde_assert_type(int64_t, PRIi64, a, op, b) -#define simde_assert_uint64(a, op, b) \ - simde_assert_type(uint64_t, PRIu64, a, op, b) - -#define simde_assert_ptr_equal(a, b) \ - simde_assert_ptr(a, ==, b) -#define simde_assert_ptr_not_equal(a, b) \ - simde_assert_ptr(a, !=, b) -#define simde_assert_null(ptr) \ - simde_assert_ptr(ptr, ==, NULL) -#define simde_assert_not_null(ptr) \ - simde_assert_ptr(ptr, !=, NULL) -#define simde_assert_ptr_null(ptr) \ - simde_assert_ptr(ptr, ==, NULL) -#define simde_assert_ptr_not_null(ptr) \ - simde_assert_ptr(ptr, !=, NULL) - -#endif /* !defined(SIMDE_CHECK_H) */ diff --git a/ffi-deps/simde/simde/debug-trap.h b/ffi-deps/simde/simde/debug-trap.h deleted file mode 100644 index 2d3c60f..0000000 --- a/ffi-deps/simde/simde/debug-trap.h +++ /dev/null @@ -1,85 +0,0 @@ -/* Debugging assertions and traps - * Portable Snippets - https://github.com/nemequ/portable-snippets - * Created by Evan Nemerson - * - * To the extent possible under law, the authors have waived all - * copyright and related or neighboring rights to this code. 
For
- * details, see the Creative Commons Zero 1.0 Universal license at
- * https://creativecommons.org/publicdomain/zero/1.0/
- *
- * SPDX-License-Identifier: CC0-1.0
- */
-
-#if !defined(SIMDE_DEBUG_TRAP_H)
-#define SIMDE_DEBUG_TRAP_H
-
-#if !defined(SIMDE_NDEBUG) && defined(NDEBUG) && !defined(SIMDE_DEBUG)
-# define SIMDE_NDEBUG 1
-#endif
-
-#if defined(__has_builtin) && !defined(__ibmxl__)
-# if __has_builtin(__builtin_debugtrap)
-# define simde_trap() __builtin_debugtrap()
-# elif __has_builtin(__debugbreak)
-# define simde_trap() __debugbreak()
-# endif
-#endif
-#if !defined(simde_trap)
-# if defined(_MSC_VER) || defined(__INTEL_COMPILER)
-# define simde_trap() __debugbreak()
-# elif defined(__ARMCC_VERSION)
-# define simde_trap() __breakpoint(42)
-# elif defined(__ibmxl__) || defined(__xlC__)
-# include <builtins.h>
-# define simde_trap() __trap(42)
-# elif defined(__DMC__) && defined(_M_IX86)
-  static inline void simde_trap(void) { __asm int 3h; }
-# elif defined(__i386__) || defined(__x86_64__)
-  static inline void simde_trap(void) { __asm__ __volatile__("int $03"); }
-# elif defined(__thumb__)
-  static inline void simde_trap(void) { __asm__ __volatile__(".inst 0xde01"); }
-# elif defined(__aarch64__)
-  static inline void simde_trap(void) { __asm__ __volatile__(".inst 0xd4200000"); }
-# elif defined(__arm__)
-  static inline void simde_trap(void) { __asm__ __volatile__(".inst 0xe7f001f0"); }
-# elif defined (__alpha__) && !defined(__osf__)
-  static inline void simde_trap(void) { __asm__ __volatile__("bpt"); }
-# elif defined(_54_)
-  static inline void simde_trap(void) { __asm__ __volatile__("ESTOP"); }
-# elif defined(_55_)
-  static inline void simde_trap(void) { __asm__ __volatile__(";\n .if (.MNEMONIC)\n ESTOP_1\n .else\n ESTOP_1()\n .endif\n NOP"); }
-# elif defined(_64P_)
-  static inline void simde_trap(void) { __asm__ __volatile__("SWBP 0"); }
-# elif defined(_6x_)
-  static inline void simde_trap(void) { __asm__ __volatile__("NOP\n .word 0x10000000"); }
-# elif defined(__STDC_HOSTED__) && (__STDC_HOSTED__ == 0) && defined(__GNUC__)
-# define simde_trap() __builtin_trap()
-# else
-# include <signal.h>
-# if defined(SIGTRAP)
-# define simde_trap() raise(SIGTRAP)
-# else
-# define simde_trap() raise(SIGABRT)
-# endif
-# endif
-#endif
-
-#if defined(HEDLEY_LIKELY)
-# define SIMDE_DBG_LIKELY(expr) HEDLEY_LIKELY(expr)
-#elif defined(__GNUC__) && (__GNUC__ >= 3)
-# define SIMDE_DBG_LIKELY(expr) __builtin_expect(!!(expr), 1)
-#else
-# define SIMDE_DBG_LIKELY(expr) (!!(expr))
-#endif
-
-#if !defined(SIMDE_NDEBUG) || (SIMDE_NDEBUG == 0)
-# define simde_dbg_assert(expr) do { \
-    if (!SIMDE_DBG_LIKELY(expr)) { \
-      simde_trap(); \
-    } \
-  } while (0)
-#else
-# define simde_dbg_assert(expr)
-#endif
-
-#endif /* !defined(SIMDE_DEBUG_TRAP_H) */
diff --git a/ffi-deps/simde/simde/hedley.h b/ffi-deps/simde/simde/hedley.h
deleted file mode 100644
index 41ac302..0000000
--- a/ffi-deps/simde/simde/hedley.h
+++ /dev/null
@@ -1,2044 +0,0 @@
-/* Hedley - https://nemequ.github.io/hedley
- * Created by Evan Nemerson
- *
- * To the extent possible under law, the author(s) have dedicated all
- * copyright and related and neighboring rights to this software to
- * the public domain worldwide. This software is distributed without
- * any warranty.
- *
- * For details, see <https://creativecommons.org/publicdomain/zero/1.0/>.
- * SPDX-License-Identifier: CC0-1.0 - */ - -#if !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < 16) -#if defined(HEDLEY_VERSION) -# undef HEDLEY_VERSION -#endif -#define HEDLEY_VERSION 16 - -#if defined(HEDLEY_STRINGIFY_EX) -# undef HEDLEY_STRINGIFY_EX -#endif -#define HEDLEY_STRINGIFY_EX(x) #x - -#if defined(HEDLEY_STRINGIFY) -# undef HEDLEY_STRINGIFY -#endif -#define HEDLEY_STRINGIFY(x) HEDLEY_STRINGIFY_EX(x) - -#if defined(HEDLEY_CONCAT_EX) -# undef HEDLEY_CONCAT_EX -#endif -#define HEDLEY_CONCAT_EX(a,b) a##b - -#if defined(HEDLEY_CONCAT) -# undef HEDLEY_CONCAT -#endif -#define HEDLEY_CONCAT(a,b) HEDLEY_CONCAT_EX(a,b) - -#if defined(HEDLEY_CONCAT3_EX) -# undef HEDLEY_CONCAT3_EX -#endif -#define HEDLEY_CONCAT3_EX(a,b,c) a##b##c - -#if defined(HEDLEY_CONCAT3) -# undef HEDLEY_CONCAT3 -#endif -#define HEDLEY_CONCAT3(a,b,c) HEDLEY_CONCAT3_EX(a,b,c) - -#if defined(HEDLEY_VERSION_ENCODE) -# undef HEDLEY_VERSION_ENCODE -#endif -#define HEDLEY_VERSION_ENCODE(major,minor,revision) (((major) * 1000000) + ((minor) * 1000) + (revision)) - -#if defined(HEDLEY_VERSION_DECODE_MAJOR) -# undef HEDLEY_VERSION_DECODE_MAJOR -#endif -#define HEDLEY_VERSION_DECODE_MAJOR(version) ((version) / 1000000) - -#if defined(HEDLEY_VERSION_DECODE_MINOR) -# undef HEDLEY_VERSION_DECODE_MINOR -#endif -#define HEDLEY_VERSION_DECODE_MINOR(version) (((version) % 1000000) / 1000) - -#if defined(HEDLEY_VERSION_DECODE_REVISION) -# undef HEDLEY_VERSION_DECODE_REVISION -#endif -#define HEDLEY_VERSION_DECODE_REVISION(version) ((version) % 1000) - -#if defined(HEDLEY_GNUC_VERSION) -# undef HEDLEY_GNUC_VERSION -#endif -#if defined(__GNUC__) && defined(__GNUC_PATCHLEVEL__) -# define HEDLEY_GNUC_VERSION HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__) -#elif defined(__GNUC__) -# define HEDLEY_GNUC_VERSION HEDLEY_VERSION_ENCODE(__GNUC__, __GNUC_MINOR__, 0) -#endif - -#if defined(HEDLEY_GNUC_VERSION_CHECK) -# undef HEDLEY_GNUC_VERSION_CHECK -#endif -#if defined(HEDLEY_GNUC_VERSION) -# define HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (HEDLEY_GNUC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_MSVC_VERSION) -# undef HEDLEY_MSVC_VERSION -#endif -#if defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 140000000) && !defined(__ICL) -# define HEDLEY_MSVC_VERSION HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 10000000, (_MSC_FULL_VER % 10000000) / 100000, (_MSC_FULL_VER % 100000) / 100) -#elif defined(_MSC_FULL_VER) && !defined(__ICL) -# define HEDLEY_MSVC_VERSION HEDLEY_VERSION_ENCODE(_MSC_FULL_VER / 1000000, (_MSC_FULL_VER % 1000000) / 10000, (_MSC_FULL_VER % 10000) / 10) -#elif defined(_MSC_VER) && !defined(__ICL) -# define HEDLEY_MSVC_VERSION HEDLEY_VERSION_ENCODE(_MSC_VER / 100, _MSC_VER % 100, 0) -#endif - -#if defined(HEDLEY_MSVC_VERSION_CHECK) -# undef HEDLEY_MSVC_VERSION_CHECK -#endif -#if !defined(HEDLEY_MSVC_VERSION) -# define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (0) -#elif defined(_MSC_VER) && (_MSC_VER >= 1400) -# define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 10000000) + (minor * 100000) + (patch))) -#elif defined(_MSC_VER) && (_MSC_VER >= 1200) -# define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_FULL_VER >= ((major * 1000000) + (minor * 10000) + (patch))) -#else -# define HEDLEY_MSVC_VERSION_CHECK(major,minor,patch) (_MSC_VER >= ((major * 100) + (minor))) -#endif - -#if defined(HEDLEY_INTEL_VERSION) -# undef HEDLEY_INTEL_VERSION -#endif -#if defined(__INTEL_COMPILER) && 
defined(__INTEL_COMPILER_UPDATE) && !defined(__ICL) -# define HEDLEY_INTEL_VERSION HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, __INTEL_COMPILER_UPDATE) -#elif defined(__INTEL_COMPILER) && !defined(__ICL) -# define HEDLEY_INTEL_VERSION HEDLEY_VERSION_ENCODE(__INTEL_COMPILER / 100, __INTEL_COMPILER % 100, 0) -#endif - -#if defined(HEDLEY_INTEL_VERSION_CHECK) -# undef HEDLEY_INTEL_VERSION_CHECK -#endif -#if defined(HEDLEY_INTEL_VERSION) -# define HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (HEDLEY_INTEL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_INTEL_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_INTEL_CL_VERSION) -# undef HEDLEY_INTEL_CL_VERSION -#endif -#if defined(__INTEL_COMPILER) && defined(__INTEL_COMPILER_UPDATE) && defined(__ICL) -# define HEDLEY_INTEL_CL_VERSION HEDLEY_VERSION_ENCODE(__INTEL_COMPILER, __INTEL_COMPILER_UPDATE, 0) -#endif - -#if defined(HEDLEY_INTEL_CL_VERSION_CHECK) -# undef HEDLEY_INTEL_CL_VERSION_CHECK -#endif -#if defined(HEDLEY_INTEL_CL_VERSION) -# define HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (HEDLEY_INTEL_CL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_INTEL_CL_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_PGI_VERSION) -# undef HEDLEY_PGI_VERSION -#endif -#if defined(__PGI) && defined(__PGIC__) && defined(__PGIC_MINOR__) && defined(__PGIC_PATCHLEVEL__) -# define HEDLEY_PGI_VERSION HEDLEY_VERSION_ENCODE(__PGIC__, __PGIC_MINOR__, __PGIC_PATCHLEVEL__) -#endif - -#if defined(HEDLEY_PGI_VERSION_CHECK) -# undef HEDLEY_PGI_VERSION_CHECK -#endif -#if defined(HEDLEY_PGI_VERSION) -# define HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (HEDLEY_PGI_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_PGI_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_SUNPRO_VERSION) -# undef HEDLEY_SUNPRO_VERSION -#endif -#if defined(__SUNPRO_C) && (__SUNPRO_C > 0x1000) -# define HEDLEY_SUNPRO_VERSION HEDLEY_VERSION_ENCODE((((__SUNPRO_C >> 16) & 0xf) * 10) + ((__SUNPRO_C >> 12) & 0xf), (((__SUNPRO_C >> 8) & 0xf) * 10) + ((__SUNPRO_C >> 4) & 0xf), (__SUNPRO_C & 0xf) * 10) -#elif defined(__SUNPRO_C) -# define HEDLEY_SUNPRO_VERSION HEDLEY_VERSION_ENCODE((__SUNPRO_C >> 8) & 0xf, (__SUNPRO_C >> 4) & 0xf, (__SUNPRO_C) & 0xf) -#elif defined(__SUNPRO_CC) && (__SUNPRO_CC > 0x1000) -# define HEDLEY_SUNPRO_VERSION HEDLEY_VERSION_ENCODE((((__SUNPRO_CC >> 16) & 0xf) * 10) + ((__SUNPRO_CC >> 12) & 0xf), (((__SUNPRO_CC >> 8) & 0xf) * 10) + ((__SUNPRO_CC >> 4) & 0xf), (__SUNPRO_CC & 0xf) * 10) -#elif defined(__SUNPRO_CC) -# define HEDLEY_SUNPRO_VERSION HEDLEY_VERSION_ENCODE((__SUNPRO_CC >> 8) & 0xf, (__SUNPRO_CC >> 4) & 0xf, (__SUNPRO_CC) & 0xf) -#endif - -#if defined(HEDLEY_SUNPRO_VERSION_CHECK) -# undef HEDLEY_SUNPRO_VERSION_CHECK -#endif -#if defined(HEDLEY_SUNPRO_VERSION) -# define HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (HEDLEY_SUNPRO_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_SUNPRO_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_EMSCRIPTEN_VERSION) -# undef HEDLEY_EMSCRIPTEN_VERSION -#endif -#if defined(__EMSCRIPTEN__) -# define HEDLEY_EMSCRIPTEN_VERSION HEDLEY_VERSION_ENCODE(__EMSCRIPTEN_major__, __EMSCRIPTEN_minor__, __EMSCRIPTEN_tiny__) -#endif - -#if defined(HEDLEY_EMSCRIPTEN_VERSION_CHECK) -# undef HEDLEY_EMSCRIPTEN_VERSION_CHECK -#endif -#if defined(HEDLEY_EMSCRIPTEN_VERSION) -# define HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) 
(HEDLEY_EMSCRIPTEN_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_EMSCRIPTEN_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_ARM_VERSION) -# undef HEDLEY_ARM_VERSION -#endif -#if defined(__CC_ARM) && defined(__ARMCOMPILER_VERSION) -# define HEDLEY_ARM_VERSION HEDLEY_VERSION_ENCODE(__ARMCOMPILER_VERSION / 1000000, (__ARMCOMPILER_VERSION % 1000000) / 10000, (__ARMCOMPILER_VERSION % 10000) / 100) -#elif defined(__CC_ARM) && defined(__ARMCC_VERSION) -# define HEDLEY_ARM_VERSION HEDLEY_VERSION_ENCODE(__ARMCC_VERSION / 1000000, (__ARMCC_VERSION % 1000000) / 10000, (__ARMCC_VERSION % 10000) / 100) -#endif - -#if defined(HEDLEY_ARM_VERSION_CHECK) -# undef HEDLEY_ARM_VERSION_CHECK -#endif -#if defined(HEDLEY_ARM_VERSION) -# define HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (HEDLEY_ARM_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_ARM_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_IBM_VERSION) -# undef HEDLEY_IBM_VERSION -#endif -#if defined(__ibmxl__) -# define HEDLEY_IBM_VERSION HEDLEY_VERSION_ENCODE(__ibmxl_version__, __ibmxl_release__, __ibmxl_modification__) -#elif defined(__xlC__) && defined(__xlC_ver__) -# define HEDLEY_IBM_VERSION HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, (__xlC_ver__ >> 8) & 0xff) -#elif defined(__xlC__) -# define HEDLEY_IBM_VERSION HEDLEY_VERSION_ENCODE(__xlC__ >> 8, __xlC__ & 0xff, 0) -#endif - -#if defined(HEDLEY_IBM_VERSION_CHECK) -# undef HEDLEY_IBM_VERSION_CHECK -#endif -#if defined(HEDLEY_IBM_VERSION) -# define HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (HEDLEY_IBM_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_IBM_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_TI_VERSION) -# undef HEDLEY_TI_VERSION -#endif -#if \ - defined(__TI_COMPILER_VERSION__) && \ - ( \ - defined(__TMS470__) || defined(__TI_ARM__) || \ - defined(__MSP430__) || \ - defined(__TMS320C2000__) \ - ) -# if (__TI_COMPILER_VERSION__ >= 16000000) -# define HEDLEY_TI_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -# endif -#endif - -#if defined(HEDLEY_TI_VERSION_CHECK) -# undef HEDLEY_TI_VERSION_CHECK -#endif -#if defined(HEDLEY_TI_VERSION) -# define HEDLEY_TI_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_TI_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_TI_CL2000_VERSION) -# undef HEDLEY_TI_CL2000_VERSION -#endif -#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C2000__) -# define HEDLEY_TI_CL2000_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif - -#if defined(HEDLEY_TI_CL2000_VERSION_CHECK) -# undef HEDLEY_TI_CL2000_VERSION_CHECK -#endif -#if defined(HEDLEY_TI_CL2000_VERSION) -# define HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL2000_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_TI_CL2000_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_TI_CL430_VERSION) -# undef HEDLEY_TI_CL430_VERSION -#endif -#if defined(__TI_COMPILER_VERSION__) && defined(__MSP430__) -# define HEDLEY_TI_CL430_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif - -#if defined(HEDLEY_TI_CL430_VERSION_CHECK) 
-# undef HEDLEY_TI_CL430_VERSION_CHECK -#endif -#if defined(HEDLEY_TI_CL430_VERSION) -# define HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL430_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_TI_CL430_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_TI_ARMCL_VERSION) -# undef HEDLEY_TI_ARMCL_VERSION -#endif -#if defined(__TI_COMPILER_VERSION__) && (defined(__TMS470__) || defined(__TI_ARM__)) -# define HEDLEY_TI_ARMCL_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif - -#if defined(HEDLEY_TI_ARMCL_VERSION_CHECK) -# undef HEDLEY_TI_ARMCL_VERSION_CHECK -#endif -#if defined(HEDLEY_TI_ARMCL_VERSION) -# define HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_ARMCL_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_TI_ARMCL_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_TI_CL6X_VERSION) -# undef HEDLEY_TI_CL6X_VERSION -#endif -#if defined(__TI_COMPILER_VERSION__) && defined(__TMS320C6X__) -# define HEDLEY_TI_CL6X_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif - -#if defined(HEDLEY_TI_CL6X_VERSION_CHECK) -# undef HEDLEY_TI_CL6X_VERSION_CHECK -#endif -#if defined(HEDLEY_TI_CL6X_VERSION) -# define HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL6X_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_TI_CL6X_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_TI_CL7X_VERSION) -# undef HEDLEY_TI_CL7X_VERSION -#endif -#if defined(__TI_COMPILER_VERSION__) && defined(__C7000__) -# define HEDLEY_TI_CL7X_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif - -#if defined(HEDLEY_TI_CL7X_VERSION_CHECK) -# undef HEDLEY_TI_CL7X_VERSION_CHECK -#endif -#if defined(HEDLEY_TI_CL7X_VERSION) -# define HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CL7X_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_TI_CL7X_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_TI_CLPRU_VERSION) -# undef HEDLEY_TI_CLPRU_VERSION -#endif -#if defined(__TI_COMPILER_VERSION__) && defined(__PRU__) -# define HEDLEY_TI_CLPRU_VERSION HEDLEY_VERSION_ENCODE(__TI_COMPILER_VERSION__ / 1000000, (__TI_COMPILER_VERSION__ % 1000000) / 1000, (__TI_COMPILER_VERSION__ % 1000)) -#endif - -#if defined(HEDLEY_TI_CLPRU_VERSION_CHECK) -# undef HEDLEY_TI_CLPRU_VERSION_CHECK -#endif -#if defined(HEDLEY_TI_CLPRU_VERSION) -# define HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (HEDLEY_TI_CLPRU_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_TI_CLPRU_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_CRAY_VERSION) -# undef HEDLEY_CRAY_VERSION -#endif -#if defined(_CRAYC) -# if defined(_RELEASE_PATCHLEVEL) -# define HEDLEY_CRAY_VERSION HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, _RELEASE_PATCHLEVEL) -# else -# define HEDLEY_CRAY_VERSION HEDLEY_VERSION_ENCODE(_RELEASE_MAJOR, _RELEASE_MINOR, 0) -# endif -#endif - -#if defined(HEDLEY_CRAY_VERSION_CHECK) -# undef HEDLEY_CRAY_VERSION_CHECK -#endif -#if defined(HEDLEY_CRAY_VERSION) -# define HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (HEDLEY_CRAY_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define 
HEDLEY_CRAY_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_IAR_VERSION) -# undef HEDLEY_IAR_VERSION -#endif -#if defined(__IAR_SYSTEMS_ICC__) -# if __VER__ > 1000 -# define HEDLEY_IAR_VERSION HEDLEY_VERSION_ENCODE((__VER__ / 1000000), ((__VER__ / 1000) % 1000), (__VER__ % 1000)) -# else -# define HEDLEY_IAR_VERSION HEDLEY_VERSION_ENCODE(__VER__ / 100, __VER__ % 100, 0) -# endif -#endif - -#if defined(HEDLEY_IAR_VERSION_CHECK) -# undef HEDLEY_IAR_VERSION_CHECK -#endif -#if defined(HEDLEY_IAR_VERSION) -# define HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (HEDLEY_IAR_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_IAR_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_TINYC_VERSION) -# undef HEDLEY_TINYC_VERSION -#endif -#if defined(__TINYC__) -# define HEDLEY_TINYC_VERSION HEDLEY_VERSION_ENCODE(__TINYC__ / 1000, (__TINYC__ / 100) % 10, __TINYC__ % 100) -#endif - -#if defined(HEDLEY_TINYC_VERSION_CHECK) -# undef HEDLEY_TINYC_VERSION_CHECK -#endif -#if defined(HEDLEY_TINYC_VERSION) -# define HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (HEDLEY_TINYC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_TINYC_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_DMC_VERSION) -# undef HEDLEY_DMC_VERSION -#endif -#if defined(__DMC__) -# define HEDLEY_DMC_VERSION HEDLEY_VERSION_ENCODE(__DMC__ >> 8, (__DMC__ >> 4) & 0xf, __DMC__ & 0xf) -#endif - -#if defined(HEDLEY_DMC_VERSION_CHECK) -# undef HEDLEY_DMC_VERSION_CHECK -#endif -#if defined(HEDLEY_DMC_VERSION) -# define HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (HEDLEY_DMC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_DMC_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_COMPCERT_VERSION) -# undef HEDLEY_COMPCERT_VERSION -#endif -#if defined(__COMPCERT_VERSION__) -# define HEDLEY_COMPCERT_VERSION HEDLEY_VERSION_ENCODE(__COMPCERT_VERSION__ / 10000, (__COMPCERT_VERSION__ / 100) % 100, __COMPCERT_VERSION__ % 100) -#endif - -#if defined(HEDLEY_COMPCERT_VERSION_CHECK) -# undef HEDLEY_COMPCERT_VERSION_CHECK -#endif -#if defined(HEDLEY_COMPCERT_VERSION) -# define HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (HEDLEY_COMPCERT_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_COMPCERT_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_PELLES_VERSION) -# undef HEDLEY_PELLES_VERSION -#endif -#if defined(__POCC__) -# define HEDLEY_PELLES_VERSION HEDLEY_VERSION_ENCODE(__POCC__ / 100, __POCC__ % 100, 0) -#endif - -#if defined(HEDLEY_PELLES_VERSION_CHECK) -# undef HEDLEY_PELLES_VERSION_CHECK -#endif -#if defined(HEDLEY_PELLES_VERSION) -# define HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (HEDLEY_PELLES_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_PELLES_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_MCST_LCC_VERSION) -# undef HEDLEY_MCST_LCC_VERSION -#endif -#if defined(__LCC__) && defined(__LCC_MINOR__) -# define HEDLEY_MCST_LCC_VERSION HEDLEY_VERSION_ENCODE(__LCC__ / 100, __LCC__ % 100, __LCC_MINOR__) -#endif - -#if defined(HEDLEY_MCST_LCC_VERSION_CHECK) -# undef HEDLEY_MCST_LCC_VERSION_CHECK -#endif -#if defined(HEDLEY_MCST_LCC_VERSION) -# define HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (HEDLEY_MCST_LCC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_MCST_LCC_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_GCC_VERSION) -# undef HEDLEY_GCC_VERSION 
-#endif -#if \ - defined(HEDLEY_GNUC_VERSION) && \ - !defined(__clang__) && \ - !defined(HEDLEY_INTEL_VERSION) && \ - !defined(HEDLEY_PGI_VERSION) && \ - !defined(HEDLEY_ARM_VERSION) && \ - !defined(HEDLEY_CRAY_VERSION) && \ - !defined(HEDLEY_TI_VERSION) && \ - !defined(HEDLEY_TI_ARMCL_VERSION) && \ - !defined(HEDLEY_TI_CL430_VERSION) && \ - !defined(HEDLEY_TI_CL2000_VERSION) && \ - !defined(HEDLEY_TI_CL6X_VERSION) && \ - !defined(HEDLEY_TI_CL7X_VERSION) && \ - !defined(HEDLEY_TI_CLPRU_VERSION) && \ - !defined(__COMPCERT__) && \ - !defined(HEDLEY_MCST_LCC_VERSION) -# define HEDLEY_GCC_VERSION HEDLEY_GNUC_VERSION -#endif - -#if defined(HEDLEY_GCC_VERSION_CHECK) -# undef HEDLEY_GCC_VERSION_CHECK -#endif -#if defined(HEDLEY_GCC_VERSION) -# define HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (HEDLEY_GCC_VERSION >= HEDLEY_VERSION_ENCODE(major, minor, patch)) -#else -# define HEDLEY_GCC_VERSION_CHECK(major,minor,patch) (0) -#endif - -#if defined(HEDLEY_HAS_ATTRIBUTE) -# undef HEDLEY_HAS_ATTRIBUTE -#endif -#if \ - defined(__has_attribute) && \ - ( \ - (!defined(HEDLEY_IAR_VERSION) || HEDLEY_IAR_VERSION_CHECK(8,5,9)) \ - ) -# define HEDLEY_HAS_ATTRIBUTE(attribute) __has_attribute(attribute) -#else -# define HEDLEY_HAS_ATTRIBUTE(attribute) (0) -#endif - -#if defined(HEDLEY_GNUC_HAS_ATTRIBUTE) -# undef HEDLEY_GNUC_HAS_ATTRIBUTE -#endif -#if defined(__has_attribute) -# define HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_HAS_ATTRIBUTE(attribute) -#else -# define HEDLEY_GNUC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_GCC_HAS_ATTRIBUTE) -# undef HEDLEY_GCC_HAS_ATTRIBUTE -#endif -#if defined(__has_attribute) -# define HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_HAS_ATTRIBUTE(attribute) -#else -# define HEDLEY_GCC_HAS_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_HAS_CPP_ATTRIBUTE) -# undef HEDLEY_HAS_CPP_ATTRIBUTE -#endif -#if \ - defined(__has_cpp_attribute) && \ - defined(__cplusplus) && \ - (!defined(HEDLEY_SUNPRO_VERSION) || HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) -# define HEDLEY_HAS_CPP_ATTRIBUTE(attribute) __has_cpp_attribute(attribute) -#else -# define HEDLEY_HAS_CPP_ATTRIBUTE(attribute) (0) -#endif - -#if defined(HEDLEY_HAS_CPP_ATTRIBUTE_NS) -# undef HEDLEY_HAS_CPP_ATTRIBUTE_NS -#endif -#if !defined(__cplusplus) || !defined(__has_cpp_attribute) -# define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) -#elif \ - !defined(HEDLEY_PGI_VERSION) && \ - !defined(HEDLEY_IAR_VERSION) && \ - (!defined(HEDLEY_SUNPRO_VERSION) || HEDLEY_SUNPRO_VERSION_CHECK(5,15,0)) && \ - (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) -# define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) HEDLEY_HAS_CPP_ATTRIBUTE(ns::attribute) -#else -# define HEDLEY_HAS_CPP_ATTRIBUTE_NS(ns,attribute) (0) -#endif - -#if defined(HEDLEY_GNUC_HAS_CPP_ATTRIBUTE) -# undef HEDLEY_GNUC_HAS_CPP_ATTRIBUTE -#endif -#if defined(__has_cpp_attribute) && defined(__cplusplus) -# define HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute) -#else -# define HEDLEY_GNUC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_GCC_HAS_CPP_ATTRIBUTE) -# undef HEDLEY_GCC_HAS_CPP_ATTRIBUTE -#endif -#if defined(__has_cpp_attribute) && defined(__cplusplus) -# define HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) __has_cpp_attribute(attribute) -#else -# define 
HEDLEY_GCC_HAS_CPP_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_HAS_BUILTIN) -# undef HEDLEY_HAS_BUILTIN -#endif -#if defined(__has_builtin) -# define HEDLEY_HAS_BUILTIN(builtin) __has_builtin(builtin) -#else -# define HEDLEY_HAS_BUILTIN(builtin) (0) -#endif - -#if defined(HEDLEY_GNUC_HAS_BUILTIN) -# undef HEDLEY_GNUC_HAS_BUILTIN -#endif -#if defined(__has_builtin) -# define HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin) -#else -# define HEDLEY_GNUC_HAS_BUILTIN(builtin,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_GCC_HAS_BUILTIN) -# undef HEDLEY_GCC_HAS_BUILTIN -#endif -#if defined(__has_builtin) -# define HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) __has_builtin(builtin) -#else -# define HEDLEY_GCC_HAS_BUILTIN(builtin,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_HAS_FEATURE) -# undef HEDLEY_HAS_FEATURE -#endif -#if defined(__has_feature) -# define HEDLEY_HAS_FEATURE(feature) __has_feature(feature) -#else -# define HEDLEY_HAS_FEATURE(feature) (0) -#endif - -#if defined(HEDLEY_GNUC_HAS_FEATURE) -# undef HEDLEY_GNUC_HAS_FEATURE -#endif -#if defined(__has_feature) -# define HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature) -#else -# define HEDLEY_GNUC_HAS_FEATURE(feature,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_GCC_HAS_FEATURE) -# undef HEDLEY_GCC_HAS_FEATURE -#endif -#if defined(__has_feature) -# define HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) __has_feature(feature) -#else -# define HEDLEY_GCC_HAS_FEATURE(feature,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_HAS_EXTENSION) -# undef HEDLEY_HAS_EXTENSION -#endif -#if defined(__has_extension) -# define HEDLEY_HAS_EXTENSION(extension) __has_extension(extension) -#else -# define HEDLEY_HAS_EXTENSION(extension) (0) -#endif - -#if defined(HEDLEY_GNUC_HAS_EXTENSION) -# undef HEDLEY_GNUC_HAS_EXTENSION -#endif -#if defined(__has_extension) -# define HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension) -#else -# define HEDLEY_GNUC_HAS_EXTENSION(extension,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_GCC_HAS_EXTENSION) -# undef HEDLEY_GCC_HAS_EXTENSION -#endif -#if defined(__has_extension) -# define HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) __has_extension(extension) -#else -# define HEDLEY_GCC_HAS_EXTENSION(extension,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_HAS_DECLSPEC_ATTRIBUTE) -# undef HEDLEY_HAS_DECLSPEC_ATTRIBUTE -#endif -#if defined(__has_declspec_attribute) -# define HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) __has_declspec_attribute(attribute) -#else -# define HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) (0) -#endif - -#if defined(HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE) -# undef HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE -#endif -#if defined(__has_declspec_attribute) -# define HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute) -#else -# define HEDLEY_GNUC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE) -# undef HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE -#endif -#if defined(__has_declspec_attribute) -# define 
HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) __has_declspec_attribute(attribute) -#else -# define HEDLEY_GCC_HAS_DECLSPEC_ATTRIBUTE(attribute,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_HAS_WARNING) -# undef HEDLEY_HAS_WARNING -#endif -#if defined(__has_warning) -# define HEDLEY_HAS_WARNING(warning) __has_warning(warning) -#else -# define HEDLEY_HAS_WARNING(warning) (0) -#endif - -#if defined(HEDLEY_GNUC_HAS_WARNING) -# undef HEDLEY_GNUC_HAS_WARNING -#endif -#if defined(__has_warning) -# define HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning) -#else -# define HEDLEY_GNUC_HAS_WARNING(warning,major,minor,patch) HEDLEY_GNUC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_GCC_HAS_WARNING) -# undef HEDLEY_GCC_HAS_WARNING -#endif -#if defined(__has_warning) -# define HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) __has_warning(warning) -#else -# define HEDLEY_GCC_HAS_WARNING(warning,major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ - defined(__clang__) || \ - HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ - HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ - HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,0,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_CRAY_VERSION_CHECK(5,0,0) || \ - HEDLEY_TINYC_VERSION_CHECK(0,9,17) || \ - HEDLEY_SUNPRO_VERSION_CHECK(8,0,0) || \ - (HEDLEY_IBM_VERSION_CHECK(10,1,0) && defined(__C99_PRAGMA_OPERATOR)) -# define HEDLEY_PRAGMA(value) _Pragma(#value) -#elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) -# define HEDLEY_PRAGMA(value) __pragma(value) -#else -# define HEDLEY_PRAGMA(value) -#endif - -#if defined(HEDLEY_DIAGNOSTIC_PUSH) -# undef HEDLEY_DIAGNOSTIC_PUSH -#endif -#if defined(HEDLEY_DIAGNOSTIC_POP) -# undef HEDLEY_DIAGNOSTIC_POP -#endif -#if defined(__clang__) -# define HEDLEY_DIAGNOSTIC_PUSH _Pragma("clang diagnostic push") -# define HEDLEY_DIAGNOSTIC_POP _Pragma("clang diagnostic pop") -#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") -# define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") -#elif HEDLEY_GCC_VERSION_CHECK(4,6,0) -# define HEDLEY_DIAGNOSTIC_PUSH _Pragma("GCC diagnostic push") -# define HEDLEY_DIAGNOSTIC_POP _Pragma("GCC diagnostic pop") -#elif \ - HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(push)) -# define HEDLEY_DIAGNOSTIC_POP __pragma(warning(pop)) -#elif HEDLEY_ARM_VERSION_CHECK(5,6,0) -# define HEDLEY_DIAGNOSTIC_PUSH _Pragma("push") -# define HEDLEY_DIAGNOSTIC_POP _Pragma("pop") -#elif \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,4,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) -# define HEDLEY_DIAGNOSTIC_PUSH _Pragma("diag_push") -# define HEDLEY_DIAGNOSTIC_POP _Pragma("diag_pop") -#elif HEDLEY_PELLES_VERSION_CHECK(2,90,0) -# define HEDLEY_DIAGNOSTIC_PUSH _Pragma("warning(push)") -# define HEDLEY_DIAGNOSTIC_POP _Pragma("warning(pop)") -#else -# 
define HEDLEY_DIAGNOSTIC_PUSH -# define HEDLEY_DIAGNOSTIC_POP -#endif - -/* HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ is for - HEDLEY INTERNAL USE ONLY. API subject to change without notice. */ -#if defined(HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) -# undef HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_ -#endif -#if defined(__cplusplus) -# if HEDLEY_HAS_WARNING("-Wc++98-compat") -# if HEDLEY_HAS_WARNING("-Wc++17-extensions") -# if HEDLEY_HAS_WARNING("-Wc++1z-extensions") -# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ - _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \ - _Pragma("clang diagnostic ignored \"-Wc++1z-extensions\"") \ - xpr \ - HEDLEY_DIAGNOSTIC_POP -# else -# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ - _Pragma("clang diagnostic ignored \"-Wc++17-extensions\"") \ - xpr \ - HEDLEY_DIAGNOSTIC_POP -# endif -# else -# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(xpr) \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wc++98-compat\"") \ - xpr \ - HEDLEY_DIAGNOSTIC_POP -# endif -# endif -#endif -#if !defined(HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_) -# define HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(x) x -#endif - -#if defined(HEDLEY_CONST_CAST) -# undef HEDLEY_CONST_CAST -#endif -#if defined(__cplusplus) -# define HEDLEY_CONST_CAST(T, expr) (const_cast(expr)) -#elif \ - HEDLEY_HAS_WARNING("-Wcast-qual") || \ - HEDLEY_GCC_VERSION_CHECK(4,6,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define HEDLEY_CONST_CAST(T, expr) (__extension__ ({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \ - ((T) (expr)); \ - HEDLEY_DIAGNOSTIC_POP \ - })) -#else -# define HEDLEY_CONST_CAST(T, expr) ((T) (expr)) -#endif - -#if defined(HEDLEY_REINTERPRET_CAST) -# undef HEDLEY_REINTERPRET_CAST -#endif -#if defined(__cplusplus) -# define HEDLEY_REINTERPRET_CAST(T, expr) (reinterpret_cast(expr)) -#else -# define HEDLEY_REINTERPRET_CAST(T, expr) ((T) (expr)) -#endif - -#if defined(HEDLEY_STATIC_CAST) -# undef HEDLEY_STATIC_CAST -#endif -#if defined(__cplusplus) -# define HEDLEY_STATIC_CAST(T, expr) (static_cast(expr)) -#else -# define HEDLEY_STATIC_CAST(T, expr) ((T) (expr)) -#endif - -#if defined(HEDLEY_CPP_CAST) -# undef HEDLEY_CPP_CAST -#endif -#if defined(__cplusplus) -# if HEDLEY_HAS_WARNING("-Wold-style-cast") -# define HEDLEY_CPP_CAST(T, expr) \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wold-style-cast\"") \ - ((T) (expr)) \ - HEDLEY_DIAGNOSTIC_POP -# elif HEDLEY_IAR_VERSION_CHECK(8,3,0) -# define HEDLEY_CPP_CAST(T, expr) \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("diag_suppress=Pe137") \ - HEDLEY_DIAGNOSTIC_POP -# else -# define HEDLEY_CPP_CAST(T, expr) ((T) (expr)) -# endif -#else -# define HEDLEY_CPP_CAST(T, expr) (expr) -#endif - -#if defined(HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED) -# undef HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED -#endif -#if HEDLEY_HAS_WARNING("-Wdeprecated-declarations") -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") -#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warning(disable:1478 1786)") -#elif HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:1478 1786)) -#elif HEDLEY_PGI_VERSION_CHECK(20,7,0) -# define 
HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1216,1444,1445") -#elif HEDLEY_PGI_VERSION_CHECK(17,10,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") -#elif HEDLEY_GCC_VERSION_CHECK(4,3,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") -#elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED __pragma(warning(disable:4996)) -#elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1215,1444") -#elif \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress 1291,1718") -#elif HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && !defined(__cplusplus) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,E_DEPRECATED_ATT,E_DEPRECATED_ATT_MESS)") -#elif HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) && defined(__cplusplus) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("error_messages(off,symdeprecated,symdeprecated2)") -#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("diag_suppress=Pe1444,Pe1215") -#elif HEDLEY_PELLES_VERSION_CHECK(2,90,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED _Pragma("warn(disable:2241)") -#else -# define HEDLEY_DIAGNOSTIC_DISABLE_DEPRECATED -#endif - -#if defined(HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS) -# undef HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS -#endif -#if HEDLEY_HAS_WARNING("-Wunknown-pragmas") -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("clang diagnostic ignored \"-Wunknown-pragmas\"") -#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("warning(disable:161)") -#elif HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:161)) -#elif HEDLEY_PGI_VERSION_CHECK(17,10,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 1675") -#elif HEDLEY_GCC_VERSION_CHECK(4,3,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("GCC diagnostic ignored \"-Wunknown-pragmas\"") -#elif HEDLEY_MSVC_VERSION_CHECK(15,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS __pragma(warning(disable:4068)) -#elif \ - HEDLEY_TI_VERSION_CHECK(16,9,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") -#elif HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 163") -#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress=Pe161") -#elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS _Pragma("diag_suppress 
161") -#else -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS -#endif - -#if defined(HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES) -# undef HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES -#endif -#if HEDLEY_HAS_WARNING("-Wunknown-attributes") -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("clang diagnostic ignored \"-Wunknown-attributes\"") -#elif HEDLEY_GCC_VERSION_CHECK(4,6,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") -#elif HEDLEY_INTEL_VERSION_CHECK(17,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("warning(disable:1292)") -#elif HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:1292)) -#elif HEDLEY_MSVC_VERSION_CHECK(19,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES __pragma(warning(disable:5030)) -#elif HEDLEY_PGI_VERSION_CHECK(20,7,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097,1098") -#elif HEDLEY_PGI_VERSION_CHECK(17,10,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") -#elif HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("error_messages(off,attrskipunsup)") -#elif \ - HEDLEY_TI_VERSION_CHECK(18,1,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1173") -#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress=Pe1097") -#elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES _Pragma("diag_suppress 1097") -#else -# define HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_CPP_ATTRIBUTES -#endif - -#if defined(HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL) -# undef HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL -#endif -#if HEDLEY_HAS_WARNING("-Wcast-qual") -# define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("clang diagnostic ignored \"-Wcast-qual\"") -#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("warning(disable:2203 2331)") -#elif HEDLEY_GCC_VERSION_CHECK(3,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL _Pragma("GCC diagnostic ignored \"-Wcast-qual\"") -#else -# define HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL -#endif - -#if defined(HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION) -# undef HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION -#endif -#if HEDLEY_HAS_WARNING("-Wunused-function") -# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("clang diagnostic ignored \"-Wunused-function\"") -#elif HEDLEY_GCC_VERSION_CHECK(3,4,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("GCC diagnostic ignored \"-Wunused-function\"") -#elif HEDLEY_MSVC_VERSION_CHECK(1,0,0) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION __pragma(warning(disable:4505)) -#elif HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION _Pragma("diag_suppress 3142") -#else -# define HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION -#endif - -#if defined(HEDLEY_DEPRECATED) -# undef HEDLEY_DEPRECATED -#endif -#if defined(HEDLEY_DEPRECATED_FOR) -# undef HEDLEY_DEPRECATED_FOR -#endif -#if \ - HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_DEPRECATED(since) __declspec(deprecated("Since " # since)) -# define 
HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated("Since " #since "; use " #replacement)) -#elif \ - (HEDLEY_HAS_EXTENSION(attribute_deprecated_with_message) && !defined(HEDLEY_IAR_VERSION)) || \ - HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \ - HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - HEDLEY_TI_VERSION_CHECK(18,1,0) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(18,1,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(8,3,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_DEPRECATED(since) __attribute__((__deprecated__("Since " #since))) -# define HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__("Since " #since "; use " #replacement))) -#elif defined(__cplusplus) && (__cplusplus >= 201402L) -# define HEDLEY_DEPRECATED(since) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since)]]) -# define HEDLEY_DEPRECATED_FOR(since, replacement) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[deprecated("Since " #since "; use " #replacement)]]) -#elif \ - HEDLEY_HAS_ATTRIBUTE(deprecated) || \ - HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ - HEDLEY_IAR_VERSION_CHECK(8,10,0) -# define HEDLEY_DEPRECATED(since) __attribute__((__deprecated__)) -# define HEDLEY_DEPRECATED_FOR(since, replacement) __attribute__((__deprecated__)) -#elif \ - HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ - HEDLEY_PELLES_VERSION_CHECK(6,50,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_DEPRECATED(since) __declspec(deprecated) -# define HEDLEY_DEPRECATED_FOR(since, replacement) __declspec(deprecated) -#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) -# define HEDLEY_DEPRECATED(since) _Pragma("deprecated") -# define HEDLEY_DEPRECATED_FOR(since, replacement) _Pragma("deprecated") -#else -# define HEDLEY_DEPRECATED(since) -# define HEDLEY_DEPRECATED_FOR(since, replacement) -#endif - -#if defined(HEDLEY_UNAVAILABLE) -# undef HEDLEY_UNAVAILABLE -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(warning) || \ - HEDLEY_GCC_VERSION_CHECK(4,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_UNAVAILABLE(available_since) __attribute__((__warning__("Not available until " #available_since))) -#else -# define HEDLEY_UNAVAILABLE(available_since) -#endif - -#if defined(HEDLEY_WARN_UNUSED_RESULT) -# undef HEDLEY_WARN_UNUSED_RESULT -#endif -#if defined(HEDLEY_WARN_UNUSED_RESULT_MSG) -# undef HEDLEY_WARN_UNUSED_RESULT_MSG -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(warn_unused_result) || \ - HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) 
&& defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - (HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ - HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__)) -# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) __attribute__((__warn_unused_result__)) -#elif (HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) >= 201907L) -# define HEDLEY_WARN_UNUSED_RESULT HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) -# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard(msg)]]) -#elif HEDLEY_HAS_CPP_ATTRIBUTE(nodiscard) -# define HEDLEY_WARN_UNUSED_RESULT HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) -# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[nodiscard]]) -#elif defined(_Check_return_) /* SAL */ -# define HEDLEY_WARN_UNUSED_RESULT _Check_return_ -# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) _Check_return_ -#else -# define HEDLEY_WARN_UNUSED_RESULT -# define HEDLEY_WARN_UNUSED_RESULT_MSG(msg) -#endif - -#if defined(HEDLEY_SENTINEL) -# undef HEDLEY_SENTINEL -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(sentinel) || \ - HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(5,4,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_SENTINEL(position) __attribute__((__sentinel__(position))) -#else -# define HEDLEY_SENTINEL(position) -#endif - -#if defined(HEDLEY_NO_RETURN) -# undef HEDLEY_NO_RETURN -#endif -#if HEDLEY_IAR_VERSION_CHECK(8,0,0) -# define HEDLEY_NO_RETURN __noreturn -#elif \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_NO_RETURN __attribute__((__noreturn__)) -#elif defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L -# define HEDLEY_NO_RETURN _Noreturn -#elif defined(__cplusplus) && (__cplusplus >= 201103L) -# define HEDLEY_NO_RETURN HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[noreturn]]) -#elif \ - HEDLEY_HAS_ATTRIBUTE(noreturn) || \ - HEDLEY_GCC_VERSION_CHECK(3,2,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_IAR_VERSION_CHECK(8,10,0) -# define HEDLEY_NO_RETURN __attribute__((__noreturn__)) -#elif 
HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) -# define HEDLEY_NO_RETURN _Pragma("does_not_return") -#elif \ - HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_NO_RETURN __declspec(noreturn) -#elif HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) -# define HEDLEY_NO_RETURN _Pragma("FUNC_NEVER_RETURNS;") -#elif HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) -# define HEDLEY_NO_RETURN __attribute((noreturn)) -#elif HEDLEY_PELLES_VERSION_CHECK(9,0,0) -# define HEDLEY_NO_RETURN __declspec(noreturn) -#else -# define HEDLEY_NO_RETURN -#endif - -#if defined(HEDLEY_NO_ESCAPE) -# undef HEDLEY_NO_ESCAPE -#endif -#if HEDLEY_HAS_ATTRIBUTE(noescape) -# define HEDLEY_NO_ESCAPE __attribute__((__noescape__)) -#else -# define HEDLEY_NO_ESCAPE -#endif - -#if defined(HEDLEY_UNREACHABLE) -# undef HEDLEY_UNREACHABLE -#endif -#if defined(HEDLEY_UNREACHABLE_RETURN) -# undef HEDLEY_UNREACHABLE_RETURN -#endif -#if defined(HEDLEY_ASSUME) -# undef HEDLEY_ASSUME -#endif -#if \ - HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_ASSUME(expr) __assume(expr) -#elif HEDLEY_HAS_BUILTIN(__builtin_assume) -# define HEDLEY_ASSUME(expr) __builtin_assume(expr) -#elif \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) -# if defined(__cplusplus) -# define HEDLEY_ASSUME(expr) std::_nassert(expr) -# else -# define HEDLEY_ASSUME(expr) _nassert(expr) -# endif -#endif -#if \ - (HEDLEY_HAS_BUILTIN(__builtin_unreachable) && (!defined(HEDLEY_ARM_VERSION))) || \ - HEDLEY_GCC_VERSION_CHECK(4,5,0) || \ - HEDLEY_PGI_VERSION_CHECK(18,10,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,5) || \ - HEDLEY_CRAY_VERSION_CHECK(10,0,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_UNREACHABLE() __builtin_unreachable() -#elif defined(HEDLEY_ASSUME) -# define HEDLEY_UNREACHABLE() HEDLEY_ASSUME(0) -#endif -#if !defined(HEDLEY_ASSUME) -# if defined(HEDLEY_UNREACHABLE) -# define HEDLEY_ASSUME(expr) HEDLEY_STATIC_CAST(void, ((expr) ? 1 : (HEDLEY_UNREACHABLE(), 1))) -# else -# define HEDLEY_ASSUME(expr) HEDLEY_STATIC_CAST(void, expr) -# endif -#endif -#if defined(HEDLEY_UNREACHABLE) -# if \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) -# define HEDLEY_UNREACHABLE_RETURN(value) return (HEDLEY_STATIC_CAST(void, HEDLEY_ASSUME(0)), (value)) -# else -# define HEDLEY_UNREACHABLE_RETURN(value) HEDLEY_UNREACHABLE() -# endif -#else -# define HEDLEY_UNREACHABLE_RETURN(value) return (value) -#endif -#if !defined(HEDLEY_UNREACHABLE) -# define HEDLEY_UNREACHABLE() HEDLEY_ASSUME(0) -#endif - -HEDLEY_DIAGNOSTIC_PUSH -#if HEDLEY_HAS_WARNING("-Wpedantic") -# pragma clang diagnostic ignored "-Wpedantic" -#endif -#if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") && defined(__cplusplus) -# pragma clang diagnostic ignored "-Wc++98-compat-pedantic" -#endif -#if HEDLEY_GCC_HAS_WARNING("-Wvariadic-macros",4,0,0) -# if defined(__clang__) -# pragma clang diagnostic ignored "-Wvariadic-macros" -# elif defined(HEDLEY_GCC_VERSION) -# pragma GCC diagnostic ignored "-Wvariadic-macros" -# endif -#endif -#if defined(HEDLEY_NON_NULL) -# undef HEDLEY_NON_NULL -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(nonnull) || \ - HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) -# define HEDLEY_NON_NULL(...) __attribute__((__nonnull__(__VA_ARGS__))) -#else -# define HEDLEY_NON_NULL(...) 
-#endif -HEDLEY_DIAGNOSTIC_POP - -#if defined(HEDLEY_PRINTF_FORMAT) -# undef HEDLEY_PRINTF_FORMAT -#endif -#if defined(__MINGW32__) && HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && !defined(__USE_MINGW_ANSI_STDIO) -# define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(ms_printf, string_idx, first_to_check))) -#elif defined(__MINGW32__) && HEDLEY_GCC_HAS_ATTRIBUTE(format,4,4,0) && defined(__USE_MINGW_ANSI_STDIO) -# define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(gnu_printf, string_idx, first_to_check))) -#elif \ - HEDLEY_HAS_ATTRIBUTE(format) || \ - HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(5,6,0) || \ - HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __attribute__((__format__(__printf__, string_idx, first_to_check))) -#elif HEDLEY_PELLES_VERSION_CHECK(6,0,0) -# define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) __declspec(vaformat(printf,string_idx,first_to_check)) -#else -# define HEDLEY_PRINTF_FORMAT(string_idx,first_to_check) -#endif - -#if defined(HEDLEY_CONSTEXPR) -# undef HEDLEY_CONSTEXPR -#endif -#if defined(__cplusplus) -# if __cplusplus >= 201103L -# define HEDLEY_CONSTEXPR HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(constexpr) -# endif -#endif -#if !defined(HEDLEY_CONSTEXPR) -# define HEDLEY_CONSTEXPR -#endif - -#if defined(HEDLEY_PREDICT) -# undef HEDLEY_PREDICT -#endif -#if defined(HEDLEY_LIKELY) -# undef HEDLEY_LIKELY -#endif -#if defined(HEDLEY_UNLIKELY) -# undef HEDLEY_UNLIKELY -#endif -#if defined(HEDLEY_UNPREDICTABLE) -# undef HEDLEY_UNPREDICTABLE -#endif -#if HEDLEY_HAS_BUILTIN(__builtin_unpredictable) -# define HEDLEY_UNPREDICTABLE(expr) __builtin_unpredictable((expr)) -#endif -#if \ - (HEDLEY_HAS_BUILTIN(__builtin_expect_with_probability) && !defined(HEDLEY_PGI_VERSION) && !defined(HEDLEY_INTEL_VERSION)) || \ - HEDLEY_GCC_VERSION_CHECK(9,0,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_PREDICT(expr, value, probability) __builtin_expect_with_probability( (expr), (value), (probability)) -# define HEDLEY_PREDICT_TRUE(expr, probability) __builtin_expect_with_probability(!!(expr), 1 , (probability)) -# define HEDLEY_PREDICT_FALSE(expr, probability) __builtin_expect_with_probability(!!(expr), 0 , (probability)) -# define HEDLEY_LIKELY(expr) __builtin_expect (!!(expr), 1 ) -# define HEDLEY_UNLIKELY(expr) __builtin_expect (!!(expr), 0 ) -#elif \ - (HEDLEY_HAS_BUILTIN(__builtin_expect) && !defined(HEDLEY_INTEL_CL_VERSION)) || \ - HEDLEY_GCC_VERSION_CHECK(3,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - (HEDLEY_SUNPRO_VERSION_CHECK(5,15,0) && defined(__cplusplus)) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - 
HEDLEY_TI_ARMCL_VERSION_CHECK(4,7,0) || \ - HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,1,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_TINYC_VERSION_CHECK(0,9,27) || \ - HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_PREDICT(expr, expected, probability) \ - (((probability) >= 0.9) ? __builtin_expect((expr), (expected)) : (HEDLEY_STATIC_CAST(void, expected), (expr))) -# define HEDLEY_PREDICT_TRUE(expr, probability) \ - (__extension__ ({ \ - double hedley_probability_ = (probability); \ - ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 1) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 0) : !!(expr))); \ - })) -# define HEDLEY_PREDICT_FALSE(expr, probability) \ - (__extension__ ({ \ - double hedley_probability_ = (probability); \ - ((hedley_probability_ >= 0.9) ? __builtin_expect(!!(expr), 0) : ((hedley_probability_ <= 0.1) ? __builtin_expect(!!(expr), 1) : !!(expr))); \ - })) -# define HEDLEY_LIKELY(expr) __builtin_expect(!!(expr), 1) -# define HEDLEY_UNLIKELY(expr) __builtin_expect(!!(expr), 0) -#else -# define HEDLEY_PREDICT(expr, expected, probability) (HEDLEY_STATIC_CAST(void, expected), (expr)) -# define HEDLEY_PREDICT_TRUE(expr, probability) (!!(expr)) -# define HEDLEY_PREDICT_FALSE(expr, probability) (!!(expr)) -# define HEDLEY_LIKELY(expr) (!!(expr)) -# define HEDLEY_UNLIKELY(expr) (!!(expr)) -#endif -#if !defined(HEDLEY_UNPREDICTABLE) -# define HEDLEY_UNPREDICTABLE(expr) HEDLEY_PREDICT(expr, 1, 0.5) -#endif - -#if defined(HEDLEY_MALLOC) -# undef HEDLEY_MALLOC -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(malloc) || \ - HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_MALLOC __attribute__((__malloc__)) -#elif HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) -# define HEDLEY_MALLOC _Pragma("returns_new_memory") -#elif \ - HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_MALLOC __declspec(restrict) -#else -# define HEDLEY_MALLOC -#endif - -#if defined(HEDLEY_PURE) -# undef HEDLEY_PURE -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(pure) || \ - HEDLEY_GCC_VERSION_CHECK(2,96,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - 
HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_PURE __attribute__((__pure__)) -#elif HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) -# define HEDLEY_PURE _Pragma("does_not_write_global_data") -#elif defined(__cplusplus) && \ - ( \ - HEDLEY_TI_CL430_VERSION_CHECK(2,0,1) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(4,0,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) \ - ) -# define HEDLEY_PURE _Pragma("FUNC_IS_PURE;") -#else -# define HEDLEY_PURE -#endif - -#if defined(HEDLEY_CONST) -# undef HEDLEY_CONST -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(const) || \ - HEDLEY_GCC_VERSION_CHECK(2,5,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_CONST __attribute__((__const__)) -#elif \ - HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) -# define HEDLEY_CONST _Pragma("no_side_effect") -#else -# define HEDLEY_CONST HEDLEY_PURE -#endif - -#if defined(HEDLEY_RESTRICT) -# undef HEDLEY_RESTRICT -#endif -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && !defined(__cplusplus) -# define HEDLEY_RESTRICT restrict -#elif \ - HEDLEY_GCC_VERSION_CHECK(3,1,0) || \ - HEDLEY_MSVC_VERSION_CHECK(14,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_PGI_VERSION_CHECK(17,10,0) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,2,4) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(8,1,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - (HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus)) || \ - HEDLEY_IAR_VERSION_CHECK(8,0,0) || \ - defined(__clang__) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_RESTRICT __restrict -#elif HEDLEY_SUNPRO_VERSION_CHECK(5,3,0) && !defined(__cplusplus) -# define HEDLEY_RESTRICT _Restrict -#else -# define HEDLEY_RESTRICT -#endif - -#if defined(HEDLEY_INLINE) -# undef HEDLEY_INLINE -#endif -#if \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) || \ - (defined(__cplusplus) && (__cplusplus >= 199711L)) -# define HEDLEY_INLINE inline -#elif \ - defined(HEDLEY_GCC_VERSION) || \ - HEDLEY_ARM_VERSION_CHECK(6,2,0) -# define HEDLEY_INLINE __inline__ -#elif \ - HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) 
|| \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,1,0) || \ - HEDLEY_TI_CL430_VERSION_CHECK(3,1,0) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,2,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_INLINE __inline -#else -# define HEDLEY_INLINE -#endif - -#if defined(HEDLEY_ALWAYS_INLINE) -# undef HEDLEY_ALWAYS_INLINE -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(always_inline) || \ - HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ - HEDLEY_IAR_VERSION_CHECK(8,10,0) -# define HEDLEY_ALWAYS_INLINE __attribute__((__always_inline__)) HEDLEY_INLINE -#elif \ - HEDLEY_MSVC_VERSION_CHECK(12,0,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_ALWAYS_INLINE __forceinline -#elif defined(__cplusplus) && \ - ( \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) \ - ) -# define HEDLEY_ALWAYS_INLINE _Pragma("FUNC_ALWAYS_INLINE;") -#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) -# define HEDLEY_ALWAYS_INLINE _Pragma("inline=forced") -#else -# define HEDLEY_ALWAYS_INLINE HEDLEY_INLINE -#endif - -#if defined(HEDLEY_NEVER_INLINE) -# undef HEDLEY_NEVER_INLINE -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(noinline) || \ - HEDLEY_GCC_VERSION_CHECK(4,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(10,1,0) || \ - HEDLEY_TI_VERSION_CHECK(15,12,0) || \ - (HEDLEY_TI_ARMCL_VERSION_CHECK(4,8,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(5,2,0) || \ - (HEDLEY_TI_CL2000_VERSION_CHECK(6,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(6,4,0) || \ - (HEDLEY_TI_CL430_VERSION_CHECK(4,0,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL430_VERSION_CHECK(4,3,0) || \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ - HEDLEY_IAR_VERSION_CHECK(8,10,0) -# define HEDLEY_NEVER_INLINE __attribute__((__noinline__)) -#elif \ - HEDLEY_MSVC_VERSION_CHECK(13,10,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_NEVER_INLINE __declspec(noinline) -#elif HEDLEY_PGI_VERSION_CHECK(10,2,0) -# define HEDLEY_NEVER_INLINE 
_Pragma("noinline") -#elif HEDLEY_TI_CL6X_VERSION_CHECK(6,0,0) && defined(__cplusplus) -# define HEDLEY_NEVER_INLINE _Pragma("FUNC_CANNOT_INLINE;") -#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) -# define HEDLEY_NEVER_INLINE _Pragma("inline=never") -#elif HEDLEY_COMPCERT_VERSION_CHECK(3,2,0) -# define HEDLEY_NEVER_INLINE __attribute((noinline)) -#elif HEDLEY_PELLES_VERSION_CHECK(9,0,0) -# define HEDLEY_NEVER_INLINE __declspec(noinline) -#else -# define HEDLEY_NEVER_INLINE -#endif - -#if defined(HEDLEY_PRIVATE) -# undef HEDLEY_PRIVATE -#endif -#if defined(HEDLEY_PUBLIC) -# undef HEDLEY_PUBLIC -#endif -#if defined(HEDLEY_IMPORT) -# undef HEDLEY_IMPORT -#endif -#if defined(_WIN32) || defined(__CYGWIN__) -# define HEDLEY_PRIVATE -# define HEDLEY_PUBLIC __declspec(dllexport) -# define HEDLEY_IMPORT __declspec(dllimport) -#else -# if \ - HEDLEY_HAS_ATTRIBUTE(visibility) || \ - HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,11,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ - ( \ - defined(__TI_EABI__) && \ - ( \ - (HEDLEY_TI_CL6X_VERSION_CHECK(7,2,0) && defined(__TI_GNU_ATTRIBUTE_SUPPORT__)) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(7,5,0) \ - ) \ - ) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_PRIVATE __attribute__((__visibility__("hidden"))) -# define HEDLEY_PUBLIC __attribute__((__visibility__("default"))) -# else -# define HEDLEY_PRIVATE -# define HEDLEY_PUBLIC -# endif -# define HEDLEY_IMPORT extern -#endif - -#if defined(HEDLEY_NO_THROW) -# undef HEDLEY_NO_THROW -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(nothrow) || \ - HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_NO_THROW __attribute__((__nothrow__)) -#elif \ - HEDLEY_MSVC_VERSION_CHECK(13,1,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) -# define HEDLEY_NO_THROW __declspec(nothrow) -#else -# define HEDLEY_NO_THROW -#endif - -#if defined(HEDLEY_FALL_THROUGH) -# undef HEDLEY_FALL_THROUGH -#endif -#if defined(HEDLEY_INTEL_VERSION) -# define HEDLEY_FALL_THROUGH -#elif \ - HEDLEY_HAS_ATTRIBUTE(fallthrough) || \ - HEDLEY_GCC_VERSION_CHECK(7,0,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_FALL_THROUGH __attribute__((__fallthrough__)) -#elif HEDLEY_HAS_CPP_ATTRIBUTE_NS(clang,fallthrough) -# define HEDLEY_FALL_THROUGH HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[clang::fallthrough]]) -#elif HEDLEY_HAS_CPP_ATTRIBUTE(fallthrough) -# define HEDLEY_FALL_THROUGH HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_([[fallthrough]]) -#elif defined(__fallthrough) /* SAL */ -# define HEDLEY_FALL_THROUGH __fallthrough -#else -# define HEDLEY_FALL_THROUGH -#endif - -#if defined(HEDLEY_RETURNS_NON_NULL) -# undef HEDLEY_RETURNS_NON_NULL -#endif -#if \ - HEDLEY_HAS_ATTRIBUTE(returns_nonnull) || \ - HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_RETURNS_NON_NULL __attribute__((__returns_nonnull__)) -#elif defined(_Ret_notnull_) /* SAL */ -# define HEDLEY_RETURNS_NON_NULL _Ret_notnull_ -#else -# define HEDLEY_RETURNS_NON_NULL -#endif - -#if defined(HEDLEY_ARRAY_PARAM) -# undef HEDLEY_ARRAY_PARAM -#endif -#if \ - defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) && \ - !defined(__STDC_NO_VLA__) && \ - !defined(__cplusplus) && \ - !defined(HEDLEY_PGI_VERSION) && \ - !defined(HEDLEY_TINYC_VERSION) -# define HEDLEY_ARRAY_PARAM(name) (name) -#else -# define 
HEDLEY_ARRAY_PARAM(name) -#endif - -#if defined(HEDLEY_IS_CONSTANT) -# undef HEDLEY_IS_CONSTANT -#endif -#if defined(HEDLEY_REQUIRE_CONSTEXPR) -# undef HEDLEY_REQUIRE_CONSTEXPR -#endif -/* HEDLEY_IS_CONSTEXPR_ is for - HEDLEY INTERNAL USE ONLY. API subject to change without notice. */ -#if defined(HEDLEY_IS_CONSTEXPR_) -# undef HEDLEY_IS_CONSTEXPR_ -#endif -#if \ - HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \ - HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ - (HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \ - HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define HEDLEY_IS_CONSTANT(expr) __builtin_constant_p(expr) -#endif -#if !defined(__cplusplus) -# if \ - HEDLEY_HAS_BUILTIN(__builtin_types_compatible_p) || \ - HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ - HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ - HEDLEY_ARM_VERSION_CHECK(5,4,0) || \ - HEDLEY_TINYC_VERSION_CHECK(0,9,24) -# if defined(__INTPTR_TYPE__) -# define HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0)), int*) -# else -# include -# define HEDLEY_IS_CONSTEXPR_(expr) __builtin_types_compatible_p(__typeof__((1 ? (void*) ((intptr_t) ((expr) * 0)) : (int*) 0)), int*) -# endif -# elif \ - ( \ - defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \ - !defined(HEDLEY_SUNPRO_VERSION) && \ - !defined(HEDLEY_PGI_VERSION) && \ - !defined(HEDLEY_IAR_VERSION)) || \ - (HEDLEY_HAS_EXTENSION(c_generic_selections) && !defined(HEDLEY_IAR_VERSION)) || \ - HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ - HEDLEY_INTEL_VERSION_CHECK(17,0,0) || \ - HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ - HEDLEY_ARM_VERSION_CHECK(5,3,0) -# if defined(__INTPTR_TYPE__) -# define HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((__INTPTR_TYPE__) ((expr) * 0)) : (int*) 0), int*: 1, void*: 0) -# else -# include -# define HEDLEY_IS_CONSTEXPR_(expr) _Generic((1 ? (void*) ((intptr_t) * 0) : (int*) 0), int*: 1, void*: 0) -# endif -# elif \ - defined(HEDLEY_GCC_VERSION) || \ - defined(HEDLEY_INTEL_VERSION) || \ - defined(HEDLEY_TINYC_VERSION) || \ - defined(HEDLEY_TI_ARMCL_VERSION) || \ - HEDLEY_TI_CL430_VERSION_CHECK(18,12,0) || \ - defined(HEDLEY_TI_CL2000_VERSION) || \ - defined(HEDLEY_TI_CL6X_VERSION) || \ - defined(HEDLEY_TI_CL7X_VERSION) || \ - defined(HEDLEY_TI_CLPRU_VERSION) || \ - defined(__clang__) -# define HEDLEY_IS_CONSTEXPR_(expr) ( \ - sizeof(void) != \ - sizeof(*( \ - 1 ? \ - ((void*) ((expr) * 0L) ) : \ - ((struct { char v[sizeof(void) * 2]; } *) 1) \ - ) \ - ) \ - ) -# endif -#endif -#if defined(HEDLEY_IS_CONSTEXPR_) -# if !defined(HEDLEY_IS_CONSTANT) -# define HEDLEY_IS_CONSTANT(expr) HEDLEY_IS_CONSTEXPR_(expr) -# endif -# define HEDLEY_REQUIRE_CONSTEXPR(expr) (HEDLEY_IS_CONSTEXPR_(expr) ? 
(expr) : (-1)) -#else -# if !defined(HEDLEY_IS_CONSTANT) -# define HEDLEY_IS_CONSTANT(expr) (0) -# endif -# define HEDLEY_REQUIRE_CONSTEXPR(expr) (expr) -#endif - -#if defined(HEDLEY_BEGIN_C_DECLS) -# undef HEDLEY_BEGIN_C_DECLS -#endif -#if defined(HEDLEY_END_C_DECLS) -# undef HEDLEY_END_C_DECLS -#endif -#if defined(HEDLEY_C_DECL) -# undef HEDLEY_C_DECL -#endif -#if defined(__cplusplus) -# define HEDLEY_BEGIN_C_DECLS extern "C" { -# define HEDLEY_END_C_DECLS } -# define HEDLEY_C_DECL extern "C" -#else -# define HEDLEY_BEGIN_C_DECLS -# define HEDLEY_END_C_DECLS -# define HEDLEY_C_DECL -#endif - -#if defined(HEDLEY_STATIC_ASSERT) -# undef HEDLEY_STATIC_ASSERT -#endif -#if \ - !defined(__cplusplus) && ( \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ - (HEDLEY_HAS_FEATURE(c_static_assert) && !defined(HEDLEY_INTEL_CL_VERSION)) || \ - HEDLEY_GCC_VERSION_CHECK(6,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - defined(_Static_assert) \ - ) -# define HEDLEY_STATIC_ASSERT(expr, message) _Static_assert(expr, message) -#elif \ - (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ - HEDLEY_MSVC_VERSION_CHECK(16,0,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_STATIC_ASSERT(expr, message) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message)) -#else -# define HEDLEY_STATIC_ASSERT(expr, message) -#endif - -#if defined(HEDLEY_NULL) -# undef HEDLEY_NULL -#endif -#if defined(__cplusplus) -# if __cplusplus >= 201103L -# define HEDLEY_NULL HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(nullptr) -# elif defined(NULL) -# define HEDLEY_NULL NULL -# else -# define HEDLEY_NULL HEDLEY_STATIC_CAST(void*, 0) -# endif -#elif defined(NULL) -# define HEDLEY_NULL NULL -#else -# define HEDLEY_NULL ((void*) 0) -#endif - -#if defined(HEDLEY_MESSAGE) -# undef HEDLEY_MESSAGE -#endif -#if HEDLEY_HAS_WARNING("-Wunknown-pragmas") -# define HEDLEY_MESSAGE(msg) \ - HEDLEY_DIAGNOSTIC_PUSH \ - HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ - HEDLEY_PRAGMA(message msg) \ - HEDLEY_DIAGNOSTIC_POP -#elif \ - HEDLEY_GCC_VERSION_CHECK(4,4,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(message msg) -#elif HEDLEY_CRAY_VERSION_CHECK(5,0,0) -# define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(_CRI message msg) -#elif HEDLEY_IAR_VERSION_CHECK(8,0,0) -# define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(message(msg)) -#elif HEDLEY_PELLES_VERSION_CHECK(2,0,0) -# define HEDLEY_MESSAGE(msg) HEDLEY_PRAGMA(message(msg)) -#else -# define HEDLEY_MESSAGE(msg) -#endif - -#if defined(HEDLEY_WARNING) -# undef HEDLEY_WARNING -#endif -#if HEDLEY_HAS_WARNING("-Wunknown-pragmas") -# define HEDLEY_WARNING(msg) \ - HEDLEY_DIAGNOSTIC_PUSH \ - HEDLEY_DIAGNOSTIC_DISABLE_UNKNOWN_PRAGMAS \ - HEDLEY_PRAGMA(clang warning msg) \ - HEDLEY_DIAGNOSTIC_POP -#elif \ - HEDLEY_GCC_VERSION_CHECK(4,8,0) || \ - HEDLEY_PGI_VERSION_CHECK(18,4,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) -# define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(GCC warning msg) -#elif \ - HEDLEY_MSVC_VERSION_CHECK(15,0,0) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_WARNING(msg) HEDLEY_PRAGMA(message(msg)) -#else -# define HEDLEY_WARNING(msg) HEDLEY_MESSAGE(msg) -#endif - -#if defined(HEDLEY_REQUIRE) -# undef HEDLEY_REQUIRE -#endif -#if defined(HEDLEY_REQUIRE_MSG) -# undef HEDLEY_REQUIRE_MSG -#endif -#if HEDLEY_HAS_ATTRIBUTE(diagnose_if) -# if HEDLEY_HAS_WARNING("-Wgcc-compat") -# define HEDLEY_REQUIRE(expr) \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ - 
__attribute__((diagnose_if(!(expr), #expr, "error"))) \ - HEDLEY_DIAGNOSTIC_POP -# define HEDLEY_REQUIRE_MSG(expr,msg) \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wgcc-compat\"") \ - __attribute__((diagnose_if(!(expr), msg, "error"))) \ - HEDLEY_DIAGNOSTIC_POP -# else -# define HEDLEY_REQUIRE(expr) __attribute__((diagnose_if(!(expr), #expr, "error"))) -# define HEDLEY_REQUIRE_MSG(expr,msg) __attribute__((diagnose_if(!(expr), msg, "error"))) -# endif -#else -# define HEDLEY_REQUIRE(expr) -# define HEDLEY_REQUIRE_MSG(expr,msg) -#endif - -#if defined(HEDLEY_FLAGS) -# undef HEDLEY_FLAGS -#endif -#if HEDLEY_HAS_ATTRIBUTE(flag_enum) && (!defined(__cplusplus) || HEDLEY_HAS_WARNING("-Wbitfield-enum-conversion")) -# define HEDLEY_FLAGS __attribute__((__flag_enum__)) -#else -# define HEDLEY_FLAGS -#endif - -#if defined(HEDLEY_FLAGS_CAST) -# undef HEDLEY_FLAGS_CAST -#endif -#if HEDLEY_INTEL_VERSION_CHECK(19,0,0) -# define HEDLEY_FLAGS_CAST(T, expr) (__extension__ ({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("warning(disable:188)") \ - ((T) (expr)); \ - HEDLEY_DIAGNOSTIC_POP \ - })) -#else -# define HEDLEY_FLAGS_CAST(T, expr) HEDLEY_STATIC_CAST(T, expr) -#endif - -#if defined(HEDLEY_EMPTY_BASES) -# undef HEDLEY_EMPTY_BASES -#endif -#if \ - (HEDLEY_MSVC_VERSION_CHECK(19,0,23918) && !HEDLEY_MSVC_VERSION_CHECK(20,0,0)) || \ - HEDLEY_INTEL_CL_VERSION_CHECK(2021,1,0) -# define HEDLEY_EMPTY_BASES __declspec(empty_bases) -#else -# define HEDLEY_EMPTY_BASES -#endif - -/* Remaining macros are deprecated. */ - -#if defined(HEDLEY_GCC_NOT_CLANG_VERSION_CHECK) -# undef HEDLEY_GCC_NOT_CLANG_VERSION_CHECK -#endif -#if defined(__clang__) -# define HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) (0) -#else -# define HEDLEY_GCC_NOT_CLANG_VERSION_CHECK(major,minor,patch) HEDLEY_GCC_VERSION_CHECK(major,minor,patch) -#endif - -#if defined(HEDLEY_CLANG_HAS_ATTRIBUTE) -# undef HEDLEY_CLANG_HAS_ATTRIBUTE -#endif -#define HEDLEY_CLANG_HAS_ATTRIBUTE(attribute) HEDLEY_HAS_ATTRIBUTE(attribute) - -#if defined(HEDLEY_CLANG_HAS_CPP_ATTRIBUTE) -# undef HEDLEY_CLANG_HAS_CPP_ATTRIBUTE -#endif -#define HEDLEY_CLANG_HAS_CPP_ATTRIBUTE(attribute) HEDLEY_HAS_CPP_ATTRIBUTE(attribute) - -#if defined(HEDLEY_CLANG_HAS_BUILTIN) -# undef HEDLEY_CLANG_HAS_BUILTIN -#endif -#define HEDLEY_CLANG_HAS_BUILTIN(builtin) HEDLEY_HAS_BUILTIN(builtin) - -#if defined(HEDLEY_CLANG_HAS_FEATURE) -# undef HEDLEY_CLANG_HAS_FEATURE -#endif -#define HEDLEY_CLANG_HAS_FEATURE(feature) HEDLEY_HAS_FEATURE(feature) - -#if defined(HEDLEY_CLANG_HAS_EXTENSION) -# undef HEDLEY_CLANG_HAS_EXTENSION -#endif -#define HEDLEY_CLANG_HAS_EXTENSION(extension) HEDLEY_HAS_EXTENSION(extension) - -#if defined(HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE) -# undef HEDLEY_CLANG_HAS_DECLSPEC_DECLSPEC_ATTRIBUTE -#endif -#define HEDLEY_CLANG_HAS_DECLSPEC_ATTRIBUTE(attribute) HEDLEY_HAS_DECLSPEC_ATTRIBUTE(attribute) - -#if defined(HEDLEY_CLANG_HAS_WARNING) -# undef HEDLEY_CLANG_HAS_WARNING -#endif -#define HEDLEY_CLANG_HAS_WARNING(warning) HEDLEY_HAS_WARNING(warning) - -#endif /* !defined(HEDLEY_VERSION) || (HEDLEY_VERSION < X) */ diff --git a/ffi-deps/simde/simde/simde-aes.h b/ffi-deps/simde/simde/simde-aes.h deleted file mode 100644 index 3ba650e..0000000 --- a/ffi-deps/simde/simde/simde-aes.h +++ /dev/null @@ -1,265 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * 
restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) - */ - -#if !defined(SIMDE_AES_H) -#define SIMDE_AES_H - -#include "simde-features.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS - -#if !(defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO)) - -/* - * Number of columns (32-bit words) comprising the State. For this - * standard, Nb = 4. - */ -#define simde_x_aes_Nb 4 - -static uint8_t simde_x_aes_gmult_lookup_table[8][256] = { -{ // gmult(0x02, b); - 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e, - 0x20, 0x22, 0x24, 0x26, 0x28, 0x2a, 0x2c, 0x2e, 0x30, 0x32, 0x34, 0x36, 0x38, 0x3a, 0x3c, 0x3e, - 0x40, 0x42, 0x44, 0x46, 0x48, 0x4a, 0x4c, 0x4e, 0x50, 0x52, 0x54, 0x56, 0x58, 0x5a, 0x5c, 0x5e, - 0x60, 0x62, 0x64, 0x66, 0x68, 0x6a, 0x6c, 0x6e, 0x70, 0x72, 0x74, 0x76, 0x78, 0x7a, 0x7c, 0x7e, - 0x80, 0x82, 0x84, 0x86, 0x88, 0x8a, 0x8c, 0x8e, 0x90, 0x92, 0x94, 0x96, 0x98, 0x9a, 0x9c, 0x9e, - 0xa0, 0xa2, 0xa4, 0xa6, 0xa8, 0xaa, 0xac, 0xae, 0xb0, 0xb2, 0xb4, 0xb6, 0xb8, 0xba, 0xbc, 0xbe, - 0xc0, 0xc2, 0xc4, 0xc6, 0xc8, 0xca, 0xcc, 0xce, 0xd0, 0xd2, 0xd4, 0xd6, 0xd8, 0xda, 0xdc, 0xde, - 0xe0, 0xe2, 0xe4, 0xe6, 0xe8, 0xea, 0xec, 0xee, 0xf0, 0xf2, 0xf4, 0xf6, 0xf8, 0xfa, 0xfc, 0xfe, - 0x1b, 0x19, 0x1f, 0x1d, 0x13, 0x11, 0x17, 0x15, 0x0b, 0x09, 0x0f, 0x0d, 0x03, 0x01, 0x07, 0x05, - 0x3b, 0x39, 0x3f, 0x3d, 0x33, 0x31, 0x37, 0x35, 0x2b, 0x29, 0x2f, 0x2d, 0x23, 0x21, 0x27, 0x25, - 0x5b, 0x59, 0x5f, 0x5d, 0x53, 0x51, 0x57, 0x55, 0x4b, 0x49, 0x4f, 0x4d, 0x43, 0x41, 0x47, 0x45, - 0x7b, 0x79, 0x7f, 0x7d, 0x73, 0x71, 0x77, 0x75, 0x6b, 0x69, 0x6f, 0x6d, 0x63, 0x61, 0x67, 0x65, - 0x9b, 0x99, 0x9f, 0x9d, 0x93, 0x91, 0x97, 0x95, 0x8b, 0x89, 0x8f, 0x8d, 0x83, 0x81, 0x87, 0x85, - 0xbb, 0xb9, 0xbf, 0xbd, 0xb3, 0xb1, 0xb7, 0xb5, 0xab, 0xa9, 0xaf, 0xad, 0xa3, 0xa1, 0xa7, 0xa5, - 0xdb, 0xd9, 0xdf, 0xdd, 0xd3, 0xd1, 0xd7, 0xd5, 0xcb, 0xc9, 0xcf, 0xcd, 0xc3, 0xc1, 0xc7, 0xc5, - 0xfb, 0xf9, 0xff, 0xfd, 0xf3, 0xf1, 0xf7, 0xf5, 0xeb, 0xe9, 0xef, 0xed, 0xe3, 0xe1, 0xe7, 0xe5 -}, -{ // gmult(0x01, b); - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, - 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, - 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, - 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, - 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 
0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, - 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, - 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, - 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, - 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, - 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, - 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, - 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, - 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, - 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, - 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, -}, -{ // gmult(0x01, b); - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, - 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, - 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, - 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, - 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, - 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, - 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, - 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, - 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, - 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, - 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, - 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, - 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, - 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, - 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, -}, -{ // gmult(0x03, b); - 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11, - 0x30, 0x33, 0x36, 0x35, 0x3c, 0x3f, 0x3a, 0x39, 0x28, 0x2b, 0x2e, 0x2d, 0x24, 0x27, 0x22, 0x21, - 0x60, 0x63, 0x66, 0x65, 0x6c, 0x6f, 0x6a, 0x69, 0x78, 0x7b, 0x7e, 0x7d, 0x74, 0x77, 0x72, 0x71, - 0x50, 0x53, 0x56, 0x55, 0x5c, 0x5f, 0x5a, 0x59, 0x48, 0x4b, 0x4e, 0x4d, 0x44, 0x47, 0x42, 0x41, - 0xc0, 0xc3, 0xc6, 0xc5, 0xcc, 0xcf, 0xca, 0xc9, 0xd8, 0xdb, 0xde, 0xdd, 0xd4, 0xd7, 0xd2, 0xd1, - 0xf0, 0xf3, 0xf6, 0xf5, 0xfc, 0xff, 0xfa, 0xf9, 0xe8, 0xeb, 0xee, 0xed, 0xe4, 0xe7, 0xe2, 0xe1, - 0xa0, 0xa3, 0xa6, 0xa5, 0xac, 0xaf, 0xaa, 0xa9, 0xb8, 0xbb, 0xbe, 0xbd, 0xb4, 0xb7, 0xb2, 0xb1, - 0x90, 0x93, 0x96, 0x95, 0x9c, 0x9f, 0x9a, 0x99, 0x88, 0x8b, 0x8e, 0x8d, 0x84, 0x87, 0x82, 0x81, - 0x9b, 0x98, 0x9d, 0x9e, 0x97, 0x94, 0x91, 0x92, 0x83, 0x80, 0x85, 0x86, 0x8f, 0x8c, 0x89, 0x8a, - 0xab, 0xa8, 0xad, 
0xae, 0xa7, 0xa4, 0xa1, 0xa2, 0xb3, 0xb0, 0xb5, 0xb6, 0xbf, 0xbc, 0xb9, 0xba, - 0xfb, 0xf8, 0xfd, 0xfe, 0xf7, 0xf4, 0xf1, 0xf2, 0xe3, 0xe0, 0xe5, 0xe6, 0xef, 0xec, 0xe9, 0xea, - 0xcb, 0xc8, 0xcd, 0xce, 0xc7, 0xc4, 0xc1, 0xc2, 0xd3, 0xd0, 0xd5, 0xd6, 0xdf, 0xdc, 0xd9, 0xda, - 0x5b, 0x58, 0x5d, 0x5e, 0x57, 0x54, 0x51, 0x52, 0x43, 0x40, 0x45, 0x46, 0x4f, 0x4c, 0x49, 0x4a, - 0x6b, 0x68, 0x6d, 0x6e, 0x67, 0x64, 0x61, 0x62, 0x73, 0x70, 0x75, 0x76, 0x7f, 0x7c, 0x79, 0x7a, - 0x3b, 0x38, 0x3d, 0x3e, 0x37, 0x34, 0x31, 0x32, 0x23, 0x20, 0x25, 0x26, 0x2f, 0x2c, 0x29, 0x2a, - 0x0b, 0x08, 0x0d, 0x0e, 0x07, 0x04, 0x01, 0x02, 0x13, 0x10, 0x15, 0x16, 0x1f, 0x1c, 0x19, 0x1a, -}, -{ // gmult(0x0e, b); - 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a, - 0xe0, 0xee, 0xfc, 0xf2, 0xd8, 0xd6, 0xc4, 0xca, 0x90, 0x9e, 0x8c, 0x82, 0xa8, 0xa6, 0xb4, 0xba, - 0xdb, 0xd5, 0xc7, 0xc9, 0xe3, 0xed, 0xff, 0xf1, 0xab, 0xa5, 0xb7, 0xb9, 0x93, 0x9d, 0x8f, 0x81, - 0x3b, 0x35, 0x27, 0x29, 0x03, 0x0d, 0x1f, 0x11, 0x4b, 0x45, 0x57, 0x59, 0x73, 0x7d, 0x6f, 0x61, - 0xad, 0xa3, 0xb1, 0xbf, 0x95, 0x9b, 0x89, 0x87, 0xdd, 0xd3, 0xc1, 0xcf, 0xe5, 0xeb, 0xf9, 0xf7, - 0x4d, 0x43, 0x51, 0x5f, 0x75, 0x7b, 0x69, 0x67, 0x3d, 0x33, 0x21, 0x2f, 0x05, 0x0b, 0x19, 0x17, - 0x76, 0x78, 0x6a, 0x64, 0x4e, 0x40, 0x52, 0x5c, 0x06, 0x08, 0x1a, 0x14, 0x3e, 0x30, 0x22, 0x2c, - 0x96, 0x98, 0x8a, 0x84, 0xae, 0xa0, 0xb2, 0xbc, 0xe6, 0xe8, 0xfa, 0xf4, 0xde, 0xd0, 0xc2, 0xcc, - 0x41, 0x4f, 0x5d, 0x53, 0x79, 0x77, 0x65, 0x6b, 0x31, 0x3f, 0x2d, 0x23, 0x09, 0x07, 0x15, 0x1b, - 0xa1, 0xaf, 0xbd, 0xb3, 0x99, 0x97, 0x85, 0x8b, 0xd1, 0xdf, 0xcd, 0xc3, 0xe9, 0xe7, 0xf5, 0xfb, - 0x9a, 0x94, 0x86, 0x88, 0xa2, 0xac, 0xbe, 0xb0, 0xea, 0xe4, 0xf6, 0xf8, 0xd2, 0xdc, 0xce, 0xc0, - 0x7a, 0x74, 0x66, 0x68, 0x42, 0x4c, 0x5e, 0x50, 0x0a, 0x04, 0x16, 0x18, 0x32, 0x3c, 0x2e, 0x20, - 0xec, 0xe2, 0xf0, 0xfe, 0xd4, 0xda, 0xc8, 0xc6, 0x9c, 0x92, 0x80, 0x8e, 0xa4, 0xaa, 0xb8, 0xb6, - 0x0c, 0x02, 0x10, 0x1e, 0x34, 0x3a, 0x28, 0x26, 0x7c, 0x72, 0x60, 0x6e, 0x44, 0x4a, 0x58, 0x56, - 0x37, 0x39, 0x2b, 0x25, 0x0f, 0x01, 0x13, 0x1d, 0x47, 0x49, 0x5b, 0x55, 0x7f, 0x71, 0x63, 0x6d, - 0xd7, 0xd9, 0xcb, 0xc5, 0xef, 0xe1, 0xf3, 0xfd, 0xa7, 0xa9, 0xbb, 0xb5, 0x9f, 0x91, 0x83, 0x8d, -}, -{ // gmult(0x09, b); - 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77, - 0x90, 0x99, 0x82, 0x8b, 0xb4, 0xbd, 0xa6, 0xaf, 0xd8, 0xd1, 0xca, 0xc3, 0xfc, 0xf5, 0xee, 0xe7, - 0x3b, 0x32, 0x29, 0x20, 0x1f, 0x16, 0x0d, 0x04, 0x73, 0x7a, 0x61, 0x68, 0x57, 0x5e, 0x45, 0x4c, - 0xab, 0xa2, 0xb9, 0xb0, 0x8f, 0x86, 0x9d, 0x94, 0xe3, 0xea, 0xf1, 0xf8, 0xc7, 0xce, 0xd5, 0xdc, - 0x76, 0x7f, 0x64, 0x6d, 0x52, 0x5b, 0x40, 0x49, 0x3e, 0x37, 0x2c, 0x25, 0x1a, 0x13, 0x08, 0x01, - 0xe6, 0xef, 0xf4, 0xfd, 0xc2, 0xcb, 0xd0, 0xd9, 0xae, 0xa7, 0xbc, 0xb5, 0x8a, 0x83, 0x98, 0x91, - 0x4d, 0x44, 0x5f, 0x56, 0x69, 0x60, 0x7b, 0x72, 0x05, 0x0c, 0x17, 0x1e, 0x21, 0x28, 0x33, 0x3a, - 0xdd, 0xd4, 0xcf, 0xc6, 0xf9, 0xf0, 0xeb, 0xe2, 0x95, 0x9c, 0x87, 0x8e, 0xb1, 0xb8, 0xa3, 0xaa, - 0xec, 0xe5, 0xfe, 0xf7, 0xc8, 0xc1, 0xda, 0xd3, 0xa4, 0xad, 0xb6, 0xbf, 0x80, 0x89, 0x92, 0x9b, - 0x7c, 0x75, 0x6e, 0x67, 0x58, 0x51, 0x4a, 0x43, 0x34, 0x3d, 0x26, 0x2f, 0x10, 0x19, 0x02, 0x0b, - 0xd7, 0xde, 0xc5, 0xcc, 0xf3, 0xfa, 0xe1, 0xe8, 0x9f, 0x96, 0x8d, 0x84, 0xbb, 0xb2, 0xa9, 0xa0, - 0x47, 0x4e, 0x55, 0x5c, 0x63, 0x6a, 0x71, 0x78, 0x0f, 0x06, 0x1d, 0x14, 0x2b, 0x22, 0x39, 0x30, - 0x9a, 0x93, 0x88, 0x81, 0xbe, 0xb7, 0xac, 0xa5, 0xd2, 0xdb, 0xc0, 0xc9, 0xf6, 0xff, 0xe4, 
0xed, - 0x0a, 0x03, 0x18, 0x11, 0x2e, 0x27, 0x3c, 0x35, 0x42, 0x4b, 0x50, 0x59, 0x66, 0x6f, 0x74, 0x7d, - 0xa1, 0xa8, 0xb3, 0xba, 0x85, 0x8c, 0x97, 0x9e, 0xe9, 0xe0, 0xfb, 0xf2, 0xcd, 0xc4, 0xdf, 0xd6, - 0x31, 0x38, 0x23, 0x2a, 0x15, 0x1c, 0x07, 0x0e, 0x79, 0x70, 0x6b, 0x62, 0x5d, 0x54, 0x4f, 0x46, - -}, -{ // gmult(0x0d, b); - 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b, - 0xd0, 0xdd, 0xca, 0xc7, 0xe4, 0xe9, 0xfe, 0xf3, 0xb8, 0xb5, 0xa2, 0xaf, 0x8c, 0x81, 0x96, 0x9b, - 0xbb, 0xb6, 0xa1, 0xac, 0x8f, 0x82, 0x95, 0x98, 0xd3, 0xde, 0xc9, 0xc4, 0xe7, 0xea, 0xfd, 0xf0, - 0x6b, 0x66, 0x71, 0x7c, 0x5f, 0x52, 0x45, 0x48, 0x03, 0x0e, 0x19, 0x14, 0x37, 0x3a, 0x2d, 0x20, - 0x6d, 0x60, 0x77, 0x7a, 0x59, 0x54, 0x43, 0x4e, 0x05, 0x08, 0x1f, 0x12, 0x31, 0x3c, 0x2b, 0x26, - 0xbd, 0xb0, 0xa7, 0xaa, 0x89, 0x84, 0x93, 0x9e, 0xd5, 0xd8, 0xcf, 0xc2, 0xe1, 0xec, 0xfb, 0xf6, - 0xd6, 0xdb, 0xcc, 0xc1, 0xe2, 0xef, 0xf8, 0xf5, 0xbe, 0xb3, 0xa4, 0xa9, 0x8a, 0x87, 0x90, 0x9d, - 0x06, 0x0b, 0x1c, 0x11, 0x32, 0x3f, 0x28, 0x25, 0x6e, 0x63, 0x74, 0x79, 0x5a, 0x57, 0x40, 0x4d, - 0xda, 0xd7, 0xc0, 0xcd, 0xee, 0xe3, 0xf4, 0xf9, 0xb2, 0xbf, 0xa8, 0xa5, 0x86, 0x8b, 0x9c, 0x91, - 0x0a, 0x07, 0x10, 0x1d, 0x3e, 0x33, 0x24, 0x29, 0x62, 0x6f, 0x78, 0x75, 0x56, 0x5b, 0x4c, 0x41, - 0x61, 0x6c, 0x7b, 0x76, 0x55, 0x58, 0x4f, 0x42, 0x09, 0x04, 0x13, 0x1e, 0x3d, 0x30, 0x27, 0x2a, - 0xb1, 0xbc, 0xab, 0xa6, 0x85, 0x88, 0x9f, 0x92, 0xd9, 0xd4, 0xc3, 0xce, 0xed, 0xe0, 0xf7, 0xfa, - 0xb7, 0xba, 0xad, 0xa0, 0x83, 0x8e, 0x99, 0x94, 0xdf, 0xd2, 0xc5, 0xc8, 0xeb, 0xe6, 0xf1, 0xfc, - 0x67, 0x6a, 0x7d, 0x70, 0x53, 0x5e, 0x49, 0x44, 0x0f, 0x02, 0x15, 0x18, 0x3b, 0x36, 0x21, 0x2c, - 0x0c, 0x01, 0x16, 0x1b, 0x38, 0x35, 0x22, 0x2f, 0x64, 0x69, 0x7e, 0x73, 0x50, 0x5d, 0x4a, 0x47, - 0xdc, 0xd1, 0xc6, 0xcb, 0xe8, 0xe5, 0xf2, 0xff, 0xb4, 0xb9, 0xae, 0xa3, 0x80, 0x8d, 0x9a, 0x97, -}, -{ // gmult(0x0b, b); - 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69, - 0xb0, 0xbb, 0xa6, 0xad, 0x9c, 0x97, 0x8a, 0x81, 0xe8, 0xe3, 0xfe, 0xf5, 0xc4, 0xcf, 0xd2, 0xd9, - 0x7b, 0x70, 0x6d, 0x66, 0x57, 0x5c, 0x41, 0x4a, 0x23, 0x28, 0x35, 0x3e, 0x0f, 0x04, 0x19, 0x12, - 0xcb, 0xc0, 0xdd, 0xd6, 0xe7, 0xec, 0xf1, 0xfa, 0x93, 0x98, 0x85, 0x8e, 0xbf, 0xb4, 0xa9, 0xa2, - 0xf6, 0xfd, 0xe0, 0xeb, 0xda, 0xd1, 0xcc, 0xc7, 0xae, 0xa5, 0xb8, 0xb3, 0x82, 0x89, 0x94, 0x9f, - 0x46, 0x4d, 0x50, 0x5b, 0x6a, 0x61, 0x7c, 0x77, 0x1e, 0x15, 0x08, 0x03, 0x32, 0x39, 0x24, 0x2f, - 0x8d, 0x86, 0x9b, 0x90, 0xa1, 0xaa, 0xb7, 0xbc, 0xd5, 0xde, 0xc3, 0xc8, 0xf9, 0xf2, 0xef, 0xe4, - 0x3d, 0x36, 0x2b, 0x20, 0x11, 0x1a, 0x07, 0x0c, 0x65, 0x6e, 0x73, 0x78, 0x49, 0x42, 0x5f, 0x54, - 0xf7, 0xfc, 0xe1, 0xea, 0xdb, 0xd0, 0xcd, 0xc6, 0xaf, 0xa4, 0xb9, 0xb2, 0x83, 0x88, 0x95, 0x9e, - 0x47, 0x4c, 0x51, 0x5a, 0x6b, 0x60, 0x7d, 0x76, 0x1f, 0x14, 0x09, 0x02, 0x33, 0x38, 0x25, 0x2e, - 0x8c, 0x87, 0x9a, 0x91, 0xa0, 0xab, 0xb6, 0xbd, 0xd4, 0xdf, 0xc2, 0xc9, 0xf8, 0xf3, 0xee, 0xe5, - 0x3c, 0x37, 0x2a, 0x21, 0x10, 0x1b, 0x06, 0x0d, 0x64, 0x6f, 0x72, 0x79, 0x48, 0x43, 0x5e, 0x55, - 0x01, 0x0a, 0x17, 0x1c, 0x2d, 0x26, 0x3b, 0x30, 0x59, 0x52, 0x4f, 0x44, 0x75, 0x7e, 0x63, 0x68, - 0xb1, 0xba, 0xa7, 0xac, 0x9d, 0x96, 0x8b, 0x80, 0xe9, 0xe2, 0xff, 0xf4, 0xc5, 0xce, 0xd3, 0xd8, - 0x7a, 0x71, 0x6c, 0x67, 0x56, 0x5d, 0x40, 0x4b, 0x22, 0x29, 0x34, 0x3f, 0x0e, 0x05, 0x18, 0x13, - 0xca, 0xc1, 0xdc, 0xd7, 0xe6, 0xed, 0xf0, 0xfb, 0x92, 0x99, 0x84, 0x8f, 0xbe, 0xb5, 0xa8, 0xa3, -} -}; - -/* - * S-box transformation table - */ -static uint8_t 
simde_x_aes_s_box[256] = { - // 0 1 2 3 4 5 6 7 8 9 a b c d e f - 0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76, // 0 - 0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0, // 1 - 0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15, // 2 - 0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75, // 3 - 0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84, // 4 - 0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf, // 5 - 0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8, // 6 - 0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2, // 7 - 0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73, // 8 - 0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb, // 9 - 0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79, // a - 0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08, // b - 0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a, // c - 0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e, // d - 0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf, // e - 0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16};// f - -/* - * Inverse S-box transformation table - */ -static uint8_t simde_x_aes_inv_s_box[256] = { - // 0 1 2 3 4 5 6 7 8 9 a b c d e f - 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb, // 0 - 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, // 1 - 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, // 2 - 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, // 3 - 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, // 4 - 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84, // 5 - 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, // 6 - 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, // 7 - 0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73, // 8 - 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e, // 9 - 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, // a - 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, // b - 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f, // c - 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, // d - 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, // e - 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d};// f - -/* - * Multiplication of 4 byte words - * m(x) = x4+1 - 
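 *
 * [Editor's illustrative aside -- not part of the upstream simde-aes.h text being removed.]
 * The eight lookup tables above cache gmult(c, b): multiplication in GF(2^8) with the AES
 * reduction polynomial x^8 + x^4 + x^3 + x + 1 (0x11b). A worked entry: gmult(0x02, 0x80)
 * doubles 0x80 to 0x100, which overflows eight bits and is reduced by XOR with 0x11b,
 * giving 0x1b -- the value stored at index 0x80 of the first table ("gmult(0x02, b)").
 * Encryption's MixColumns multiplies each state column by the constants {02, 03, 01, 01};
 * decryption's InvMixColumns uses {0e, 0b, 0d, 09}, which is why exactly these eight
 * multipliers are precomputed.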
-SIMDE_FUNCTION_ATTRIBUTES -void coef_mult(uint8_t *a, uint8_t *b, uint8_t *d) { - - d[0] = gmult(a[0],b[0])^gmult(a[3],b[1])^gmult(a[2],b[2])^gmult(a[1],b[3]); - d[1] = gmult(a[1],b[0])^gmult(a[0],b[1])^gmult(a[3],b[2])^gmult(a[2],b[3]); - d[2] = gmult(a[2],b[0])^gmult(a[1],b[1])^gmult(a[0],b[2])^gmult(a[3],b[3]); - d[3] = gmult(a[3],b[0])^gmult(a[2],b[1])^gmult(a[1],b[2])^gmult(a[0],b[3]); -} -*/ - -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_coef_mult_lookup(int lookup_table_offset, uint8_t *b, uint8_t *d) { - int o = lookup_table_offset; - - #define gmultl(o,b) simde_x_aes_gmult_lookup_table[o][b] - d[0] = gmultl(o+0,b[0])^gmultl(o+3,b[1])^gmultl(o+2,b[2])^gmultl(o+1,b[3]); - d[1] = gmultl(o+1,b[0])^gmultl(o+0,b[1])^gmultl(o+3,b[2])^gmultl(o+2,b[3]); - d[2] = gmultl(o+2,b[0])^gmultl(o+1,b[1])^gmultl(o+0,b[2])^gmultl(o+3,b[3]); - d[3] = gmultl(o+3,b[0])^gmultl(o+2,b[1])^gmultl(o+1,b[2])^gmultl(o+0,b[3]); - #undef gmultl -} - -#endif - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_AES_H) */ diff --git a/ffi-deps/simde/simde/simde-align.h b/ffi-deps/simde/simde/simde-align.h deleted file mode 100644 index 0c8a809..0000000 --- a/ffi-deps/simde/simde/simde-align.h +++ /dev/null @@ -1,450 +0,0 @@ -/* Alignment - * Created by Evan Nemerson - * - * To the extent possible under law, the authors have waived all - * copyright and related or neighboring rights to this code. For - * details, see the Creative Commons Zero 1.0 Universal license at - * - * - * SPDX-License-Identifier: CC0-1.0 - * - ********************************************************************** - * - * This is portability layer which should help iron out some - * differences across various compilers, as well as various verisons of - * C and C++. - * - * It was originally developed for SIMD Everywhere - * (), but since its only - * dependency is Hedley (, also CC0) - * it can easily be used in other projects, so please feel free to do - * so. - * - * If you do use this in your project, please keep a link to SIMDe in - * your code to remind you where to report any bugs and/or check for - * updated versions. - * - * # API Overview - * - * The API has several parts, and most macros have a few variations. - * There are APIs for declaring aligned fields/variables, optimization - * hints, and run-time alignment checks. - * - * Briefly, macros ending with "_TO" take numeric values and are great - * when you know the value you would like to use. Macros ending with - * "_LIKE", on the other hand, accept a type and are used when you want - * to use the alignment of a type instead of hardcoding a value. - * - * Documentation for each section of the API is inline. - * - * True to form, MSVC is the main problem and imposes several - * limitations on the effectiveness of the APIs. Detailed descriptions - * of the limitations of each macro are inline, but in general: - * - * * On C11+ or C++11+ code written using this API will work. The - * ASSUME macros may or may not generate a hint to the compiler, but - * that is only an optimization issue and will not actually cause - * failures. - * * If you're using pretty much any compiler other than MSVC, - * everything should basically work as well as in C11/C++11. - */ - -#if !defined(SIMDE_ALIGN_H) -#define SIMDE_ALIGN_H - -#include "hedley.h" - -/* I know this seems a little silly, but some non-hosted compilers - * don't have stddef.h, so we try to accomodate them. 
*/ -#if !defined(SIMDE_ALIGN_SIZE_T_) - #if defined(__SIZE_TYPE__) - #define SIMDE_ALIGN_SIZE_T_ __SIZE_TYPE__ - #elif defined(__SIZE_T_TYPE__) - #define SIMDE_ALIGN_SIZE_T_ __SIZE_TYPE__ - #elif defined(__cplusplus) - #include - #define SIMDE_ALIGN_SIZE_T_ size_t - #else - #include - #define SIMDE_ALIGN_SIZE_T_ size_t - #endif -#endif - -#if !defined(SIMDE_ALIGN_INTPTR_T_) - #if defined(__INTPTR_TYPE__) - #define SIMDE_ALIGN_INTPTR_T_ __INTPTR_TYPE__ - #elif defined(__PTRDIFF_TYPE__) - #define SIMDE_ALIGN_INTPTR_T_ __PTRDIFF_TYPE__ - #elif defined(__PTRDIFF_T_TYPE__) - #define SIMDE_ALIGN_INTPTR_T_ __PTRDIFF_T_TYPE__ - #elif defined(__cplusplus) - #include - #define SIMDE_ALIGN_INTPTR_T_ ptrdiff_t - #else - #include - #define SIMDE_ALIGN_INTPTR_T_ ptrdiff_t - #endif -#endif - -#if defined(SIMDE_ALIGN_DEBUG) - #if defined(__cplusplus) - #include - #else - #include - #endif -#endif - -/* SIMDE_ALIGN_OF(Type) - * - * The SIMDE_ALIGN_OF macro works like alignof, or _Alignof, or - * __alignof, or __alignof__, or __ALIGNOF__, depending on the compiler. - * It isn't defined everywhere (only when the compiler has some alignof- - * like feature we can use to implement it), but it should work in most - * modern compilers, as well as C11 and C++11. - * - * If we can't find an implementation for SIMDE_ALIGN_OF then the macro - * will not be defined, so if you can handle that situation sensibly - * you may need to sprinkle some ifdefs into your code. - */ -#if \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ - (0 && HEDLEY_HAS_FEATURE(c_alignof)) - #define SIMDE_ALIGN_OF(Type) _Alignof(Type) -#elif \ - (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ - (0 && HEDLEY_HAS_FEATURE(cxx_alignof)) - #define SIMDE_ALIGN_OF(Type) alignof(Type) -#elif \ - HEDLEY_GCC_VERSION_CHECK(2,95,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,13,0) || \ - HEDLEY_TINYC_VERSION_CHECK(0,9,24) || \ - HEDLEY_PGI_VERSION_CHECK(19,10,0) || \ - HEDLEY_CRAY_VERSION_CHECK(10,0,0) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(16,9,0) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(16,9,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CL430_VERSION_CHECK(16,9,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,2) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) || \ - defined(__IBM__ALIGNOF__) || \ - defined(__clang__) - #define SIMDE_ALIGN_OF(Type) __alignof__(Type) -#elif \ - HEDLEY_IAR_VERSION_CHECK(8,40,0) - #define SIMDE_ALIGN_OF(Type) __ALIGNOF__(Type) -#elif \ - HEDLEY_MSVC_VERSION_CHECK(19,0,0) - /* Probably goes back much further, but MS takes down their old docs. - * If you can verify that this works in earlier versions please let - * me know! */ - #define SIMDE_ALIGN_OF(Type) __alignof(Type) -#endif - -/* SIMDE_ALIGN_MAXIMUM: - * - * This is the maximum alignment that the compiler supports. You can - * define the value prior to including SIMDe if necessary, but in that - * case *please* submit an issue so we can add the platform to the - * detection code. - * - * Most compilers are okay with types which are aligned beyond what - * they think is the maximum, as long as the alignment is a power - * of two. Older versions of MSVC is the exception, so we need to cap - * the alignment requests at values that the implementation supports. 
- * - * XL C/C++ will accept values larger than 16 (which is the alignment - * of an AltiVec vector), but will not reliably align to the larger - * value, so so we cap the value at 16 there. - * - * If the compiler accepts any power-of-two value within reason then - * this macro should be left undefined, and the SIMDE_ALIGN_CAP - * macro will just return the value passed to it. */ -#if !defined(SIMDE_ALIGN_MAXIMUM) - #if defined(HEDLEY_MSVC_VERSION) - #if HEDLEY_MSVC_VERSION_CHECK(19, 16, 0) - // Visual studio 2017 and newer does not need a max - #else - #if defined(_M_IX86) || defined(_M_AMD64) - #if HEDLEY_MSVC_VERSION_CHECK(19,14,0) - #define SIMDE_ALIGN_PLATFORM_MAXIMUM 64 - #elif HEDLEY_MSVC_VERSION_CHECK(16,0,0) - /* VS 2010 is really a guess based on Wikipedia; if anyone can - * test with old VS versions I'd really appreciate it. */ - #define SIMDE_ALIGN_PLATFORM_MAXIMUM 32 - #else - #define SIMDE_ALIGN_PLATFORM_MAXIMUM 16 - #endif - #elif defined(_M_ARM) || defined(_M_ARM64) - #define SIMDE_ALIGN_PLATFORM_MAXIMUM 8 - #endif - #endif - #elif defined(HEDLEY_IBM_VERSION) - #define SIMDE_ALIGN_PLATFORM_MAXIMUM 16 - #endif -#endif - -/* You can mostly ignore these; they're intended for internal use. - * If you do need to use them please let me know; if they fulfill - * a common use case I'll probably drop the trailing underscore - * and make them part of the public API. */ -#if defined(SIMDE_ALIGN_PLATFORM_MAXIMUM) - #if SIMDE_ALIGN_PLATFORM_MAXIMUM >= 64 - #define SIMDE_ALIGN_64_ 64 - #define SIMDE_ALIGN_32_ 32 - #define SIMDE_ALIGN_16_ 16 - #define SIMDE_ALIGN_8_ 8 - #elif SIMDE_ALIGN_PLATFORM_MAXIMUM >= 32 - #define SIMDE_ALIGN_64_ 32 - #define SIMDE_ALIGN_32_ 32 - #define SIMDE_ALIGN_16_ 16 - #define SIMDE_ALIGN_8_ 8 - #elif SIMDE_ALIGN_PLATFORM_MAXIMUM >= 16 - #define SIMDE_ALIGN_64_ 16 - #define SIMDE_ALIGN_32_ 16 - #define SIMDE_ALIGN_16_ 16 - #define SIMDE_ALIGN_8_ 8 - #elif SIMDE_ALIGN_PLATFORM_MAXIMUM >= 8 - #define SIMDE_ALIGN_64_ 8 - #define SIMDE_ALIGN_32_ 8 - #define SIMDE_ALIGN_16_ 8 - #define SIMDE_ALIGN_8_ 8 - #else - #error Max alignment expected to be >= 8 - #endif -#else - #define SIMDE_ALIGN_64_ 64 - #define SIMDE_ALIGN_32_ 32 - #define SIMDE_ALIGN_16_ 16 - #define SIMDE_ALIGN_8_ 8 -#endif - -/** - * SIMDE_ALIGN_CAP(Alignment) - * - * Returns the minimum of Alignment or SIMDE_ALIGN_MAXIMUM. - */ -#if defined(SIMDE_ALIGN_MAXIMUM) - #define SIMDE_ALIGN_CAP(Alignment) (((Alignment) < (SIMDE_ALIGN_PLATFORM_MAXIMUM)) ? (Alignment) : (SIMDE_ALIGN_PLATFORM_MAXIMUM)) -#else - #define SIMDE_ALIGN_CAP(Alignment) (Alignment) -#endif - -/* SIMDE_ALIGN_TO(Alignment) - * - * SIMDE_ALIGN_TO is used to declare types or variables. It basically - * maps to the align attribute in most compilers, the align declspec - * in MSVC, or _Alignas/alignas in C11/C++11. - * - * Example: - * - * struct i32x4 { - * SIMDE_ALIGN_TO(16) int32_t values[4]; - * } - * - * Limitations: - * - * MSVC requires that the Alignment parameter be numeric; you can't do - * something like `SIMDE_ALIGN_TO(SIMDE_ALIGN_OF(int))`. This is - * unfortunate because that's really how the LIKE macros are - * implemented, and I am not aware of a way to get anything like this - * to work without using the C11/C++11 keywords. - * - * It also means that we can't use SIMDE_ALIGN_CAP to limit the - * alignment to the value specified, which MSVC also requires, so on - * MSVC you should use the `SIMDE_ALIGN_TO_8/16/32/64` macros instead. 
- * They work like `SIMDE_ALIGN_TO(SIMDE_ALIGN_CAP(Alignment))` would, - * but should be safe to use on MSVC. - * - * All this is to say that, if you want your code to work on MSVC, you - * should use the SIMDE_ALIGN_TO_8/16/32/64 macros below instead of - * SIMDE_ALIGN_TO(8/16/32/64). - */ -#if \ - HEDLEY_HAS_ATTRIBUTE(aligned) || \ - HEDLEY_GCC_VERSION_CHECK(2,95,0) || \ - HEDLEY_CRAY_VERSION_CHECK(8,4,0) || \ - HEDLEY_IBM_VERSION_CHECK(11,1,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_PGI_VERSION_CHECK(19,4,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_TINYC_VERSION_CHECK(0,9,24) || \ - HEDLEY_TI_ARMCL_VERSION_CHECK(16,9,0) || \ - HEDLEY_TI_CL2000_VERSION_CHECK(16,9,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CL430_VERSION_CHECK(16,9,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,2) - #define SIMDE_ALIGN_TO(Alignment) __attribute__((__aligned__(SIMDE_ALIGN_CAP(Alignment)))) -#elif \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) - #define SIMDE_ALIGN_TO(Alignment) _Alignas(SIMDE_ALIGN_CAP(Alignment)) -#elif \ - (defined(__cplusplus) && (__cplusplus >= 201103L)) - #define SIMDE_ALIGN_TO(Alignment) alignas(SIMDE_ALIGN_CAP(Alignment)) -#elif \ - defined(HEDLEY_MSVC_VERSION) - #define SIMDE_ALIGN_TO(Alignment) __declspec(align(Alignment)) - /* Unfortunately MSVC can't handle __declspec(align(__alignof(Type))); - * the alignment passed to the declspec has to be an integer. */ - #define SIMDE_ALIGN_OF_UNUSABLE_FOR_LIKE -#endif -#define SIMDE_ALIGN_TO_64 SIMDE_ALIGN_TO(SIMDE_ALIGN_64_) -#define SIMDE_ALIGN_TO_32 SIMDE_ALIGN_TO(SIMDE_ALIGN_32_) -#define SIMDE_ALIGN_TO_16 SIMDE_ALIGN_TO(SIMDE_ALIGN_16_) -#define SIMDE_ALIGN_TO_8 SIMDE_ALIGN_TO(SIMDE_ALIGN_8_) - -/* SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment) - * - * SIMDE_ALIGN_ASSUME_TO is semantically similar to C++20's - * std::assume_aligned, or __builtin_assume_aligned. It tells the - * compiler to assume that the provided pointer is aligned to an - * `Alignment`-byte boundary. - * - * If you define SIMDE_ALIGN_DEBUG prior to including this header then - * SIMDE_ALIGN_ASSUME_TO will turn into a runtime check. 
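 *
 * [Editor's illustrative sketch -- not upstream text; the my_-prefixed names are invented.]
 * Typical combined use of the declaration and assumption macros:
 *
 *     typedef struct {
 *       SIMDE_ALIGN_TO_16 int32_t values[4];   // field aligned to (at most) 16 bytes
 *     } my_i32x4;
 *
 *     void my_fill(my_i32x4* v) {
 *       int32_t* p = SIMDE_ALIGN_ASSUME_TO(&v->values[0], 16);  // hint: 16-byte aligned
 *       for (int i = 0; i < 4; i++) p[i] = i;
 *     }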
We don't - * integrate with NDEBUG in this header, but it may be a good idea to - * put something like this in your code: - * - * #if !defined(NDEBUG) - * #define SIMDE_ALIGN_DEBUG - * #endif - * #include <.../simde-align.h> - */ -#if \ - HEDLEY_HAS_BUILTIN(__builtin_assume_aligned) || \ - HEDLEY_GCC_VERSION_CHECK(4,7,0) - #define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) \ - HEDLEY_REINTERPRET_CAST(__typeof__(Pointer), __builtin_assume_aligned(HEDLEY_CONST_CAST(void*, HEDLEY_REINTERPRET_CAST(const void*, Pointer)), Alignment)) -#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) - #define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) (__extension__ ({ \ - __typeof__(v) simde_assume_aligned_t_ = (Pointer); \ - __assume_aligned(simde_assume_aligned_t_, Alignment); \ - simde_assume_aligned_t_; \ - })) -#elif defined(__cplusplus) && (__cplusplus > 201703L) - #include - #define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) std::assume_aligned(Pointer) -#else - #if defined(__cplusplus) - template HEDLEY_ALWAYS_INLINE static T* simde_align_assume_to_unchecked(T* ptr, const size_t alignment) - #else - HEDLEY_ALWAYS_INLINE static void* simde_align_assume_to_unchecked(void* ptr, const size_t alignment) - #endif - { - HEDLEY_ASSUME((HEDLEY_REINTERPRET_CAST(size_t, (ptr)) % SIMDE_ALIGN_CAP(alignment)) == 0); - return ptr; - } - #if defined(__cplusplus) - #define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) simde_align_assume_to_unchecked((Pointer), (Alignment)) - #else - #define SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) simde_align_assume_to_unchecked(HEDLEY_CONST_CAST(void*, HEDLEY_REINTERPRET_CAST(const void*, Pointer)), (Alignment)) - #endif -#endif - -#if !defined(SIMDE_ALIGN_DEBUG) - #define SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment) SIMDE_ALIGN_ASSUME_TO_UNCHECKED(Pointer, Alignment) -#else - #include - #if defined(__cplusplus) - template - static HEDLEY_ALWAYS_INLINE - T* - simde_align_assume_to_checked_uncapped(T* ptr, const size_t alignment, const char* file, int line, const char* ptrname) - #else - static HEDLEY_ALWAYS_INLINE - void* - simde_align_assume_to_checked_uncapped(void* ptr, const size_t alignment, const char* file, int line, const char* ptrname) - #endif - { - if (HEDLEY_UNLIKELY((HEDLEY_REINTERPRET_CAST(SIMDE_ALIGN_INTPTR_T_, (ptr)) % HEDLEY_STATIC_CAST(SIMDE_ALIGN_INTPTR_T_, SIMDE_ALIGN_CAP(alignment))) != 0)) { - fprintf(stderr, "%s:%d: alignment check failed for `%s' (%p %% %u == %u)\n", - file, line, ptrname, HEDLEY_REINTERPRET_CAST(const void*, ptr), - HEDLEY_STATIC_CAST(unsigned int, SIMDE_ALIGN_CAP(alignment)), - HEDLEY_STATIC_CAST(unsigned int, HEDLEY_REINTERPRET_CAST(SIMDE_ALIGN_INTPTR_T_, (ptr)) % HEDLEY_STATIC_CAST(SIMDE_ALIGN_INTPTR_T_, SIMDE_ALIGN_CAP(alignment)))); - } - - return ptr; - } - - #if defined(__cplusplus) - #define SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment) simde_align_assume_to_checked_uncapped((Pointer), (Alignment), __FILE__, __LINE__, #Pointer) - #else - #define SIMDE_ALIGN_ASSUME_TO(Pointer, Alignment) simde_align_assume_to_checked_uncapped(HEDLEY_CONST_CAST(void*, HEDLEY_REINTERPRET_CAST(const void*, Pointer)), (Alignment), __FILE__, __LINE__, #Pointer) - #endif -#endif - -/* SIMDE_ALIGN_LIKE(Type) - * SIMDE_ALIGN_LIKE_#(Type) - * - * The SIMDE_ALIGN_LIKE macros are similar to the SIMDE_ALIGN_TO macros - * except instead of an integer they take a type; basically, it's just - * a more convenient way to do something like: - * - * SIMDE_ALIGN_TO(SIMDE_ALIGN_OF(Type)) - * - * The versions with a numeric suffix will fall back 
on using a numeric - * value in the event we can't use SIMDE_ALIGN_OF(Type). This is - * mainly for MSVC, where __declspec(align()) can't handle anything - * other than hard-coded numeric values. - */ -#if defined(SIMDE_ALIGN_OF) && defined(SIMDE_ALIGN_TO) && !defined(SIMDE_ALIGN_OF_UNUSABLE_FOR_LIKE) - #define SIMDE_ALIGN_LIKE(Type) SIMDE_ALIGN_TO(SIMDE_ALIGN_OF(Type)) - #define SIMDE_ALIGN_LIKE_64(Type) SIMDE_ALIGN_LIKE(Type) - #define SIMDE_ALIGN_LIKE_32(Type) SIMDE_ALIGN_LIKE(Type) - #define SIMDE_ALIGN_LIKE_16(Type) SIMDE_ALIGN_LIKE(Type) - #define SIMDE_ALIGN_LIKE_8(Type) SIMDE_ALIGN_LIKE(Type) -#else - #define SIMDE_ALIGN_LIKE_64(Type) SIMDE_ALIGN_TO_64 - #define SIMDE_ALIGN_LIKE_32(Type) SIMDE_ALIGN_TO_32 - #define SIMDE_ALIGN_LIKE_16(Type) SIMDE_ALIGN_TO_16 - #define SIMDE_ALIGN_LIKE_8(Type) SIMDE_ALIGN_TO_8 -#endif - -/* SIMDE_ALIGN_ASSUME_LIKE(Pointer, Type) - * - * Tihs is similar to SIMDE_ALIGN_ASSUME_TO, except that it takes a - * type instead of a numeric value. */ -#if defined(SIMDE_ALIGN_OF) && defined(SIMDE_ALIGN_ASSUME_TO) - #define SIMDE_ALIGN_ASSUME_LIKE(Pointer, Type) SIMDE_ALIGN_ASSUME_TO(Pointer, SIMDE_ALIGN_OF(Type)) -#endif - -/* SIMDE_ALIGN_CAST(Type, Pointer) - * - * SIMDE_ALIGN_CAST is like C++'s reinterpret_cast, but it will try - * to silence warnings that some compilers may produce if you try - * to assign to a type with increased alignment requirements. - * - * Note that it does *not* actually attempt to tell the compiler that - * the pointer is aligned like the destination should be; that's the - * job of the next macro. This macro is necessary for stupid APIs - * like _mm_loadu_si128 where the input is a __m128i* but the function - * is specifically for data which isn't necessarily aligned to - * _Alignof(__m128i). - */ -#if HEDLEY_HAS_WARNING("-Wcast-align") || defined(__clang__) || HEDLEY_GCC_VERSION_CHECK(3,4,0) - #define SIMDE_ALIGN_CAST(Type, Pointer) (__extension__({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("GCC diagnostic ignored \"-Wcast-align\"") \ - Type simde_r_ = HEDLEY_REINTERPRET_CAST(Type, Pointer); \ - HEDLEY_DIAGNOSTIC_POP \ - simde_r_; \ - })) -#else - #define SIMDE_ALIGN_CAST(Type, Pointer) HEDLEY_REINTERPRET_CAST(Type, Pointer) -#endif - -/* SIMDE_ALIGN_ASSUME_CAST(Type, Pointer) - * - * This is sort of like a combination of a reinterpret_cast and a - * SIMDE_ALIGN_ASSUME_LIKE. It uses SIMDE_ALIGN_ASSUME_LIKE to tell - * the compiler that the pointer is aligned like the specified type - * and casts the pointer to the specified type while suppressing any - * warnings from the compiler about casting to a type with greater - * alignment requirements. - */ -#define SIMDE_ALIGN_ASSUME_CAST(Type, Pointer) SIMDE_ALIGN_ASSUME_LIKE(SIMDE_ALIGN_CAST(Type, Pointer), Type) - -#endif /* !defined(SIMDE_ALIGN_H) */ diff --git a/ffi-deps/simde/simde/simde-arch.h b/ffi-deps/simde/simde/simde-arch.h deleted file mode 100644 index a492d7e..0000000 --- a/ffi-deps/simde/simde/simde-arch.h +++ /dev/null @@ -1,622 +0,0 @@ -/* Architecture detection - * Created by Evan Nemerson - * - * To the extent possible under law, the authors have waived all - * copyright and related or neighboring rights to this code. For - * details, see the Creative Commons Zero 1.0 Universal license at - * - * - * SPDX-License-Identifier: CC0-1.0 - * - * Different compilers define different preprocessor macros for the - * same architecture. This is an attempt to provide a single - * interface which is usable on any compiler. 
- * - * In general, a macro named SIMDE_ARCH_* is defined for each - * architecture the CPU supports. When there are multiple possible - * versions, we try to define the macro to the target version. For - * example, if you want to check for i586+, you could do something - * like: - * - * #if defined(SIMDE_ARCH_X86) && (SIMDE_ARCH_X86 >= 5) - * ... - * #endif - * - * You could also just check that SIMDE_ARCH_X86 >= 5 without checking - * if it's defined first, but some compilers may emit a warning about - * an undefined macro being used (e.g., GCC with -Wundef). - * - * This was originally created for SIMDe - * (hence the prefix), but this - * header has no dependencies and may be used anywhere. It is - * originally based on information from - * , though it - * has been enhanced with additional information. - * - * If you improve this file, or find a bug, please file the issue at - * . If you copy this into - * your project, even if you change the prefix, please keep the links - * to SIMDe intact so others know where to report issues, submit - * enhancements, and find the latest version. */ - -#if !defined(SIMDE_ARCH_H) -#define SIMDE_ARCH_H - -#include "hedley.h" - -/* Alpha - */ -#if defined(__alpha__) || defined(__alpha) || defined(_M_ALPHA) -# if defined(__alpha_ev6__) -# define SIMDE_ARCH_ALPHA 6 -# elif defined(__alpha_ev5__) -# define SIMDE_ARCH_ALPHA 5 -# elif defined(__alpha_ev4__) -# define SIMDE_ARCH_ALPHA 4 -# else -# define SIMDE_ARCH_ALPHA 1 -# endif -#endif -#if defined(SIMDE_ARCH_ALPHA) -# define SIMDE_ARCH_ALPHA_CHECK(version) ((version) <= SIMDE_ARCH_ALPHA) -#else -# define SIMDE_ARCH_ALPHA_CHECK(version) (0) -#endif - -/* Atmel AVR - */ -#if defined(__AVR_ARCH__) -# define SIMDE_ARCH_AVR __AVR_ARCH__ -#endif - -/* AMD64 / x86_64 - */ -#if defined(__amd64__) || defined(__amd64) || defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) -# if !defined(_M_ARM64EC) -# define SIMDE_ARCH_AMD64 1000 -# endif -#endif - -/* ARM - */ -#if defined(__ARM_ARCH) -# if __ARM_ARCH > 100 -# define SIMDE_ARCH_ARM (__ARM_ARCH) -# else -# define SIMDE_ARCH_ARM (__ARM_ARCH * 100) -# endif -#elif defined(_M_ARM) -# if _M_ARM > 100 -# define SIMDE_ARCH_ARM (_M_ARM) -# else -# define SIMDE_ARCH_ARM (_M_ARM * 100) -# endif -#elif defined(_M_ARM64) || defined(_M_ARM64EC) -# define SIMDE_ARCH_ARM 800 -#elif defined(__arm__) || defined(__thumb__) || defined(__TARGET_ARCH_ARM) || defined(_ARM) || defined(_M_ARM) || defined(_M_ARM) -# define SIMDE_ARCH_ARM 1 -#endif -#if defined(SIMDE_ARCH_ARM) -# define SIMDE_ARCH_ARM_CHECK(major, minor) (((major * 100) + (minor)) <= SIMDE_ARCH_ARM) -#else -# define SIMDE_ARCH_ARM_CHECK(major, minor) (0) -#endif - -/* AArch64 - */ -#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) -# define SIMDE_ARCH_AARCH64 1000 -#endif -#if defined(SIMDE_ARCH_AARCH64) -# define SIMDE_ARCH_AARCH64_CHECK(version) ((version) <= SIMDE_ARCH_AARCH64) -#else -# define SIMDE_ARCH_AARCH64_CHECK(version) (0) -#endif - -/* ARM SIMD ISA extensions */ -#if defined(__ARM_NEON) || defined(SIMDE_ARCH_AARCH64) -# if defined(SIMDE_ARCH_AARCH64) -# define SIMDE_ARCH_ARM_NEON SIMDE_ARCH_AARCH64 -# elif defined(SIMDE_ARCH_ARM) -# define SIMDE_ARCH_ARM_NEON SIMDE_ARCH_ARM -# endif -#endif -#if defined(__ARM_FEATURE_SVE) -# define SIMDE_ARCH_ARM_SVE -#endif -#if defined(__ARM_FEATURE_FMA) && __ARM_FEATURE_FMA -# define SIMDE_ARCH_ARM_FMA -#endif -#if defined(__ARM_FEATURE_CRYPTO) -# define SIMDE_ARCH_ARM_CRYPTO -#endif -#if defined(__ARM_FEATURE_QRDMX) -# 
define SIMDE_ARCH_ARM_QRDMX -#endif - -/* Blackfin - */ -#if defined(__bfin) || defined(__BFIN__) || defined(__bfin__) -# define SIMDE_ARCH_BLACKFIN 1 -#endif - -/* CRIS - */ -#if defined(__CRIS_arch_version) -# define SIMDE_ARCH_CRIS __CRIS_arch_version -#elif defined(__cris__) || defined(__cris) || defined(__CRIS) || defined(__CRIS__) -# define SIMDE_ARCH_CRIS 1 -#endif - -/* Convex - */ -#if defined(__convex_c38__) -# define SIMDE_ARCH_CONVEX 38 -#elif defined(__convex_c34__) -# define SIMDE_ARCH_CONVEX 34 -#elif defined(__convex_c32__) -# define SIMDE_ARCH_CONVEX 32 -#elif defined(__convex_c2__) -# define SIMDE_ARCH_CONVEX 2 -#elif defined(__convex__) -# define SIMDE_ARCH_CONVEX 1 -#endif -#if defined(SIMDE_ARCH_CONVEX) -# define SIMDE_ARCH_CONVEX_CHECK(version) ((version) <= SIMDE_ARCH_CONVEX) -#else -# define SIMDE_ARCH_CONVEX_CHECK(version) (0) -#endif - -/* Adapteva Epiphany - */ -#if defined(__epiphany__) -# define SIMDE_ARCH_EPIPHANY 1 -#endif - -/* Fujitsu FR-V - */ -#if defined(__frv__) -# define SIMDE_ARCH_FRV 1 -#endif - -/* H8/300 - */ -#if defined(__H8300__) -# define SIMDE_ARCH_H8300 -#endif - -/* Elbrus (8S, 8SV and successors) - */ -#if defined(__e2k__) -# define SIMDE_ARCH_E2K -#endif - -/* HP/PA / PA-RISC - */ -#if defined(__PA8000__) || defined(__HPPA20__) || defined(__RISC2_0__) || defined(_PA_RISC2_0) -# define SIMDE_ARCH_HPPA 20 -#elif defined(__PA7100__) || defined(__HPPA11__) || defined(_PA_RISC1_1) -# define SIMDE_ARCH_HPPA 11 -#elif defined(_PA_RISC1_0) -# define SIMDE_ARCH_HPPA 10 -#elif defined(__hppa__) || defined(__HPPA__) || defined(__hppa) -# define SIMDE_ARCH_HPPA 1 -#endif -#if defined(SIMDE_ARCH_HPPA) -# define SIMDE_ARCH_HPPA_CHECK(version) ((version) <= SIMDE_ARCH_HPPA) -#else -# define SIMDE_ARCH_HPPA_CHECK(version) (0) -#endif - -/* x86 - */ -#if defined(_M_IX86) -# define SIMDE_ARCH_X86 (_M_IX86 / 100) -#elif defined(__I86__) -# define SIMDE_ARCH_X86 __I86__ -#elif defined(i686) || defined(__i686) || defined(__i686__) -# define SIMDE_ARCH_X86 6 -#elif defined(i586) || defined(__i586) || defined(__i586__) -# define SIMDE_ARCH_X86 5 -#elif defined(i486) || defined(__i486) || defined(__i486__) -# define SIMDE_ARCH_X86 4 -#elif defined(i386) || defined(__i386) || defined(__i386__) -# define SIMDE_ARCH_X86 3 -#elif defined(_X86_) || defined(__X86__) || defined(__THW_INTEL__) -# define SIMDE_ARCH_X86 3 -#endif -#if defined(SIMDE_ARCH_X86) -# define SIMDE_ARCH_X86_CHECK(version) ((version) <= SIMDE_ARCH_X86) -#else -# define SIMDE_ARCH_X86_CHECK(version) (0) -#endif - -/* SIMD ISA extensions for x86/x86_64 and Elbrus */ -#if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64) || defined(SIMDE_ARCH_E2K) -# if defined(_M_IX86_FP) -# define SIMDE_ARCH_X86_MMX -# if (_M_IX86_FP >= 1) -# define SIMDE_ARCH_X86_SSE 1 -# endif -# if (_M_IX86_FP >= 2) -# define SIMDE_ARCH_X86_SSE2 1 -# endif -# elif defined(_M_X64) -# define SIMDE_ARCH_X86_SSE 1 -# define SIMDE_ARCH_X86_SSE2 1 -# else -# if defined(__MMX__) -# define SIMDE_ARCH_X86_MMX 1 -# endif -# if defined(__SSE__) -# define SIMDE_ARCH_X86_SSE 1 -# endif -# if defined(__SSE2__) -# define SIMDE_ARCH_X86_SSE2 1 -# endif -# endif -# if defined(__SSE3__) -# define SIMDE_ARCH_X86_SSE3 1 -# endif -# if defined(__SSSE3__) -# define SIMDE_ARCH_X86_SSSE3 1 -# endif -# if defined(__SSE4_1__) -# define SIMDE_ARCH_X86_SSE4_1 1 -# endif -# if defined(__SSE4_2__) -# define SIMDE_ARCH_X86_SSE4_2 1 -# endif -# if defined(__XOP__) -# define SIMDE_ARCH_X86_XOP 1 -# endif -# if defined(__AVX__) -# define SIMDE_ARCH_X86_AVX 1 
-# if !defined(SIMDE_ARCH_X86_SSE3) -# define SIMDE_ARCH_X86_SSE3 1 -# endif -# if !defined(SIMDE_ARCH_X86_SSE4_1) -# define SIMDE_ARCH_X86_SSE4_1 1 -# endif -# if !defined(SIMDE_ARCH_X86_SSE4_2) -# define SIMDE_ARCH_X86_SSE4_2 1 -# endif -# endif -# if defined(__AVX2__) -# define SIMDE_ARCH_X86_AVX2 1 -# if defined(_MSC_VER) -# define SIMDE_ARCH_X86_FMA 1 -# endif -# endif -# if defined(__FMA__) -# define SIMDE_ARCH_X86_FMA 1 -# if !defined(SIMDE_ARCH_X86_AVX) -# define SIMDE_ARCH_X86_AVX 1 -# endif -# endif -# if defined(__AVX512VP2INTERSECT__) -# define SIMDE_ARCH_X86_AVX512VP2INTERSECT 1 -# endif -# if defined(__AVX512BITALG__) -# define SIMDE_ARCH_X86_AVX512BITALG 1 -# endif -# if defined(__AVX512VPOPCNTDQ__) -# define SIMDE_ARCH_X86_AVX512VPOPCNTDQ 1 -# endif -# if defined(__AVX512VBMI__) -# define SIMDE_ARCH_X86_AVX512VBMI 1 -# endif -# if defined(__AVX512VBMI2__) -# define SIMDE_ARCH_X86_AVX512VBMI2 1 -# endif -# if defined(__AVX512VNNI__) -# define SIMDE_ARCH_X86_AVX512VNNI 1 -# endif -# if defined(__AVX5124VNNIW__) -# define SIMDE_ARCH_X86_AVX5124VNNIW 1 -# endif -# if defined(__AVX512BW__) -# define SIMDE_ARCH_X86_AVX512BW 1 -# endif -# if defined(__AVX512BF16__) -# define SIMDE_ARCH_X86_AVX512BF16 1 -# endif -# if defined(__AVX512CD__) -# define SIMDE_ARCH_X86_AVX512CD 1 -# endif -# if defined(__AVX512DQ__) -# define SIMDE_ARCH_X86_AVX512DQ 1 -# endif -# if defined(__AVX512F__) -# define SIMDE_ARCH_X86_AVX512F 1 -# endif -# if defined(__AVX512VL__) -# define SIMDE_ARCH_X86_AVX512VL 1 -# endif -# if defined(__AVX512FP16__) -# define SIMDE_ARCH_X86_AVX512FP16 1 -# endif -# if defined(__GFNI__) -# define SIMDE_ARCH_X86_GFNI 1 -# endif -# if defined(__PCLMUL__) -# define SIMDE_ARCH_X86_PCLMUL 1 -# endif -# if defined(__VPCLMULQDQ__) -# define SIMDE_ARCH_X86_VPCLMULQDQ 1 -# endif -# if defined(__F16C__) || (defined(HEDLEY_MSVC_VERSION) && HEDLEY_MSVC_VERSION_CHECK(19,30,0) && defined(SIMDE_ARCH_X86_AVX2) ) -# define SIMDE_ARCH_X86_F16C 1 -# endif -# if defined(__AES__) -# define SIMDE_ARCH_X86_AES 1 -# endif -#endif - -/* Itanium - */ -#if defined(__ia64__) || defined(_IA64) || defined(__IA64__) || defined(__ia64) || defined(_M_IA64) || defined(__itanium__) -# define SIMDE_ARCH_IA64 1 -#endif - -/* Renesas M32R - */ -#if defined(__m32r__) || defined(__M32R__) -# define SIMDE_ARCH_M32R -#endif - -/* Motorola 68000 - */ -#if defined(__mc68060__) || defined(__MC68060__) -# define SIMDE_ARCH_M68K 68060 -#elif defined(__mc68040__) || defined(__MC68040__) -# define SIMDE_ARCH_M68K 68040 -#elif defined(__mc68030__) || defined(__MC68030__) -# define SIMDE_ARCH_M68K 68030 -#elif defined(__mc68020__) || defined(__MC68020__) -# define SIMDE_ARCH_M68K 68020 -#elif defined(__mc68010__) || defined(__MC68010__) -# define SIMDE_ARCH_M68K 68010 -#elif defined(__mc68000__) || defined(__MC68000__) -# define SIMDE_ARCH_M68K 68000 -#endif -#if defined(SIMDE_ARCH_M68K) -# define SIMDE_ARCH_M68K_CHECK(version) ((version) <= SIMDE_ARCH_M68K) -#else -# define SIMDE_ARCH_M68K_CHECK(version) (0) -#endif - -/* Xilinx MicroBlaze - */ -#if defined(__MICROBLAZE__) || defined(__microblaze__) -# define SIMDE_ARCH_MICROBLAZE -#endif - -/* MIPS - */ -#if defined(_MIPS_ISA_MIPS64R2) -# define SIMDE_ARCH_MIPS 642 -#elif defined(_MIPS_ISA_MIPS64) -# define SIMDE_ARCH_MIPS 640 -#elif defined(_MIPS_ISA_MIPS32R2) -# define SIMDE_ARCH_MIPS 322 -#elif defined(_MIPS_ISA_MIPS32) -# define SIMDE_ARCH_MIPS 320 -#elif defined(_MIPS_ISA_MIPS4) -# define SIMDE_ARCH_MIPS 4 -#elif defined(_MIPS_ISA_MIPS3) -# define SIMDE_ARCH_MIPS 3 
-#elif defined(_MIPS_ISA_MIPS2) -# define SIMDE_ARCH_MIPS 2 -#elif defined(_MIPS_ISA_MIPS1) -# define SIMDE_ARCH_MIPS 1 -#elif defined(_MIPS_ISA_MIPS) || defined(__mips) || defined(__MIPS__) -# define SIMDE_ARCH_MIPS 1 -#endif -#if defined(SIMDE_ARCH_MIPS) -# define SIMDE_ARCH_MIPS_CHECK(version) ((version) <= SIMDE_ARCH_MIPS) -#else -# define SIMDE_ARCH_MIPS_CHECK(version) (0) -#endif - -#if defined(__mips_loongson_mmi) -# define SIMDE_ARCH_MIPS_LOONGSON_MMI 1 -#endif - -#if defined(__mips_msa) -# define SIMDE_ARCH_MIPS_MSA 1 -#endif - -/* Matsushita MN10300 - */ -#if defined(__MN10300__) || defined(__mn10300__) -# define SIMDE_ARCH_MN10300 1 -#endif - -/* POWER - */ -#if defined(_M_PPC) -# define SIMDE_ARCH_POWER _M_PPC -#elif defined(_ARCH_PWR9) -# define SIMDE_ARCH_POWER 900 -#elif defined(_ARCH_PWR8) -# define SIMDE_ARCH_POWER 800 -#elif defined(_ARCH_PWR7) -# define SIMDE_ARCH_POWER 700 -#elif defined(_ARCH_PWR6) -# define SIMDE_ARCH_POWER 600 -#elif defined(_ARCH_PWR5) -# define SIMDE_ARCH_POWER 500 -#elif defined(_ARCH_PWR4) -# define SIMDE_ARCH_POWER 400 -#elif defined(_ARCH_440) || defined(__ppc440__) -# define SIMDE_ARCH_POWER 440 -#elif defined(_ARCH_450) || defined(__ppc450__) -# define SIMDE_ARCH_POWER 450 -#elif defined(_ARCH_601) || defined(__ppc601__) -# define SIMDE_ARCH_POWER 601 -#elif defined(_ARCH_603) || defined(__ppc603__) -# define SIMDE_ARCH_POWER 603 -#elif defined(_ARCH_604) || defined(__ppc604__) -# define SIMDE_ARCH_POWER 604 -#elif defined(_ARCH_605) || defined(__ppc605__) -# define SIMDE_ARCH_POWER 605 -#elif defined(_ARCH_620) || defined(__ppc620__) -# define SIMDE_ARCH_POWER 620 -#elif defined(__powerpc) || defined(__powerpc__) || defined(__POWERPC__) || defined(__ppc__) || defined(__PPC__) || defined(_ARCH_PPC) || defined(__ppc) -# define SIMDE_ARCH_POWER 1 -#endif -#if defined(SIMDE_ARCH_POWER) - #define SIMDE_ARCH_POWER_CHECK(version) ((version) <= SIMDE_ARCH_POWER) -#else - #define SIMDE_ARCH_POWER_CHECK(version) (0) -#endif - -#if defined(__ALTIVEC__) -# define SIMDE_ARCH_POWER_ALTIVEC SIMDE_ARCH_POWER - #define SIMDE_ARCH_POWER_ALTIVEC_CHECK(version) ((version) <= SIMDE_ARCH_POWER) -#else - #define SIMDE_ARCH_POWER_ALTIVEC_CHECK(version) (0) -#endif - -#if defined(__riscv) && __riscv_xlen==64 -# define SIMDE_ARCH_RISCV64 -#endif - -/* SPARC - */ -#if defined(__sparc_v9__) || defined(__sparcv9) -# define SIMDE_ARCH_SPARC 9 -#elif defined(__sparc_v8__) || defined(__sparcv8) -# define SIMDE_ARCH_SPARC 8 -#elif defined(__sparc_v7__) || defined(__sparcv7) -# define SIMDE_ARCH_SPARC 7 -#elif defined(__sparc_v6__) || defined(__sparcv6) -# define SIMDE_ARCH_SPARC 6 -#elif defined(__sparc_v5__) || defined(__sparcv5) -# define SIMDE_ARCH_SPARC 5 -#elif defined(__sparc_v4__) || defined(__sparcv4) -# define SIMDE_ARCH_SPARC 4 -#elif defined(__sparc_v3__) || defined(__sparcv3) -# define SIMDE_ARCH_SPARC 3 -#elif defined(__sparc_v2__) || defined(__sparcv2) -# define SIMDE_ARCH_SPARC 2 -#elif defined(__sparc_v1__) || defined(__sparcv1) -# define SIMDE_ARCH_SPARC 1 -#elif defined(__sparc__) || defined(__sparc) -# define SIMDE_ARCH_SPARC 1 -#endif -#if defined(SIMDE_ARCH_SPARC) - #define SIMDE_ARCH_SPARC_CHECK(version) ((version) <= SIMDE_ARCH_SPARC) -#else - #define SIMDE_ARCH_SPARC_CHECK(version) (0) -#endif - -/* SuperH - */ -#if defined(__sh5__) || defined(__SH5__) -# define SIMDE_ARCH_SUPERH 5 -#elif defined(__sh4__) || defined(__SH4__) -# define SIMDE_ARCH_SUPERH 4 -#elif defined(__sh3__) || defined(__SH3__) -# define SIMDE_ARCH_SUPERH 3 -#elif defined(__sh2__) 
|| defined(__SH2__) -# define SIMDE_ARCH_SUPERH 2 -#elif defined(__sh1__) || defined(__SH1__) -# define SIMDE_ARCH_SUPERH 1 -#elif defined(__sh__) || defined(__SH__) -# define SIMDE_ARCH_SUPERH 1 -#endif - -/* IBM System z - */ -#if defined(__370__) || defined(__THW_370__) || defined(__s390__) || defined(__s390x__) || defined(__zarch__) || defined(__SYSC_ZARCH__) -# define SIMDE_ARCH_ZARCH __ARCH__ -#endif -#if defined(SIMDE_ARCH_ZARCH) - #define SIMDE_ARCH_ZARCH_CHECK(version) ((version) <= SIMDE_ARCH_ZARCH) -#else - #define SIMDE_ARCH_ZARCH_CHECK(version) (0) -#endif - -#if defined(SIMDE_ARCH_ZARCH) && defined(__VEC__) - #define SIMDE_ARCH_ZARCH_ZVECTOR SIMDE_ARCH_ZARCH -#endif - -/* TMS320 DSP - */ -#if defined(_TMS320C6740) || defined(__TMS320C6740__) -# define SIMDE_ARCH_TMS320 6740 -#elif defined(_TMS320C6700_PLUS) || defined(__TMS320C6700_PLUS__) -# define SIMDE_ARCH_TMS320 6701 -#elif defined(_TMS320C6700) || defined(__TMS320C6700__) -# define SIMDE_ARCH_TMS320 6700 -#elif defined(_TMS320C6600) || defined(__TMS320C6600__) -# define SIMDE_ARCH_TMS320 6600 -#elif defined(_TMS320C6400_PLUS) || defined(__TMS320C6400_PLUS__) -# define SIMDE_ARCH_TMS320 6401 -#elif defined(_TMS320C6400) || defined(__TMS320C6400__) -# define SIMDE_ARCH_TMS320 6400 -#elif defined(_TMS320C6200) || defined(__TMS320C6200__) -# define SIMDE_ARCH_TMS320 6200 -#elif defined(_TMS320C55X) || defined(__TMS320C55X__) -# define SIMDE_ARCH_TMS320 550 -#elif defined(_TMS320C54X) || defined(__TMS320C54X__) -# define SIMDE_ARCH_TMS320 540 -#elif defined(_TMS320C28X) || defined(__TMS320C28X__) -# define SIMDE_ARCH_TMS320 280 -#endif -#if defined(SIMDE_ARCH_TMS320) - #define SIMDE_ARCH_TMS320_CHECK(version) ((version) <= SIMDE_ARCH_TMS320) -#else - #define SIMDE_ARCH_TMS320_CHECK(version) (0) -#endif - -/* WebAssembly */ -#if defined(__wasm__) -# define SIMDE_ARCH_WASM 1 -#endif - -#if defined(SIMDE_ARCH_WASM) && defined(__wasm_simd128__) -# define SIMDE_ARCH_WASM_SIMD128 -#endif - -#if defined(SIMDE_ARCH_WASM) && defined(__wasm_relaxed_simd__) -# define SIMDE_ARCH_WASM_RELAXED_SIMD -#endif - -/* Xtensa - */ -#if defined(__xtensa__) || defined(__XTENSA__) -# define SIMDE_ARCH_XTENSA 1 -#endif - -/* Availability of 16-bit floating-point arithmetic intrinsics */ -#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) -# define SIMDE_ARCH_ARM_NEON_FP16 -#endif - -/* Availability of 16-bit brain floating-point arithmetic intrinsics */ -#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) -# define SIMDE_ARCH_ARM_NEON_BF16 -#endif - -/* LoongArch - */ -#if defined(__loongarch32) -# define SIMDE_ARCH_LOONGARCH 1 -#elif defined(__loongarch64) -# define SIMDE_ARCH_LOONGARCH 2 -#endif - -/* LSX: LoongArch 128-bits SIMD extension */ -#if defined(__loongarch_sx) -# define SIMDE_ARCH_LOONGARCH_LSX 1 -#endif - -/* LASX: LoongArch 256-bits SIMD extension */ -#if defined(__loongarch_asx) -# define SIMDE_ARCH_LOONGARCH_LASX 2 -#endif - -#endif /* !defined(SIMDE_ARCH_H) */ diff --git a/ffi-deps/simde/simde/simde-bf16.h b/ffi-deps/simde/simde/simde-bf16.h deleted file mode 100644 index 7e07368..0000000 --- a/ffi-deps/simde/simde/simde-bf16.h +++ /dev/null @@ -1,131 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the 
Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) - */ - -#include "hedley.h" -#include "simde-common.h" -#include "simde-detect-clang.h" - -#if !defined(SIMDE_BFLOAT16_H) -#define SIMDE_BFLOAT16_H - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -/* This implementations is based upon simde-f16.h */ - -/* Portable version which should work on pretty much any compiler. - * Obviously you can't rely on compiler support for things like - * conversion to/from 32-bit floats, so make sure you always use the - * functions and macros in this file! - */ -#define SIMDE_BFLOAT16_API_PORTABLE 1 - -#define SIMDE_BFLOAT16_API_BF16 2 - -#if !defined(SIMDE_BFLOAT16_API) - #if defined(SIMDE_ARM_NEON_BF16) - #define SIMDE_BFLOAT16_API SIMDE_BFLOAT16_API_BF16 - #else - #define SIMDE_BFLOAT16_API SIMDE_BFLOAT16_API_PORTABLE - #endif -#endif - -#if SIMDE_BFLOAT16_API == SIMDE_BFLOAT16_API_BF16 - #include - typedef __bf16 simde_bfloat16; -#elif SIMDE_BFLOAT16_API == SIMDE_BFLOAT16_API_PORTABLE - typedef struct { uint16_t value; } simde_bfloat16; -#else - #error No 16-bit floating point API. -#endif - -/* Conversion -- convert between single-precision and brain half-precision - * floats. */ -static HEDLEY_ALWAYS_INLINE HEDLEY_CONST -simde_bfloat16 -simde_bfloat16_from_float32 (simde_float32 value) { -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) - return vcvth_bf16_f32(value); -#else - simde_bfloat16 res; - char* src = HEDLEY_REINTERPRET_CAST(char*, &value); - // rounding to nearest bfloat16 - // If the 17th bit of value is 1, set the rounding to 1. 
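    // [Editor's illustrative note -- not upstream code.] A bfloat16 is simply the top 16
    // bits of the IEEE-754 float32 representation, so this conversion keeps the upper half
    // and rounds on bit 15 of the discarded half. For example, 1.00390625f = 0x3F808000
    // has that bit set and rounds up to bf16 0x3F81, while 0x3F807FFF truncates to 0x3F80.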
- uint8_t rounding = 0; - - #if SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE - if (src[1] & UINT8_C(0x80)) rounding = 1; - src[2] = HEDLEY_STATIC_CAST(char, (HEDLEY_STATIC_CAST(uint8_t, src[2]) + rounding)); - simde_memcpy(&res, src+2, sizeof(res)); - #else - if (src[2] & UINT8_C(0x80)) rounding = 1; - src[1] = HEDLEY_STATIC_CAST(char, (HEDLEY_STATIC_CAST(uint8_t, src[1]) + rounding)); - simde_memcpy(&res, src, sizeof(res)); - #endif - - return res; -#endif -} - -static HEDLEY_ALWAYS_INLINE HEDLEY_CONST -simde_float32 -simde_bfloat16_to_float32 (simde_bfloat16 value) { -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARM_NEON_BF16) - return vcvtah_f32_bf16(value); -#else - simde_float32 res = 0.0; - char* _res = HEDLEY_REINTERPRET_CAST(char*, &res); - - #if SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE - simde_memcpy(_res+2, &value, sizeof(value)); - #else - simde_memcpy(_res, &value, sizeof(value)); - #endif - - return res; -#endif -} - -SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_uint16_as_bfloat16, simde_bfloat16, uint16_t) - -#define SIMDE_NANBF simde_uint16_as_bfloat16(0xFFC1) // a quiet Not-a-Number -#define SIMDE_INFINITYBF simde_uint16_as_bfloat16(0x7F80) -#define SIMDE_NINFINITYBF simde_uint16_as_bfloat16(0xFF80) - -#define SIMDE_BFLOAT16_VALUE(value) simde_bfloat16_from_float32(SIMDE_FLOAT32_C(value)) - -#if !defined(simde_isinfbf) && defined(simde_math_isinff) - #define simde_isinfbf(a) simde_math_isinff(simde_bfloat16_to_float32(a)) -#endif -#if !defined(simde_isnanbf) && defined(simde_math_isnanf) - #define simde_isnanbf(a) simde_math_isnanf(simde_bfloat16_to_float32(a)) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_BFLOAT16_H) */ diff --git a/ffi-deps/simde/simde/simde-common.h b/ffi-deps/simde/simde/simde-common.h deleted file mode 100644 index 3734a7e..0000000 --- a/ffi-deps/simde/simde/simde-common.h +++ /dev/null @@ -1,1192 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2017-2020 Evan Nemerson - * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) - */ - -#if !defined(SIMDE_COMMON_H) -#define SIMDE_COMMON_H - -#include "hedley.h" - -#define SIMDE_VERSION_MAJOR 0 -#define SIMDE_VERSION_MINOR 8 -#define SIMDE_VERSION_MICRO 0 -#define SIMDE_VERSION HEDLEY_VERSION_ENCODE(SIMDE_VERSION_MAJOR, SIMDE_VERSION_MINOR, SIMDE_VERSION_MICRO) -// Also update meson.build in the root directory of the repository - -#include -#include - -#include "simde-detect-clang.h" -#include "simde-arch.h" -#include "simde-features.h" -#include "simde-diagnostic.h" -#include "simde-math.h" -#include "simde-constify.h" -#include "simde-align.h" - -/* In some situations, SIMDe has to make large performance sacrifices - * for small increases in how faithfully it reproduces an API, but - * only a relatively small number of users will actually need the API - * to be completely accurate. The SIMDE_FAST_* options can be used to - * disable these trade-offs. - * - * They can be enabled by passing -DSIMDE_FAST_MATH to the compiler, or - * the individual defines (e.g., -DSIMDE_FAST_NANS) if you only want to - * enable some optimizations. Using -ffast-math and/or - * -ffinite-math-only will also enable the relevant options. If you - * don't want that you can pass -DSIMDE_NO_FAST_* to disable them. */ - -/* Most programs avoid NaNs by never passing values which can result in - * a NaN; for example, if you only pass non-negative values to the sqrt - * functions, it won't generate a NaN. On some platforms, similar - * functions handle NaNs differently; for example, the _mm_min_ps SSE - * function will return 0.0 if you pass it (0.0, NaN), but the NEON - * vminq_f32 function will return NaN. Making them behave like one - * another is expensive; it requires generating a mask of all lanes - * with NaNs, then performing the operation (e.g., vminq_f32), then - * blending together the result with another vector using the mask. - * - * If you don't want SIMDe to worry about the differences between how - * NaNs are handled on the two platforms, define this (or pass - * -ffinite-math-only) */ -#if !defined(SIMDE_FAST_MATH) && !defined(SIMDE_NO_FAST_MATH) && defined(__FAST_MATH__) - #define SIMDE_FAST_MATH -#endif - -#if !defined(SIMDE_FAST_NANS) && !defined(SIMDE_NO_FAST_NANS) - #if defined(SIMDE_FAST_MATH) - #define SIMDE_FAST_NANS - #elif defined(__FINITE_MATH_ONLY__) - #if __FINITE_MATH_ONLY__ - #define SIMDE_FAST_NANS - #endif - #endif -#endif - -/* Many functions are defined as using the current rounding mode - * (i.e., the SIMD version of fegetround()) when converting to - * an integer. For example, _mm_cvtpd_epi32. Unfortunately, - * on some platforms (such as ARMv8+ where round-to-nearest is - * always used, regardless of the FPSCR register) this means we - * have to first query the current rounding mode, then choose - * the proper function (rounnd - , ceil, floor, etc.) */ -#if !defined(SIMDE_FAST_ROUND_MODE) && !defined(SIMDE_NO_FAST_ROUND_MODE) && defined(SIMDE_FAST_MATH) - #define SIMDE_FAST_ROUND_MODE -#endif - -/* This controls how ties are rounded. For example, does 10.5 round to - * 10 or 11? IEEE 754 specifies round-towards-even, but ARMv7 (for - * example) doesn't support it and it must be emulated (which is rather - * slow). If you're okay with just using the default for whatever arch - * you're on, you should definitely define this. 
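 *
 * [Editor's illustrative aside -- not upstream text; test.c is an invented file name, and
 *  the -D/-f flags are the ones named in the comments above.] Example builds:
 *
 *     cc -O2 -DSIMDE_FAST_MATH test.c
 *         enables every SIMDE_FAST_* trade-off at once
 *     cc -O2 -ffast-math -DSIMDE_NO_FAST_NANS test.c
 *         fast-math behaviour, but faithful NaN handling is kept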
- * - * Note that we don't use this macro to avoid correct implementations - * in functions which are explicitly about rounding (such as vrnd* on - * NEON, _mm_round_* on x86, etc.); it is only used for code where - * rounding is a component in another function, and even then it isn't - * usually a problem since such functions will use the current rounding - * mode. */ -#if !defined(SIMDE_FAST_ROUND_TIES) && !defined(SIMDE_NO_FAST_ROUND_TIES) && defined(SIMDE_FAST_MATH) - #define SIMDE_FAST_ROUND_TIES -#endif - -/* For functions which convert from one type to another (mostly from - * floating point to integer types), sometimes we need to do a range - * check and potentially return a different result if the value - * falls outside that range. Skipping this check can provide a - * performance boost, at the expense of faithfulness to the API we're - * emulating. */ -#if !defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_NO_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_MATH) - #define SIMDE_FAST_CONVERSION_RANGE -#endif - -/* Due to differences across platforms, sometimes it can be much - * faster for us to allow spurious floating point exceptions, - * or to no generate them when we should. */ -#if !defined(SIMDE_FAST_EXCEPTIONS) && !defined(SIMDE_NO_FAST_EXCEPTIONS) && defined(SIMDE_FAST_MATH) - #define SIMDE_FAST_EXCEPTIONS -#endif - -#if \ - HEDLEY_HAS_BUILTIN(__builtin_constant_p) || \ - HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_TINYC_VERSION_CHECK(0,9,19) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(6,1,0) || \ - (HEDLEY_SUNPRO_VERSION_CHECK(5,10,0) && !defined(__cplusplus)) || \ - HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define SIMDE_CHECK_CONSTANT_(expr) (__builtin_constant_p(expr)) -#elif defined(__cplusplus) && (__cplusplus > 201703L) - #include - #define SIMDE_CHECK_CONSTANT_(expr) (std::is_constant_evaluated()) -#endif - -#if !defined(SIMDE_NO_CHECK_IMMEDIATE_CONSTANT) - #if defined(SIMDE_CHECK_CONSTANT_) && \ - SIMDE_DETECT_CLANG_VERSION_CHECK(9,0,0) && \ - (!defined(__apple_build_version__) || ((__apple_build_version__ < 11000000) || (__apple_build_version__ >= 12000000))) - #define SIMDE_REQUIRE_CONSTANT(arg) HEDLEY_REQUIRE_MSG(SIMDE_CHECK_CONSTANT_(arg), "`" #arg "' must be constant") - #else - #define SIMDE_REQUIRE_CONSTANT(arg) - #endif -#else - #define SIMDE_REQUIRE_CONSTANT(arg) -#endif - -#define SIMDE_REQUIRE_RANGE(arg, min, max) \ - HEDLEY_REQUIRE_MSG((((arg) >= (min)) && ((arg) <= (max))), "'" #arg "' must be in [" #min ", " #max "]") - -#define SIMDE_REQUIRE_CONSTANT_RANGE(arg, min, max) \ - SIMDE_REQUIRE_CONSTANT(arg) \ - SIMDE_REQUIRE_RANGE(arg, min, max) - -/* A copy of HEDLEY_STATIC_ASSERT, except we don't define an empty - * fallback if we can't find an implementation; instead we have to - * check if SIMDE_STATIC_ASSERT is defined before using it. */ -#if \ - !defined(__cplusplus) && ( \ - (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)) || \ - HEDLEY_HAS_FEATURE(c_static_assert) || \ - HEDLEY_GCC_VERSION_CHECK(6,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - defined(_Static_assert) \ - ) - /* Sometimes _Static_assert is defined (in cdefs.h) using a symbol which - * starts with a double-underscore. This is a system header so we have no - * control over it, but since it's a macro it will emit a diagnostic which - * prevents compilation with -Werror. 
*/ - #if HEDLEY_HAS_WARNING("-Wreserved-identifier") - #define SIMDE_STATIC_ASSERT(expr, message) (__extension__({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wreserved-identifier\"") \ - _Static_assert(expr, message); \ - HEDLEY_DIAGNOSTIC_POP \ - })) - #else - #define SIMDE_STATIC_ASSERT(expr, message) _Static_assert(expr, message) - #endif -#elif \ - (defined(__cplusplus) && (__cplusplus >= 201103L)) || \ - HEDLEY_MSVC_VERSION_CHECK(16,0,0) - #define SIMDE_STATIC_ASSERT(expr, message) HEDLEY_DIAGNOSTIC_DISABLE_CPP98_COMPAT_WRAP_(static_assert(expr, message)) -#endif - -/* Statement exprs */ -#if \ - HEDLEY_GNUC_VERSION_CHECK(2,95,0) || \ - HEDLEY_TINYC_VERSION_CHECK(0,9,26) || \ - HEDLEY_INTEL_VERSION_CHECK(9,0,0) || \ - HEDLEY_PGI_VERSION_CHECK(18,10,0) || \ - HEDLEY_SUNPRO_VERSION_CHECK(5,12,0) || \ - HEDLEY_IBM_VERSION_CHECK(11,1,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) - #define SIMDE_STATEMENT_EXPR_(expr) (__extension__ expr) -#endif - -/* This is just a convenience macro to make it easy to call a single - * function with a specific diagnostic disabled. */ -#if defined(SIMDE_STATEMENT_EXPR_) - #define SIMDE_DISABLE_DIAGNOSTIC_EXPR_(diagnostic, expr) \ - SIMDE_STATEMENT_EXPR_(({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - diagnostic \ - (expr); \ - HEDLEY_DIAGNOSTIC_POP \ - })) -#endif - -#if defined(SIMDE_CHECK_CONSTANT_) && defined(SIMDE_STATIC_ASSERT) - #define SIMDE_ASSERT_CONSTANT_(v) SIMDE_STATIC_ASSERT(SIMDE_CHECK_CONSTANT_(v), #v " must be constant.") -#endif - -#if \ - (HEDLEY_HAS_ATTRIBUTE(may_alias) && !defined(HEDLEY_SUNPRO_VERSION)) || \ - HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,0) -# define SIMDE_MAY_ALIAS __attribute__((__may_alias__)) -#else -# define SIMDE_MAY_ALIAS -#endif - -/* Lots of compilers support GCC-style vector extensions, but many - don't support all the features. Define different macros depending - on support for - - * SIMDE_VECTOR - Declaring a vector. - * SIMDE_VECTOR_OPS - basic operations (binary and unary). - * SIMDE_VECTOR_NEGATE - negating a vector - * SIMDE_VECTOR_SCALAR - For binary operators, the second argument - can be a scalar, in which case the result is as if that scalar - had been broadcast to all lanes of a vector. - * SIMDE_VECTOR_SUBSCRIPT - Supports array subscript notation for - extracting/inserting a single element.= - - SIMDE_VECTOR can be assumed if any others are defined, the - others are independent. 
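As a stand-alone illustration of the vector-extension features those macros detect, a minimal GCC/Clang-only sketch (the type name v4i32 is invented here):

    #include <stdint.h>
    #include <stdio.h>

    /* What SIMDE_VECTOR(16) expands to when the extension is available. */
    typedef int32_t v4i32 __attribute__((__vector_size__(16)));

    int main(void) {
        v4i32 a = { 1, 2, 3, 4 };
        v4i32 b = { 10, 20, 30, 40 };
        v4i32 sum = a + b;   /* SIMDE_VECTOR_OPS: element-wise binary op */
        v4i32 neg = -a;      /* SIMDE_VECTOR_NEGATE                      */
        v4i32 scl = a * 3;   /* SIMDE_VECTOR_SCALAR: scalar broadcast    */
        /* SIMDE_VECTOR_SUBSCRIPT: array-style access to single lanes.   */
        printf("%d %d %d\n", (int) sum[0], (int) neg[1], (int) scl[2]);
        return 0;
    }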
*/ -#if !defined(SIMDE_NO_VECTOR) -# if \ - HEDLEY_GCC_VERSION_CHECK(4,8,0) -# define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) -# define SIMDE_VECTOR_OPS -# define SIMDE_VECTOR_NEGATE -# define SIMDE_VECTOR_SCALAR -# define SIMDE_VECTOR_SUBSCRIPT -# elif HEDLEY_INTEL_VERSION_CHECK(16,0,0) -# define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) -# define SIMDE_VECTOR_OPS -# define SIMDE_VECTOR_NEGATE -/* ICC only supports SIMDE_VECTOR_SCALAR for constants */ -# define SIMDE_VECTOR_SUBSCRIPT -# elif \ - HEDLEY_GCC_VERSION_CHECK(4,1,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_MCST_LCC_VERSION_CHECK(1,25,10) -# define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) -# define SIMDE_VECTOR_OPS -# elif HEDLEY_SUNPRO_VERSION_CHECK(5,12,0) -# define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) -# elif HEDLEY_HAS_ATTRIBUTE(vector_size) -# define SIMDE_VECTOR(size) __attribute__((__vector_size__(size))) -# define SIMDE_VECTOR_OPS -# define SIMDE_VECTOR_NEGATE -# define SIMDE_VECTOR_SUBSCRIPT -# if SIMDE_DETECT_CLANG_VERSION_CHECK(5,0,0) -# define SIMDE_VECTOR_SCALAR -# endif -# endif - -/* GCC and clang have built-in functions to handle shuffling and - converting of vectors, but the implementations are slightly - different. This macro is just an abstraction over them. Note that - elem_size is in bits but vec_size is in bytes. */ -# if !defined(SIMDE_NO_SHUFFLE_VECTOR) && defined(SIMDE_VECTOR_SUBSCRIPT) - HEDLEY_DIAGNOSTIC_PUSH - /* We don't care about -Wvariadic-macros; all compilers that support - * shufflevector/shuffle support them. */ -# if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") -# pragma clang diagnostic ignored "-Wc++98-compat-pedantic" -# endif -# if HEDLEY_HAS_WARNING("-Wvariadic-macros") || HEDLEY_GCC_VERSION_CHECK(4,0,0) -# pragma GCC diagnostic ignored "-Wvariadic-macros" -# endif - -# if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) -# define SIMDE_SHUFFLE_VECTOR_(elem_size, vec_size, a, b, ...) __builtin_shufflevector(a, b, __VA_ARGS__) -# elif HEDLEY_GCC_HAS_BUILTIN(__builtin_shuffle,4,7,0) && !defined(__INTEL_COMPILER) -# define SIMDE_SHUFFLE_VECTOR_(elem_size, vec_size, a, b, ...) (__extension__ ({ \ - int##elem_size##_t SIMDE_VECTOR(vec_size) simde_shuffle_ = { __VA_ARGS__ }; \ - __builtin_shuffle(a, b, simde_shuffle_); \ - })) -# endif - HEDLEY_DIAGNOSTIC_POP -# endif - -/* TODO: this actually works on XL C/C++ without SIMDE_VECTOR_SUBSCRIPT - but the code needs to be refactored a bit to take advantage. */ -# if !defined(SIMDE_NO_CONVERT_VECTOR) && defined(SIMDE_VECTOR_SUBSCRIPT) -# if HEDLEY_HAS_BUILTIN(__builtin_convertvector) || HEDLEY_GCC_VERSION_CHECK(9,0,0) -# if HEDLEY_GCC_VERSION_CHECK(9,0,0) && !HEDLEY_GCC_VERSION_CHECK(9,3,0) - /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=93557 */ -# define SIMDE_CONVERT_VECTOR_(to, from) ((to) = (__extension__({ \ - __typeof__(from) from_ = (from); \ - ((void) from_); \ - __builtin_convertvector(from_, __typeof__(to)); \ - }))) -# else -# define SIMDE_CONVERT_VECTOR_(to, from) ((to) = __builtin_convertvector((from), __typeof__(to))) -# endif -# endif -# endif -#endif - -/* Since we currently require SUBSCRIPT before using a vector in a - union, we define these as dependencies of SUBSCRIPT. They are - likely to disappear in the future, once SIMDe learns how to make - use of vectors without using the union members. Do not use them - in your code unless you're okay with it breaking when SIMDe - changes. 
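A Clang-flavoured sketch of the two builtins those wrappers abstract (type names invented here; on the GCC path the indices are instead passed to __builtin_shuffle as a third vector, as the macro above shows):

    #include <stdint.h>

    typedef int32_t v4i32 __attribute__((__vector_size__(16)));
    typedef float   v4f32 __attribute__((__vector_size__(16)));

    /* Roughly SIMDE_SHUFFLE_VECTOR_(32, 16, a, a, 3, 2, 1, 0) on Clang. */
    static v4i32 reverse_lanes(v4i32 a) {
        return __builtin_shufflevector(a, a, 3, 2, 1, 0);
    }

    /* Roughly SIMDE_CONVERT_VECTOR_(r, a): element-wise int32 -> float. */
    static v4f32 to_float(v4i32 a) {
        return __builtin_convertvector(a, v4f32);
    }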
*/ -#if defined(SIMDE_VECTOR_SUBSCRIPT) -# if defined(SIMDE_VECTOR_OPS) -# define SIMDE_VECTOR_SUBSCRIPT_OPS -# endif -# if defined(SIMDE_VECTOR_SCALAR) -# define SIMDE_VECTOR_SUBSCRIPT_SCALAR -# endif -#endif - -#if !defined(SIMDE_DISABLE_OPENMP) - #if !defined(SIMDE_ENABLE_OPENMP) && ((defined(_OPENMP) && (_OPENMP >= 201307L)) || (defined(_OPENMP_SIMD) && (_OPENMP_SIMD >= 201307L))) || defined(HEDLEY_MCST_LCC_VERSION) - #define SIMDE_ENABLE_OPENMP - #endif -#endif - -#if !defined(SIMDE_ENABLE_CILKPLUS) && (defined(__cilk) || defined(HEDLEY_INTEL_VERSION)) -# define SIMDE_ENABLE_CILKPLUS -#endif - -#if defined(SIMDE_ENABLE_OPENMP) -# define SIMDE_VECTORIZE HEDLEY_PRAGMA(omp simd) -# define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(omp simd safelen(l)) -# if defined(__clang__) -# define SIMDE_VECTORIZE_REDUCTION(r) \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wsign-conversion\"") \ - HEDLEY_PRAGMA(omp simd reduction(r)) \ - HEDLEY_DIAGNOSTIC_POP -# else -# define SIMDE_VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(omp simd reduction(r)) -# endif -# if !defined(HEDLEY_MCST_LCC_VERSION) -# define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(omp simd aligned(a)) -# else -# define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(omp simd) -# endif -#elif defined(SIMDE_ENABLE_CILKPLUS) -# define SIMDE_VECTORIZE HEDLEY_PRAGMA(simd) -# define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(simd vectorlength(l)) -# define SIMDE_VECTORIZE_REDUCTION(r) HEDLEY_PRAGMA(simd reduction(r)) -# define SIMDE_VECTORIZE_ALIGNED(a) HEDLEY_PRAGMA(simd aligned(a)) -#elif defined(__clang__) && !defined(HEDLEY_IBM_VERSION) -# define SIMDE_VECTORIZE HEDLEY_PRAGMA(clang loop vectorize(enable)) -# define SIMDE_VECTORIZE_SAFELEN(l) HEDLEY_PRAGMA(clang loop vectorize_width(l)) -# define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE -# define SIMDE_VECTORIZE_ALIGNED(a) -#elif HEDLEY_GCC_VERSION_CHECK(4,9,0) -# define SIMDE_VECTORIZE HEDLEY_PRAGMA(GCC ivdep) -# define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE -# define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE -# define SIMDE_VECTORIZE_ALIGNED(a) -#elif HEDLEY_CRAY_VERSION_CHECK(5,0,0) -# define SIMDE_VECTORIZE HEDLEY_PRAGMA(_CRI ivdep) -# define SIMDE_VECTORIZE_SAFELEN(l) SIMDE_VECTORIZE -# define SIMDE_VECTORIZE_REDUCTION(r) SIMDE_VECTORIZE -# define SIMDE_VECTORIZE_ALIGNED(a) -#else -# define SIMDE_VECTORIZE -# define SIMDE_VECTORIZE_SAFELEN(l) -# define SIMDE_VECTORIZE_REDUCTION(r) -# define SIMDE_VECTORIZE_ALIGNED(a) -#endif - -#define SIMDE_MASK_NZ_(v, mask) (((v) & (mask)) | !((v) & (mask))) - -/* Intended for checking coverage, you should never use this in - production. 
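A hypothetical caller of the hints defined above (assumes SIMDE_VECTORIZE is in scope, plus -fopenmp-simd or equivalent for the OpenMP branch):

    #include <stddef.h>

    /* The pragma becomes "omp simd", a clang/GCC loop hint, or nothing,
     * depending on which branch above was selected. */
    static void saxpy(float *dest, const float *src, float a, size_t n) {
        SIMDE_VECTORIZE
        for (size_t i = 0 ; i < n ; i++) {
            dest[i] += a * src[i];
        }
    }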
*/ -#if defined(SIMDE_NO_INLINE) -# define SIMDE_FUNCTION_ATTRIBUTES HEDLEY_NEVER_INLINE static -#else -# define SIMDE_FUNCTION_ATTRIBUTES HEDLEY_ALWAYS_INLINE static -#endif - -#if defined(SIMDE_NO_INLINE) -# define SIMDE_HUGE_FUNCTION_ATTRIBUTES HEDLEY_NEVER_INLINE static -#elif defined(SIMDE_CONSTRAINED_COMPILATION) -# define SIMDE_HUGE_FUNCTION_ATTRIBUTES static -#else -# define SIMDE_HUGE_FUNCTION_ATTRIBUTES HEDLEY_ALWAYS_INLINE static -#endif - -#if \ - HEDLEY_HAS_ATTRIBUTE(unused) || \ - HEDLEY_GCC_VERSION_CHECK(2,95,0) -# define SIMDE_FUNCTION_POSSIBLY_UNUSED_ __attribute__((__unused__)) -#else -# define SIMDE_FUNCTION_POSSIBLY_UNUSED_ -#endif - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ - -#if defined(_MSC_VER) -# define SIMDE_BEGIN_DECLS_ HEDLEY_DIAGNOSTIC_PUSH __pragma(warning(disable:4996 4204)) HEDLEY_BEGIN_C_DECLS -# define SIMDE_END_DECLS_ HEDLEY_DIAGNOSTIC_POP HEDLEY_END_C_DECLS -#else -# define SIMDE_BEGIN_DECLS_ \ - HEDLEY_DIAGNOSTIC_PUSH \ - SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ \ - HEDLEY_BEGIN_C_DECLS -# define SIMDE_END_DECLS_ \ - HEDLEY_END_C_DECLS \ - HEDLEY_DIAGNOSTIC_POP -#endif - -#if defined(__SIZEOF_INT128__) -# define SIMDE_HAVE_INT128_ -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_ -typedef __int128 simde_int128; -typedef unsigned __int128 simde_uint128; -HEDLEY_DIAGNOSTIC_POP -#endif - -#if !defined(SIMDE_ENDIAN_LITTLE) -# define SIMDE_ENDIAN_LITTLE 1234 -#endif -#if !defined(SIMDE_ENDIAN_BIG) -# define SIMDE_ENDIAN_BIG 4321 -#endif - -#if !defined(SIMDE_ENDIAN_ORDER) -/* GCC (and compilers masquerading as GCC) define __BYTE_ORDER__. */ -# if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE -# elif defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG -/* TI defines _BIG_ENDIAN or _LITTLE_ENDIAN */ -# elif defined(_BIG_ENDIAN) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG -# elif defined(_LITTLE_ENDIAN) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE -/* We know the endianness of some common architectures. Common - * architectures not listed (ARM, POWER, MIPS, etc.) here are - * bi-endian. */ -# elif defined(__amd64) || defined(_M_X64) || defined(__i386) || defined(_M_IX86) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE -# elif defined(__s390x__) || defined(__zarch__) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG -/* Looks like we'll have to rely on the platform. If we're missing a - * platform, please let us know. 
*/ -# elif defined(_WIN32) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE -# elif defined(sun) || defined(__sun) /* Solaris */ -# include -# if defined(_LITTLE_ENDIAN) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE -# elif defined(_BIG_ENDIAN) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG -# endif -# elif defined(__APPLE__) -# include -# if defined(__LITTLE_ENDIAN__) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE -# elif defined(__BIG_ENDIAN__) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG -# endif -# elif defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || defined(__bsdi__) || defined(__DragonFly__) || defined(BSD) -# include -# if defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE -# elif defined(__BYTE_ORDER) && (__BYTE_ORDER == __BIG_ENDIAN) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG -# endif -# elif defined(__linux__) || defined(__linux) || defined(__gnu_linux__) -# include -# if defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && (__BYTE_ORDER == __LITTLE_ENDIAN) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_LITTLE -# elif defined(__BYTE_ORDER) && defined(__BIG_ENDIAN) && (__BYTE_ORDER == __BIG_ENDIAN) -# define SIMDE_ENDIAN_ORDER SIMDE_ENDIAN_BIG -# endif -# endif -#endif - -#if \ - HEDLEY_HAS_BUILTIN(__builtin_bswap64) || \ - HEDLEY_GCC_VERSION_CHECK(4,3,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) - #define simde_bswap64(v) __builtin_bswap64(v) -#elif HEDLEY_MSVC_VERSION_CHECK(13,10,0) - #define simde_bswap64(v) _byteswap_uint64(v) -#else - SIMDE_FUNCTION_ATTRIBUTES - uint64_t - simde_bswap64(uint64_t v) { - return - ((v & (((uint64_t) 0xff) << 56)) >> 56) | - ((v & (((uint64_t) 0xff) << 48)) >> 40) | - ((v & (((uint64_t) 0xff) << 40)) >> 24) | - ((v & (((uint64_t) 0xff) << 32)) >> 8) | - ((v & (((uint64_t) 0xff) << 24)) << 8) | - ((v & (((uint64_t) 0xff) << 16)) << 24) | - ((v & (((uint64_t) 0xff) << 8)) << 40) | - ((v & (((uint64_t) 0xff) )) << 56); - } -#endif - -#if !defined(SIMDE_ENDIAN_ORDER) -# error Unknown byte order; please file a bug -#else -# if SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE -# define simde_endian_bswap64_be(value) simde_bswap64(value) -# define simde_endian_bswap64_le(value) (value) -# elif SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG -# define simde_endian_bswap64_be(value) (value) -# define simde_endian_bswap64_le(value) simde_bswap64(value) -# endif -#endif - -/* TODO: we should at least make an attempt to detect the correct - types for simde_float32/float64 instead of just assuming float and - double. 
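A short usage sketch for the byte-swap helpers above (the function name is invented here; assumes the definitions above are in scope):

    #include <stdint.h>
    #include <string.h>

    /* Stores v as 8 little-endian bytes on any host:
     * simde_endian_bswap64_le() is the identity on little-endian targets
     * and simde_bswap64() on big-endian ones. */
    static void store_u64_le(uint8_t out[8], uint64_t v) {
        uint64_t le = simde_endian_bswap64_le(v);
        memcpy(out, &le, sizeof le);
    }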
*/ - -#if !defined(SIMDE_FLOAT32_TYPE) -# define SIMDE_FLOAT32_TYPE float -# define SIMDE_FLOAT32_C(value) value##f -#else -# define SIMDE_FLOAT32_C(value) ((SIMDE_FLOAT32_TYPE) value) -#endif -typedef SIMDE_FLOAT32_TYPE simde_float32; - -#if !defined(SIMDE_FLOAT64_TYPE) -# define SIMDE_FLOAT64_TYPE double -# define SIMDE_FLOAT64_C(value) value -#else -# define SIMDE_FLOAT64_C(value) ((SIMDE_FLOAT64_TYPE) value) -#endif -typedef SIMDE_FLOAT64_TYPE simde_float64; - -#if defined(SIMDE_POLY8_TYPE) -# undef SIMDE_POLY8_TYPE -#endif -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define SIMDE_POLY8_TYPE poly8_t -# define SIMDE_POLY8_C(value) (HEDLEY_STATIC_CAST(poly8_t, value)) -#else -# define SIMDE_POLY8_TYPE uint8_t -# define SIMDE_POLY8_C(value) (HEDLEY_STATIC_CAST(uint8_t, value)) -#endif -typedef SIMDE_POLY8_TYPE simde_poly8; - -#if defined(SIMDE_POLY16_TYPE) -# undef SIMDE_POLY16_TYPE -#endif -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define SIMDE_POLY16_TYPE poly16_t -# define SIMDE_POLY16_C(value) (HEDLEY_STATIC_CAST(poly16_t, value)) -#else -# define SIMDE_POLY16_TYPE uint16_t -# define SIMDE_POLY16_C(value) (HEDLEY_STATIC_CAST(uint16_t, value)) -#endif -typedef SIMDE_POLY16_TYPE simde_poly16; - -#if defined(SIMDE_POLY64_TYPE) -# undef SIMDE_POLY64_TYPE -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) -# define SIMDE_POLY64_TYPE poly64_t -# define SIMDE_POLY64_C(value) (HEDLEY_STATIC_CAST(poly64_t, value ## ull)) -#else -# define SIMDE_POLY64_TYPE uint64_t -# define SIMDE_POLY64_C(value) value ## ull -#endif -typedef SIMDE_POLY64_TYPE simde_poly64; - -#if defined(SIMDE_POLY128_TYPE) -# undef SIMDE_POLY128_TYPE -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) -# define SIMDE_POLY128_TYPE poly128_t -# define SIMDE_POLY128_C(value) value -#elif defined(__SIZEOF_INT128__) -# define SIMDE_POLY128_TYPE __int128 -# define SIMDE_POLY128_C(value) (HEDLEY_STATIC_CAST(__int128, value)) -#else -# define SIMDE_POLY128_TYPE uint64_t -# define SIMDE_TARGET_NOT_SUPPORT_INT128_TYPE 1 -#endif -typedef SIMDE_POLY128_TYPE simde_poly128; - -#if defined(__cplusplus) - typedef bool simde_bool; -#elif defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) - typedef _Bool simde_bool; -#elif defined(bool) - typedef bool simde_bool; -#else - #include - typedef bool simde_bool; -#endif - -#if HEDLEY_HAS_WARNING("-Wbad-function-cast") -# define SIMDE_CONVERT_FTOI(T,v) \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wbad-function-cast\"") \ - HEDLEY_STATIC_CAST(T, (v)) \ - HEDLEY_DIAGNOSTIC_POP -#else -# define SIMDE_CONVERT_FTOI(T,v) ((T) (v)) -#endif - -/* TODO: detect compilers which support this outside of C11 mode */ -#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) - #define SIMDE_CHECKED_REINTERPRET_CAST(to, from, value) _Generic((value), to: (value), default: (_Generic((value), from: ((to) (value))))) - #define SIMDE_CHECKED_STATIC_CAST(to, from, value) _Generic((value), to: (value), default: (_Generic((value), from: ((to) (value))))) -#else - #define SIMDE_CHECKED_REINTERPRET_CAST(to, from, value) HEDLEY_REINTERPRET_CAST(to, value) - #define SIMDE_CHECKED_STATIC_CAST(to, from, value) HEDLEY_STATIC_CAST(to, value) -#endif - -#if HEDLEY_HAS_WARNING("-Wfloat-equal") -# define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL _Pragma("clang diagnostic ignored \"-Wfloat-equal\"") -#elif HEDLEY_GCC_VERSION_CHECK(3,0,0) -# define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL _Pragma("GCC diagnostic ignored \"-Wfloat-equal\"") -#else -# define 
SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL -#endif - -/* Some functions can trade accuracy for speed. For those functions - you can control the trade-off using this macro. Possible values: - - 0: prefer speed - 1: reasonable trade-offs - 2: prefer accuracy */ -#if !defined(SIMDE_ACCURACY_PREFERENCE) -# define SIMDE_ACCURACY_PREFERENCE 1 -#endif - -#if defined(__STDC_HOSTED__) -# define SIMDE_STDC_HOSTED __STDC_HOSTED__ -#else -# if \ - defined(HEDLEY_PGI_VERSION) || \ - defined(HEDLEY_MSVC_VERSION) -# define SIMDE_STDC_HOSTED 1 -# else -# define SIMDE_STDC_HOSTED 0 -# endif -#endif - -/* Try to deal with environments without a standard library. */ -#if !defined(simde_memcpy) - #if HEDLEY_HAS_BUILTIN(__builtin_memcpy) - #define simde_memcpy(dest, src, n) __builtin_memcpy(dest, src, n) - #endif -#endif -#if !defined(simde_memset) - #if HEDLEY_HAS_BUILTIN(__builtin_memset) - #define simde_memset(s, c, n) __builtin_memset(s, c, n) - #endif -#endif -#if !defined(simde_memcmp) - #if HEDLEY_HAS_BUILTIN(__builtin_memcmp) - #define simde_memcmp(s1, s2, n) __builtin_memcmp(s1, s2, n) - #endif -#endif - -#if !defined(simde_memcpy) || !defined(simde_memset) || !defined(simde_memcmp) - #if !defined(SIMDE_NO_STRING_H) - #if defined(__has_include) - #if !__has_include() - #define SIMDE_NO_STRING_H - #endif - #elif (SIMDE_STDC_HOSTED == 0) - #define SIMDE_NO_STRING_H - #endif - #endif - - #if !defined(SIMDE_NO_STRING_H) - #include - #if !defined(simde_memcpy) - #define simde_memcpy(dest, src, n) memcpy(dest, src, n) - #endif - #if !defined(simde_memset) - #define simde_memset(s, c, n) memset(s, c, n) - #endif - #if !defined(simde_memcmp) - #define simde_memcmp(s1, s2, n) memcmp(s1, s2, n) - #endif - #else - /* These are meant to be portable, not fast. If you're hitting them you - * should think about providing your own (by defining the simde_memcpy - * macro prior to including any SIMDe files) or submitting a patch to - * SIMDe so we can detect your system-provided memcpy/memset, like by - * adding your compiler to the checks for __builtin_memcpy and/or - * __builtin_memset. 
*/ - #if !defined(simde_memcpy) - SIMDE_FUNCTION_ATTRIBUTES - void - simde_memcpy_(void* dest, const void* src, size_t len) { - char* dest_ = HEDLEY_STATIC_CAST(char*, dest); - char* src_ = HEDLEY_STATIC_CAST(const char*, src); - for (size_t i = 0 ; i < len ; i++) { - dest_[i] = src_[i]; - } - } - #define simde_memcpy(dest, src, n) simde_memcpy_(dest, src, n) - #endif - - #if !defined(simde_memset) - SIMDE_FUNCTION_ATTRIBUTES - void - simde_memset_(void* s, int c, size_t len) { - char* s_ = HEDLEY_STATIC_CAST(char*, s); - char c_ = HEDLEY_STATIC_CAST(char, c); - for (size_t i = 0 ; i < len ; i++) { - s_[i] = c_[i]; - } - } - #define simde_memset(s, c, n) simde_memset_(s, c, n) - #endif - - #if !defined(simde_memcmp) - SIMDE_FUCTION_ATTRIBUTES - int - simde_memcmp_(const void *s1, const void *s2, size_t n) { - unsigned char* s1_ = HEDLEY_STATIC_CAST(unsigned char*, s1); - unsigned char* s2_ = HEDLEY_STATIC_CAST(unsigned char*, s2); - for (size_t i = 0 ; i < len ; i++) { - if (s1_[i] != s2_[i]) { - return (int) (s1_[i] - s2_[i]); - } - } - return 0; - } - #define simde_memcmp(s1, s2, n) simde_memcmp_(s1, s2, n) - #endif - #endif -#endif - -/*** Functions that quiet a signaling NaN ***/ - -static HEDLEY_INLINE -double -simde_math_quiet(double x) { - uint64_t tmp, mask; - if (!simde_math_isnan(x)) { - return x; - } - simde_memcpy(&tmp, &x, 8); - mask = 0x7ff80000; - mask <<= 32; - tmp |= mask; - simde_memcpy(&x, &tmp, 8); - return x; -} - -static HEDLEY_INLINE -float -simde_math_quietf(float x) { - uint32_t tmp; - if (!simde_math_isnanf(x)) { - return x; - } - simde_memcpy(&tmp, &x, 4); - tmp |= 0x7fc00000lu; - simde_memcpy(&x, &tmp, 4); - return x; -} - -#if defined(FE_ALL_EXCEPT) - #define SIMDE_HAVE_FENV_H -#elif defined(__has_include) - #if __has_include() - #include - #define SIMDE_HAVE_FENV_H - #endif -#elif SIMDE_STDC_HOSTED == 1 - #include - #define SIMDE_HAVE_FENV_H -#endif - -#if defined(EXIT_FAILURE) - #define SIMDE_HAVE_STDLIB_H -#elif defined(__has_include) - #if __has_include() - #include - #define SIMDE_HAVE_STDLIB_H - #endif -#elif SIMDE_STDC_HOSTED == 1 - #include - #define SIMDE_HAVE_STDLIB_H -#endif - -#if defined(__has_include) -# if defined(__cplusplus) && (__cplusplus >= 201103L) && __has_include() -# include -# elif __has_include() -# include -# endif -# if __has_include() -# include -# endif -#elif SIMDE_STDC_HOSTED == 1 -# include -# include -#endif - -#define SIMDE_DEFINE_CONVERSION_FUNCTION_(Name, T_To, T_From) \ - static HEDLEY_ALWAYS_INLINE HEDLEY_CONST SIMDE_FUNCTION_POSSIBLY_UNUSED_ \ - T_To \ - Name (T_From value) { \ - T_To r; \ - simde_memcpy(&r, &value, sizeof(r)); \ - return r; \ - } - -SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_float32_as_uint32, uint32_t, simde_float32) -SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_uint32_as_float32, simde_float32, uint32_t) -SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_float64_as_uint64, uint64_t, simde_float64) -SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_uint64_as_float64, simde_float64, uint64_t) - -#include "check.h" - -/* GCC/clang have a bunch of functionality in builtins which we would - * like to access, but the suffixes indicate whether the operate on - * int, long, or long long, not fixed width types (e.g., int32_t). - * we use these macros to attempt to map from fixed-width to the - * names GCC uses. Note that you should still cast the input(s) and - * return values (to/from SIMDE_BUILTIN_TYPE_*_) since often even if - * types are the same size they may not be compatible according to the - * compiler. 
For example, on x86 long and long lonsg are generally - * both 64 bits, but platforms vary on whether an int64_t is mapped - * to a long or long long. */ - -#include - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ - -#if (INT8_MAX == INT_MAX) && (INT8_MIN == INT_MIN) - #define SIMDE_BUILTIN_SUFFIX_8_ - #define SIMDE_BUILTIN_TYPE_8_ int -#elif (INT8_MAX == LONG_MAX) && (INT8_MIN == LONG_MIN) - #define SIMDE_BUILTIN_SUFFIX_8_ l - #define SIMDE_BUILTIN_TYPE_8_ long -#elif (INT8_MAX == LLONG_MAX) && (INT8_MIN == LLONG_MIN) - #define SIMDE_BUILTIN_SUFFIX_8_ ll - #define SIMDE_BUILTIN_TYPE_8_ long long -#endif - -#if (INT16_MAX == INT_MAX) && (INT16_MIN == INT_MIN) - #define SIMDE_BUILTIN_SUFFIX_16_ - #define SIMDE_BUILTIN_TYPE_16_ int -#elif (INT16_MAX == LONG_MAX) && (INT16_MIN == LONG_MIN) - #define SIMDE_BUILTIN_SUFFIX_16_ l - #define SIMDE_BUILTIN_TYPE_16_ long -#elif (INT16_MAX == LLONG_MAX) && (INT16_MIN == LLONG_MIN) - #define SIMDE_BUILTIN_SUFFIX_16_ ll - #define SIMDE_BUILTIN_TYPE_16_ long long -#endif - -#if (INT32_MAX == INT_MAX) && (INT32_MIN == INT_MIN) - #define SIMDE_BUILTIN_SUFFIX_32_ - #define SIMDE_BUILTIN_TYPE_32_ int -#elif (INT32_MAX == LONG_MAX) && (INT32_MIN == LONG_MIN) - #define SIMDE_BUILTIN_SUFFIX_32_ l - #define SIMDE_BUILTIN_TYPE_32_ long -#elif (INT32_MAX == LLONG_MAX) && (INT32_MIN == LLONG_MIN) - #define SIMDE_BUILTIN_SUFFIX_32_ ll - #define SIMDE_BUILTIN_TYPE_32_ long long -#endif - -#if (INT64_MAX == INT_MAX) && (INT64_MIN == INT_MIN) - #define SIMDE_BUILTIN_SUFFIX_64_ - #define SIMDE_BUILTIN_TYPE_64_ int -#elif (INT64_MAX == LONG_MAX) && (INT64_MIN == LONG_MIN) - #define SIMDE_BUILTIN_SUFFIX_64_ l - #define SIMDE_BUILTIN_TYPE_64_ long -#elif (INT64_MAX == LLONG_MAX) && (INT64_MIN == LLONG_MIN) - #define SIMDE_BUILTIN_SUFFIX_64_ ll - #define SIMDE_BUILTIN_TYPE_64_ long long -#endif - -/* SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ */ -HEDLEY_DIAGNOSTIC_POP - -#if defined(SIMDE_BUILTIN_SUFFIX_8_) - #define SIMDE_BUILTIN_8_(name) HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_8_) - #define SIMDE_BUILTIN_HAS_8_(name) HEDLEY_HAS_BUILTIN(HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_8_)) -#else - #define SIMDE_BUILTIN_HAS_8_(name) 0 -#endif -#if defined(SIMDE_BUILTIN_SUFFIX_16_) - #define SIMDE_BUILTIN_16_(name) HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_16_) - #define SIMDE_BUILTIN_HAS_16_(name) HEDLEY_HAS_BUILTIN(HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_16_)) -#else - #define SIMDE_BUILTIN_HAS_16_(name) 0 -#endif -#if defined(SIMDE_BUILTIN_SUFFIX_32_) - #define SIMDE_BUILTIN_32_(name) HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_32_) - #define SIMDE_BUILTIN_HAS_32_(name) HEDLEY_HAS_BUILTIN(HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_32_)) -#else - #define SIMDE_BUILTIN_HAS_32_(name) 0 -#endif -#if defined(SIMDE_BUILTIN_SUFFIX_64_) - #define SIMDE_BUILTIN_64_(name) HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_64_) - #define SIMDE_BUILTIN_HAS_64_(name) HEDLEY_HAS_BUILTIN(HEDLEY_CONCAT3(__builtin_, name, SIMDE_BUILTIN_SUFFIX_64_)) -#else - #define SIMDE_BUILTIN_HAS_64_(name) 0 -#endif - -#if !defined(__cplusplus) - #if defined(__clang__) - #if HEDLEY_HAS_WARNING("-Wc11-extensions") - #define SIMDE_GENERIC_(...) (__extension__ ({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wc11-extensions\"") \ - _Generic(__VA_ARGS__); \ - HEDLEY_DIAGNOSTIC_POP \ - })) - #elif HEDLEY_HAS_WARNING("-Wc1x-extensions") - #define SIMDE_GENERIC_(...) 
(__extension__ ({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("clang diagnostic ignored \"-Wc1x-extensions\"") \ - _Generic(__VA_ARGS__); \ - HEDLEY_DIAGNOSTIC_POP \ - })) - #endif - #elif \ - defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) || \ - HEDLEY_HAS_EXTENSION(c_generic_selections) || \ - HEDLEY_GCC_VERSION_CHECK(4,9,0) || \ - HEDLEY_INTEL_VERSION_CHECK(17,0,0) || \ - HEDLEY_IBM_VERSION_CHECK(12,1,0) || \ - HEDLEY_ARM_VERSION_CHECK(5,3,0) - #define SIMDE_GENERIC_(...) _Generic(__VA_ARGS__) - #endif -#endif - -/* Sometimes we run into problems with specific versions of compilers - which make the native versions unusable for us. Often this is due - to missing functions, sometimes buggy implementations, etc. These - macros are how we check for specific bugs. As they are fixed we'll - start only defining them for problematic compiler versions. */ - -#if !defined(SIMDE_IGNORE_COMPILER_BUGS) -# if defined(HEDLEY_GCC_VERSION) -# if !HEDLEY_GCC_VERSION_CHECK(4,9,0) -# define SIMDE_BUG_GCC_REV_208793 -# endif -# if !HEDLEY_GCC_VERSION_CHECK(5,0,0) -# define SIMDE_BUG_GCC_BAD_MM_SRA_EPI32 /* TODO: find relevant bug or commit */ -# endif -# if !HEDLEY_GCC_VERSION_CHECK(6,0,0) -# define SIMDE_BUG_GCC_SIZEOF_IMMEDIATE -# endif -# if !HEDLEY_GCC_VERSION_CHECK(4,6,0) -# define SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8 /* TODO: find relevant bug or commit */ -# endif -# if !HEDLEY_GCC_VERSION_CHECK(8,0,0) -# define SIMDE_BUG_GCC_REV_247851 -# endif -# if !HEDLEY_GCC_VERSION_CHECK(10,0,0) -# define SIMDE_BUG_GCC_REV_274313 -# define SIMDE_BUG_GCC_91341 -# define SIMDE_BUG_GCC_92035 -# endif -# if !HEDLEY_GCC_VERSION_CHECK(9,0,0) && defined(SIMDE_ARCH_AARCH64) -# define SIMDE_BUG_GCC_ARM_SHIFT_SCALAR -# endif -# if !HEDLEY_GCC_VERSION_CHECK(9,0,0) && defined(SIMDE_ARCH_AARCH64) -# define SIMDE_BUG_GCC_BAD_VEXT_REV32 -# endif -# if !(HEDLEY_GCC_VERSION_CHECK(9,4,0) \ - || (HEDLEY_GCC_VERSION_CHECK(8,5,0) && !HEDLEY_GCC_VERSION_CHECK(9,0,0)) \ - ) && defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64) -# define SIMDE_BUG_GCC_94482 -# endif -# if (defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) || defined(SIMDE_ARCH_ZARCH) -# define SIMDE_BUG_GCC_53784 -# endif -# if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64) -# if HEDLEY_GCC_VERSION_CHECK(4,3,0) /* -Wsign-conversion */ -# define SIMDE_BUG_GCC_95144 -# endif -# if !HEDLEY_GCC_VERSION_CHECK(11,2,0) -# define SIMDE_BUG_GCC_95483 -# endif -# if defined(__OPTIMIZE__) -# define SIMDE_BUG_GCC_100927 -# endif -# if !(HEDLEY_GCC_VERSION_CHECK(10,3,0)) -# define SIMDE_BUG_GCC_98521 -# endif -# endif -# if !HEDLEY_GCC_VERSION_CHECK(9,4,0) && defined(SIMDE_ARCH_AARCH64) -# define SIMDE_BUG_GCC_94488 -# endif -# if !HEDLEY_GCC_VERSION_CHECK(9,1,0) && defined(SIMDE_ARCH_AARCH64) -# define SIMDE_BUG_GCC_REV_264019 -# endif -# if (!HEDLEY_GCC_VERSION_CHECK(9,0,0) && !defined(SIMDE_ARCH_AARCH64)) || (!defined(SIMDE_ARCH_AARCH64) && defined(SIMDE_ARCH_ARM)) -# define SIMDE_BUG_GCC_REV_260989 -# endif -# if defined(SIMDE_ARCH_ARM) && !defined(SIMDE_ARCH_AARCH64) -# define SIMDE_BUG_GCC_95399 -# define SIMDE_BUG_GCC_95471 -# define SIMDE_BUG_GCC_111609 -# elif defined(SIMDE_ARCH_POWER) -# define SIMDE_BUG_GCC_95227 -# define SIMDE_BUG_GCC_95782 -# if !HEDLEY_GCC_VERSION_CHECK(12,0,0) -# define SIMDE_BUG_VEC_CPSGN_REVERSED_ARGS -# endif -# elif defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64) -# if !HEDLEY_GCC_VERSION_CHECK(10,2,0) && !defined(__OPTIMIZE__) -# define SIMDE_BUG_GCC_96174 -# endif -# elif defined(SIMDE_ARCH_ZARCH) -# define 
SIMDE_BUG_GCC_95782 -# if HEDLEY_GCC_VERSION_CHECK(10,0,0) -# define SIMDE_BUG_GCC_101614 -# endif -# endif -# if defined(SIMDE_ARCH_MIPS_MSA) -# define SIMDE_BUG_GCC_97248 -# if !HEDLEY_GCC_VERSION_CHECK(12,1,0) -# define SIMDE_BUG_GCC_100760 -# define SIMDE_BUG_GCC_100761 -# define SIMDE_BUG_GCC_100762 -# endif -# endif -# if !defined(__OPTIMIZE__) && !(\ - HEDLEY_GCC_VERSION_CHECK(11,4,0) \ - || (HEDLEY_GCC_VERSION_CHECK(10,4,0) && !(HEDLEY_GCC_VERSION_CHECK(11,0,0))) \ - || (HEDLEY_GCC_VERSION_CHECK(9,5,0) && !(HEDLEY_GCC_VERSION_CHECK(10,0,0)))) -# define SIMDE_BUG_GCC_105339 -# endif -# elif defined(__clang__) -# if defined(SIMDE_ARCH_AARCH64) -# define SIMDE_BUG_CLANG_48257 // https://github.com/llvm/llvm-project/issues/47601 -# define SIMDE_BUG_CLANG_71362 // https://github.com/llvm/llvm-project/issues/71362 -# define SIMDE_BUG_CLANG_71365 // https://github.com/llvm/llvm-project/issues/71365 -# define SIMDE_BUG_CLANG_71751 // https://github.com/llvm/llvm-project/issues/71751 -# if !SIMDE_DETECT_CLANG_VERSION_CHECK(15,0,0) -# define SIMDE_BUG_CLANG_45541 -# endif -# if !SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0) -# define SIMDE_BUG_CLANG_46840 -# define SIMDE_BUG_CLANG_46844 -# endif -# if SIMDE_DETECT_CLANG_VERSION_CHECK(10,0,0) && SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0) -# define SIMDE_BUG_CLANG_BAD_VI64_OPS -# endif -# if SIMDE_DETECT_CLANG_VERSION_NOT(9,0,0) -# define SIMDE_BUG_CLANG_GIT_4EC445B8 -# define SIMDE_BUG_CLANG_REV_365298 /* 0464e07c8f6e3310c28eb210a4513bc2243c2a7e */ -# endif -# endif -# if defined(SIMDE_ARCH_ARM) -# if !SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0) -# define SIMDE_BUG_CLANG_BAD_VGET_SET_LANE_TYPES -# endif -# if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_ARM_NEON_A32V8_NATIVE) -# define SIMDE_BUG_CLANG_71763 // https://github.com/llvm/llvm-project/issues/71763 -# endif -# endif -# if defined(SIMDE_ARCH_POWER) && !SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0) -# define SIMDE_BUG_CLANG_46770 -# endif -# if defined(SIMDE_ARCH_POWER) && (SIMDE_ARCH_POWER == 700) && (SIMDE_DETECT_CLANG_VERSION_CHECK(11,0,0)) -# if !SIMDE_DETECT_CLANG_VERSION_CHECK(13,0,0) -# define SIMDE_BUG_CLANG_50893 -# define SIMDE_BUG_CLANG_50901 -# endif -# endif -# if defined(_ARCH_PWR9) && !SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0) && !defined(__OPTIMIZE__) -# define SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT -# endif -# if defined(SIMDE_ARCH_POWER) -# if !SIMDE_DETECT_CLANG_VERSION_CHECK(14,0,0) -# define SIMDE_BUG_CLANG_50932 -# endif -# if !SIMDE_DETECT_CLANG_VERSION_CHECK(12,0,0) -# define SIMDE_BUG_VEC_CPSGN_REVERSED_ARGS -# endif -# endif -# if defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64) -# if SIMDE_DETECT_CLANG_VERSION_NOT(5,0,0) -# define SIMDE_BUG_CLANG_REV_298042 /* 6afc436a7817a52e78ae7bcdc3faafd460124cac */ -# endif -# if SIMDE_DETECT_CLANG_VERSION_NOT(3,7,0) -# define SIMDE_BUG_CLANG_REV_234560 /* b929ad7b1726a32650a8051f69a747fb6836c540 */ -# endif -# if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) && SIMDE_DETECT_CLANG_VERSION_NOT(5,0,0) -# define SIMDE_BUG_CLANG_BAD_MADD -# endif -# if SIMDE_DETECT_CLANG_VERSION_CHECK(4,0,0) && SIMDE_DETECT_CLANG_VERSION_NOT(5,0,0) -# define SIMDE_BUG_CLANG_REV_299346 /* ac9959eb533a58482ea4da6c4db1e635a98de384 */ -# endif -# if SIMDE_DETECT_CLANG_VERSION_NOT(8,0,0) -# define SIMDE_BUG_CLANG_REV_344862 /* eae26bf73715994c2bd145f9b6dc3836aa4ffd4f */ -# endif -# if HEDLEY_HAS_WARNING("-Wsign-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0) -# define SIMDE_BUG_CLANG_45931 -# endif -# if HEDLEY_HAS_WARNING("-Wvector-conversion") && 
SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0) -# define SIMDE_BUG_CLANG_44589 -# endif -# define SIMDE_BUG_CLANG_48673 // https://github.com/llvm/llvm-project/issues/48017 -# endif -# define SIMDE_BUG_CLANG_45959 // https://github.com/llvm/llvm-project/issues/45304 -# if defined(SIMDE_ARCH_WASM_SIMD128) && !SIMDE_DETECT_CLANG_VERSION_CHECK(17,0,0) -# define SIMDE_BUG_CLANG_60655 -# endif -# elif defined(HEDLEY_MSVC_VERSION) -# if defined(SIMDE_ARCH_X86) -# define SIMDE_BUG_MSVC_ROUND_EXTRACT -# endif -# elif defined(HEDLEY_INTEL_VERSION) -# define SIMDE_BUG_INTEL_857088 -# elif defined(HEDLEY_MCST_LCC_VERSION) -# define SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS -# define SIMDE_BUG_MCST_LCC_MISSING_CMOV_M256 -# define SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT -# elif defined(HEDLEY_PGI_VERSION) -# define SIMDE_BUG_PGI_30104 -# define SIMDE_BUG_PGI_30107 -# define SIMDE_BUG_PGI_30106 -# endif -#endif - -/* GCC and Clang both have the same issue: - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95144 - * https://bugs.llvm.org/show_bug.cgi?id=45931 - * This is just an easy way to work around it. - */ -#if \ - (HEDLEY_HAS_WARNING("-Wsign-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(11,0,0)) || \ - HEDLEY_GCC_VERSION_CHECK(4,3,0) -# define SIMDE_BUG_IGNORE_SIGN_CONVERSION(expr) (__extension__ ({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - _Pragma("GCC diagnostic ignored \"-Wsign-conversion\"") \ - __typeof__(expr) simde_bug_ignore_sign_conversion_v_= (expr); \ - HEDLEY_DIAGNOSTIC_POP \ - simde_bug_ignore_sign_conversion_v_; \ - })) -#else -# define SIMDE_BUG_IGNORE_SIGN_CONVERSION(expr) (expr) -#endif - -/* Usually the shift count is signed (for example, NEON or SSE). - * OTOH, unsigned is good for PPC (vec_srl uses unsigned), and the only option for E2K. - * Further info: https://github.com/simd-everywhere/simde/pull/700 - */ -#if defined(SIMDE_ARCH_E2K) || defined(SIMDE_ARCH_POWER) - #define SIMDE_CAST_VECTOR_SHIFT_COUNT(width, value) HEDLEY_STATIC_CAST(uint##width##_t, (value)) -#else - #define SIMDE_CAST_VECTOR_SHIFT_COUNT(width, value) HEDLEY_STATIC_CAST(int##width##_t, (value)) -#endif - -/* SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ */ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_COMMON_H) */ diff --git a/ffi-deps/simde/simde/simde-complex.h b/ffi-deps/simde/simde/simde-complex.h deleted file mode 100644 index ce840e2..0000000 --- a/ffi-deps/simde/simde/simde-complex.h +++ /dev/null @@ -1,148 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020-2021 Evan Nemerson - */ - -/* Support for complex math. - * - * We try to avoid inculding (in C++ mode) since it pulls in - * a *lot* of code. Unfortunately this only works for GNU modes (i.e., - * -std=gnu++14 not -std=c++14) unless you pass -fext-numeric-literals, - * but there is no way (AFAICT) to detect that flag so we have to rely - * on __STRICT_ANSI__ to instead detect GNU mode. - * - * This header is separate from simde-math.h since there is a good - * chance it will pull in , and most of the time we don't need - * complex math (on x86 only SVML uses it). */ - -#if !defined(SIMDE_COMPLEX_H) -#define SIMDE_COMPLEX_H 1 - -#include "simde-math.h" - -#if ( \ - HEDLEY_HAS_BUILTIN(__builtin_creal) || \ - HEDLEY_GCC_VERSION_CHECK(4,7,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) \ - ) && (!defined(__cplusplus) && !defined(__STRICT_ANSI__)) - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_C99_EXTENSIONS_ - typedef __complex__ float simde_cfloat32; - typedef __complex__ double simde_cfloat64; - HEDLEY_DIAGNOSTIC_POP - #define SIMDE_MATH_CMPLX(x, y) (HEDLEY_STATIC_CAST(double, x) + HEDLEY_STATIC_CAST(double, y) * (__extension__ 1.0j)) - #define SIMDE_MATH_CMPLXF(x, y) (HEDLEY_STATIC_CAST(float, x) + HEDLEY_STATIC_CAST(float, y) * (__extension__ 1.0fj)) - - #if !defined(simde_math_creal) - #define simde_math_crealf(z) __builtin_crealf(z) - #endif - #if !defined(simde_math_crealf) - #define simde_math_creal(z) __builtin_creal(z) - #endif - #if !defined(simde_math_cimag) - #define simde_math_cimagf(z) __builtin_cimagf(z) - #endif - #if !defined(simde_math_cimagf) - #define simde_math_cimag(z) __builtin_cimag(z) - #endif - #if !defined(simde_math_cexp) - #define simde_math_cexp(z) __builtin_cexp(z) - #endif - #if !defined(simde_math_cexpf) - #define simde_math_cexpf(z) __builtin_cexpf(z) - #endif -#elif !defined(__cplusplus) - #include - - #if !defined(HEDLEY_MSVC_VERSION) - typedef float _Complex simde_cfloat32; - typedef double _Complex simde_cfloat64; - #else - typedef _Fcomplex simde_cfloat32; - typedef _Dcomplex simde_cfloat64; - #endif - - #if defined(HEDLEY_MSVC_VERSION) - #define SIMDE_MATH_CMPLX(x, y) ((simde_cfloat64) { (x), (y) }) - #define SIMDE_MATH_CMPLXF(x, y) ((simde_cfloat32) { (x), (y) }) - #elif defined(CMPLX) && defined(CMPLXF) - #define SIMDE_MATH_CMPLX(x, y) CMPLX(x, y) - #define SIMDE_MATH_CMPLXF(x, y) CMPLXF(x, y) - #else - #define SIMDE_MATH_CMPLX(x, y) (HEDLEY_STATIC_CAST(double, x) + HEDLEY_STATIC_CAST(double, y) * I) - #define SIMDE_MATH_CMPLXF(x, y) (HEDLEY_STATIC_CAST(float, x) + HEDLEY_STATIC_CAST(float, y) * I) - #endif - - #if !defined(simde_math_creal) - #define simde_math_creal(z) creal(z) - #endif - #if !defined(simde_math_crealf) - #define simde_math_crealf(z) crealf(z) - #endif - #if !defined(simde_math_cimag) - #define simde_math_cimag(z) cimag(z) - #endif - #if !defined(simde_math_cimagf) - #define simde_math_cimagf(z) cimagf(z) - #endif - #if !defined(simde_math_cexp) - #define simde_math_cexp(z) cexp(z) - #endif - #if !defined(simde_math_cexpf) - #define simde_math_cexpf(z) cexpf(z) - #endif -#else - HEDLEY_DIAGNOSTIC_PUSH - #if defined(HEDLEY_MSVC_VERSION) - #pragma warning(disable:4530) - #endif - #include - HEDLEY_DIAGNOSTIC_POP - - typedef 
std::complex simde_cfloat32; - typedef std::complex simde_cfloat64; - #define SIMDE_MATH_CMPLX(x, y) (std::complex(x, y)) - #define SIMDE_MATH_CMPLXF(x, y) (std::complex(x, y)) - - #if !defined(simde_math_creal) - #define simde_math_creal(z) ((z).real()) - #endif - #if !defined(simde_math_crealf) - #define simde_math_crealf(z) ((z).real()) - #endif - #if !defined(simde_math_cimag) - #define simde_math_cimag(z) ((z).imag()) - #endif - #if !defined(simde_math_cimagf) - #define simde_math_cimagf(z) ((z).imag()) - #endif - #if !defined(simde_math_cexp) - #define simde_math_cexp(z) std::exp(z) - #endif - #if !defined(simde_math_cexpf) - #define simde_math_cexpf(z) std::exp(z) - #endif -#endif - -#endif /* !defined(SIMDE_COMPLEX_H) */ diff --git a/ffi-deps/simde/simde/simde-constify.h b/ffi-deps/simde/simde/simde-constify.h deleted file mode 100644 index 94a9d38..0000000 --- a/ffi-deps/simde/simde/simde-constify.h +++ /dev/null @@ -1,397 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - */ - -/* Constify macros. For internal use only. - * - * These are used to make it possible to call a function which takes - * an Integer Constant Expression (ICE) using a compile time constant. - * Technically it would also be possible to use a value not trivially - * known by the compiler, but there would be a siginficant performance - * hit (a switch switch is used). - * - * The basic idea is pretty simple; we just emit a do while loop which - * contains a switch with a case for every possible value of the - * constant. - * - * As long as the value you pass to the function in constant, pretty - * much any copmiler shouldn't have a problem generating exactly the - * same code as if you had used an ICE. - * - * This is intended to be used in the SIMDe implementations of - * functions the compilers require to be an ICE, but the other benefit - * is that if we also disable the warnings from - * SIMDE_REQUIRE_CONSTANT_RANGE we can actually just allow the tests - * to use non-ICE parameters - */ - -#if !defined(SIMDE_CONSTIFY_H) -#define SIMDE_CONSTIFY_H - -#include "simde-diagnostic.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_ -SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ - -#define SIMDE_CONSTIFY_2_(func_name, result, default_case, imm, ...) 
\ - do { \ - switch(imm) { \ - case 0: result = func_name(__VA_ARGS__, 0); break; \ - case 1: result = func_name(__VA_ARGS__, 1); break; \ - default: result = default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_4_(func_name, result, default_case, imm, ...) \ - do { \ - switch(imm) { \ - case 0: result = func_name(__VA_ARGS__, 0); break; \ - case 1: result = func_name(__VA_ARGS__, 1); break; \ - case 2: result = func_name(__VA_ARGS__, 2); break; \ - case 3: result = func_name(__VA_ARGS__, 3); break; \ - default: result = default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_8_(func_name, result, default_case, imm, ...) \ - do { \ - switch(imm) { \ - case 0: result = func_name(__VA_ARGS__, 0); break; \ - case 1: result = func_name(__VA_ARGS__, 1); break; \ - case 2: result = func_name(__VA_ARGS__, 2); break; \ - case 3: result = func_name(__VA_ARGS__, 3); break; \ - case 4: result = func_name(__VA_ARGS__, 4); break; \ - case 5: result = func_name(__VA_ARGS__, 5); break; \ - case 6: result = func_name(__VA_ARGS__, 6); break; \ - case 7: result = func_name(__VA_ARGS__, 7); break; \ - default: result = default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_16_(func_name, result, default_case, imm, ...) \ - do { \ - switch(imm) { \ - case 0: result = func_name(__VA_ARGS__, 0); break; \ - case 1: result = func_name(__VA_ARGS__, 1); break; \ - case 2: result = func_name(__VA_ARGS__, 2); break; \ - case 3: result = func_name(__VA_ARGS__, 3); break; \ - case 4: result = func_name(__VA_ARGS__, 4); break; \ - case 5: result = func_name(__VA_ARGS__, 5); break; \ - case 6: result = func_name(__VA_ARGS__, 6); break; \ - case 7: result = func_name(__VA_ARGS__, 7); break; \ - case 8: result = func_name(__VA_ARGS__, 8); break; \ - case 9: result = func_name(__VA_ARGS__, 9); break; \ - case 10: result = func_name(__VA_ARGS__, 10); break; \ - case 11: result = func_name(__VA_ARGS__, 11); break; \ - case 12: result = func_name(__VA_ARGS__, 12); break; \ - case 13: result = func_name(__VA_ARGS__, 13); break; \ - case 14: result = func_name(__VA_ARGS__, 14); break; \ - case 15: result = func_name(__VA_ARGS__, 15); break; \ - default: result = default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_32_(func_name, result, default_case, imm, ...) 
\ - do { \ - switch(imm) { \ - case 0: result = func_name(__VA_ARGS__, 0); break; \ - case 1: result = func_name(__VA_ARGS__, 1); break; \ - case 2: result = func_name(__VA_ARGS__, 2); break; \ - case 3: result = func_name(__VA_ARGS__, 3); break; \ - case 4: result = func_name(__VA_ARGS__, 4); break; \ - case 5: result = func_name(__VA_ARGS__, 5); break; \ - case 6: result = func_name(__VA_ARGS__, 6); break; \ - case 7: result = func_name(__VA_ARGS__, 7); break; \ - case 8: result = func_name(__VA_ARGS__, 8); break; \ - case 9: result = func_name(__VA_ARGS__, 9); break; \ - case 10: result = func_name(__VA_ARGS__, 10); break; \ - case 11: result = func_name(__VA_ARGS__, 11); break; \ - case 12: result = func_name(__VA_ARGS__, 12); break; \ - case 13: result = func_name(__VA_ARGS__, 13); break; \ - case 14: result = func_name(__VA_ARGS__, 14); break; \ - case 15: result = func_name(__VA_ARGS__, 15); break; \ - case 16: result = func_name(__VA_ARGS__, 16); break; \ - case 17: result = func_name(__VA_ARGS__, 17); break; \ - case 18: result = func_name(__VA_ARGS__, 18); break; \ - case 19: result = func_name(__VA_ARGS__, 19); break; \ - case 20: result = func_name(__VA_ARGS__, 20); break; \ - case 21: result = func_name(__VA_ARGS__, 21); break; \ - case 22: result = func_name(__VA_ARGS__, 22); break; \ - case 23: result = func_name(__VA_ARGS__, 23); break; \ - case 24: result = func_name(__VA_ARGS__, 24); break; \ - case 25: result = func_name(__VA_ARGS__, 25); break; \ - case 26: result = func_name(__VA_ARGS__, 26); break; \ - case 27: result = func_name(__VA_ARGS__, 27); break; \ - case 28: result = func_name(__VA_ARGS__, 28); break; \ - case 29: result = func_name(__VA_ARGS__, 29); break; \ - case 30: result = func_name(__VA_ARGS__, 30); break; \ - case 31: result = func_name(__VA_ARGS__, 31); break; \ - default: result = default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_64_(func_name, result, default_case, imm, ...) 
\ - do { \ - switch(imm) { \ - case 0: result = func_name(__VA_ARGS__, 0); break; \ - case 1: result = func_name(__VA_ARGS__, 1); break; \ - case 2: result = func_name(__VA_ARGS__, 2); break; \ - case 3: result = func_name(__VA_ARGS__, 3); break; \ - case 4: result = func_name(__VA_ARGS__, 4); break; \ - case 5: result = func_name(__VA_ARGS__, 5); break; \ - case 6: result = func_name(__VA_ARGS__, 6); break; \ - case 7: result = func_name(__VA_ARGS__, 7); break; \ - case 8: result = func_name(__VA_ARGS__, 8); break; \ - case 9: result = func_name(__VA_ARGS__, 9); break; \ - case 10: result = func_name(__VA_ARGS__, 10); break; \ - case 11: result = func_name(__VA_ARGS__, 11); break; \ - case 12: result = func_name(__VA_ARGS__, 12); break; \ - case 13: result = func_name(__VA_ARGS__, 13); break; \ - case 14: result = func_name(__VA_ARGS__, 14); break; \ - case 15: result = func_name(__VA_ARGS__, 15); break; \ - case 16: result = func_name(__VA_ARGS__, 16); break; \ - case 17: result = func_name(__VA_ARGS__, 17); break; \ - case 18: result = func_name(__VA_ARGS__, 18); break; \ - case 19: result = func_name(__VA_ARGS__, 19); break; \ - case 20: result = func_name(__VA_ARGS__, 20); break; \ - case 21: result = func_name(__VA_ARGS__, 21); break; \ - case 22: result = func_name(__VA_ARGS__, 22); break; \ - case 23: result = func_name(__VA_ARGS__, 23); break; \ - case 24: result = func_name(__VA_ARGS__, 24); break; \ - case 25: result = func_name(__VA_ARGS__, 25); break; \ - case 26: result = func_name(__VA_ARGS__, 26); break; \ - case 27: result = func_name(__VA_ARGS__, 27); break; \ - case 28: result = func_name(__VA_ARGS__, 28); break; \ - case 29: result = func_name(__VA_ARGS__, 29); break; \ - case 30: result = func_name(__VA_ARGS__, 30); break; \ - case 31: result = func_name(__VA_ARGS__, 31); break; \ - case 32: result = func_name(__VA_ARGS__, 32); break; \ - case 33: result = func_name(__VA_ARGS__, 33); break; \ - case 34: result = func_name(__VA_ARGS__, 34); break; \ - case 35: result = func_name(__VA_ARGS__, 35); break; \ - case 36: result = func_name(__VA_ARGS__, 36); break; \ - case 37: result = func_name(__VA_ARGS__, 37); break; \ - case 38: result = func_name(__VA_ARGS__, 38); break; \ - case 39: result = func_name(__VA_ARGS__, 39); break; \ - case 40: result = func_name(__VA_ARGS__, 40); break; \ - case 41: result = func_name(__VA_ARGS__, 41); break; \ - case 42: result = func_name(__VA_ARGS__, 42); break; \ - case 43: result = func_name(__VA_ARGS__, 43); break; \ - case 44: result = func_name(__VA_ARGS__, 44); break; \ - case 45: result = func_name(__VA_ARGS__, 45); break; \ - case 46: result = func_name(__VA_ARGS__, 46); break; \ - case 47: result = func_name(__VA_ARGS__, 47); break; \ - case 48: result = func_name(__VA_ARGS__, 48); break; \ - case 49: result = func_name(__VA_ARGS__, 49); break; \ - case 50: result = func_name(__VA_ARGS__, 50); break; \ - case 51: result = func_name(__VA_ARGS__, 51); break; \ - case 52: result = func_name(__VA_ARGS__, 52); break; \ - case 53: result = func_name(__VA_ARGS__, 53); break; \ - case 54: result = func_name(__VA_ARGS__, 54); break; \ - case 55: result = func_name(__VA_ARGS__, 55); break; \ - case 56: result = func_name(__VA_ARGS__, 56); break; \ - case 57: result = func_name(__VA_ARGS__, 57); break; \ - case 58: result = func_name(__VA_ARGS__, 58); break; \ - case 59: result = func_name(__VA_ARGS__, 59); break; \ - case 60: result = func_name(__VA_ARGS__, 60); break; \ - case 61: result = func_name(__VA_ARGS__, 61); break; \ - case 62: 
result = func_name(__VA_ARGS__, 62); break; \ - case 63: result = func_name(__VA_ARGS__, 63); break; \ - default: result = default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_2_NO_RESULT_(func_name, default_case, imm, ...) \ - do { \ - switch(imm) { \ - case 0: func_name(__VA_ARGS__, 0); break; \ - case 1: func_name(__VA_ARGS__, 1); break; \ - default: default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_4_NO_RESULT_(func_name, default_case, imm, ...) \ - do { \ - switch(imm) { \ - case 0: func_name(__VA_ARGS__, 0); break; \ - case 1: func_name(__VA_ARGS__, 1); break; \ - case 2: func_name(__VA_ARGS__, 2); break; \ - case 3: func_name(__VA_ARGS__, 3); break; \ - default: default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_8_NO_RESULT_(func_name, default_case, imm, ...) \ - do { \ - switch(imm) { \ - case 0: func_name(__VA_ARGS__, 0); break; \ - case 1: func_name(__VA_ARGS__, 1); break; \ - case 2: func_name(__VA_ARGS__, 2); break; \ - case 3: func_name(__VA_ARGS__, 3); break; \ - case 4: func_name(__VA_ARGS__, 4); break; \ - case 5: func_name(__VA_ARGS__, 5); break; \ - case 6: func_name(__VA_ARGS__, 6); break; \ - case 7: func_name(__VA_ARGS__, 7); break; \ - default: default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_16_NO_RESULT_(func_name, default_case, imm, ...) \ - do { \ - switch(imm) { \ - case 0: func_name(__VA_ARGS__, 0); break; \ - case 1: func_name(__VA_ARGS__, 1); break; \ - case 2: func_name(__VA_ARGS__, 2); break; \ - case 3: func_name(__VA_ARGS__, 3); break; \ - case 4: func_name(__VA_ARGS__, 4); break; \ - case 5: func_name(__VA_ARGS__, 5); break; \ - case 6: func_name(__VA_ARGS__, 6); break; \ - case 7: func_name(__VA_ARGS__, 7); break; \ - case 8: func_name(__VA_ARGS__, 8); break; \ - case 9: func_name(__VA_ARGS__, 9); break; \ - case 10: func_name(__VA_ARGS__, 10); break; \ - case 11: func_name(__VA_ARGS__, 11); break; \ - case 12: func_name(__VA_ARGS__, 12); break; \ - case 13: func_name(__VA_ARGS__, 13); break; \ - case 14: func_name(__VA_ARGS__, 14); break; \ - case 15: func_name(__VA_ARGS__, 15); break; \ - default: default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_32_NO_RESULT_(func_name, default_case, imm, ...) 
\ - do { \ - switch(imm) { \ - case 0: func_name(__VA_ARGS__, 0); break; \ - case 1: func_name(__VA_ARGS__, 1); break; \ - case 2: func_name(__VA_ARGS__, 2); break; \ - case 3: func_name(__VA_ARGS__, 3); break; \ - case 4: func_name(__VA_ARGS__, 4); break; \ - case 5: func_name(__VA_ARGS__, 5); break; \ - case 6: func_name(__VA_ARGS__, 6); break; \ - case 7: func_name(__VA_ARGS__, 7); break; \ - case 8: func_name(__VA_ARGS__, 8); break; \ - case 9: func_name(__VA_ARGS__, 9); break; \ - case 10: func_name(__VA_ARGS__, 10); break; \ - case 11: func_name(__VA_ARGS__, 11); break; \ - case 12: func_name(__VA_ARGS__, 12); break; \ - case 13: func_name(__VA_ARGS__, 13); break; \ - case 14: func_name(__VA_ARGS__, 14); break; \ - case 15: func_name(__VA_ARGS__, 15); break; \ - case 16: func_name(__VA_ARGS__, 16); break; \ - case 17: func_name(__VA_ARGS__, 17); break; \ - case 18: func_name(__VA_ARGS__, 18); break; \ - case 19: func_name(__VA_ARGS__, 19); break; \ - case 20: func_name(__VA_ARGS__, 20); break; \ - case 21: func_name(__VA_ARGS__, 21); break; \ - case 22: func_name(__VA_ARGS__, 22); break; \ - case 23: func_name(__VA_ARGS__, 23); break; \ - case 24: func_name(__VA_ARGS__, 24); break; \ - case 25: func_name(__VA_ARGS__, 25); break; \ - case 26: func_name(__VA_ARGS__, 26); break; \ - case 27: func_name(__VA_ARGS__, 27); break; \ - case 28: func_name(__VA_ARGS__, 28); break; \ - case 29: func_name(__VA_ARGS__, 29); break; \ - case 30: func_name(__VA_ARGS__, 30); break; \ - case 31: func_name(__VA_ARGS__, 31); break; \ - default: default_case; break; \ - } \ - } while (0) - -#define SIMDE_CONSTIFY_64_NO_RESULT_(func_name, default_case, imm, ...) \ - do { \ - switch(imm) { \ - case 0: func_name(__VA_ARGS__, 0); break; \ - case 1: func_name(__VA_ARGS__, 1); break; \ - case 2: func_name(__VA_ARGS__, 2); break; \ - case 3: func_name(__VA_ARGS__, 3); break; \ - case 4: func_name(__VA_ARGS__, 4); break; \ - case 5: func_name(__VA_ARGS__, 5); break; \ - case 6: func_name(__VA_ARGS__, 6); break; \ - case 7: func_name(__VA_ARGS__, 7); break; \ - case 8: func_name(__VA_ARGS__, 8); break; \ - case 9: func_name(__VA_ARGS__, 9); break; \ - case 10: func_name(__VA_ARGS__, 10); break; \ - case 11: func_name(__VA_ARGS__, 11); break; \ - case 12: func_name(__VA_ARGS__, 12); break; \ - case 13: func_name(__VA_ARGS__, 13); break; \ - case 14: func_name(__VA_ARGS__, 14); break; \ - case 15: func_name(__VA_ARGS__, 15); break; \ - case 16: func_name(__VA_ARGS__, 16); break; \ - case 17: func_name(__VA_ARGS__, 17); break; \ - case 18: func_name(__VA_ARGS__, 18); break; \ - case 19: func_name(__VA_ARGS__, 19); break; \ - case 20: func_name(__VA_ARGS__, 20); break; \ - case 21: func_name(__VA_ARGS__, 21); break; \ - case 22: func_name(__VA_ARGS__, 22); break; \ - case 23: func_name(__VA_ARGS__, 23); break; \ - case 24: func_name(__VA_ARGS__, 24); break; \ - case 25: func_name(__VA_ARGS__, 25); break; \ - case 26: func_name(__VA_ARGS__, 26); break; \ - case 27: func_name(__VA_ARGS__, 27); break; \ - case 28: func_name(__VA_ARGS__, 28); break; \ - case 29: func_name(__VA_ARGS__, 29); break; \ - case 30: func_name(__VA_ARGS__, 30); break; \ - case 31: func_name(__VA_ARGS__, 31); break; \ - case 32: func_name(__VA_ARGS__, 32); break; \ - case 33: func_name(__VA_ARGS__, 33); break; \ - case 34: func_name(__VA_ARGS__, 34); break; \ - case 35: func_name(__VA_ARGS__, 35); break; \ - case 36: func_name(__VA_ARGS__, 36); break; \ - case 37: func_name(__VA_ARGS__, 37); break; \ - case 38: func_name(__VA_ARGS__, 38); break; \ 
- case 39: func_name(__VA_ARGS__, 39); break; \ - case 40: func_name(__VA_ARGS__, 40); break; \ - case 41: func_name(__VA_ARGS__, 41); break; \ - case 42: func_name(__VA_ARGS__, 42); break; \ - case 43: func_name(__VA_ARGS__, 43); break; \ - case 44: func_name(__VA_ARGS__, 44); break; \ - case 45: func_name(__VA_ARGS__, 45); break; \ - case 46: func_name(__VA_ARGS__, 46); break; \ - case 47: func_name(__VA_ARGS__, 47); break; \ - case 48: func_name(__VA_ARGS__, 48); break; \ - case 49: func_name(__VA_ARGS__, 49); break; \ - case 50: func_name(__VA_ARGS__, 50); break; \ - case 51: func_name(__VA_ARGS__, 51); break; \ - case 52: func_name(__VA_ARGS__, 52); break; \ - case 53: func_name(__VA_ARGS__, 53); break; \ - case 54: func_name(__VA_ARGS__, 54); break; \ - case 55: func_name(__VA_ARGS__, 55); break; \ - case 56: func_name(__VA_ARGS__, 56); break; \ - case 57: func_name(__VA_ARGS__, 57); break; \ - case 58: func_name(__VA_ARGS__, 58); break; \ - case 59: func_name(__VA_ARGS__, 59); break; \ - case 60: func_name(__VA_ARGS__, 60); break; \ - case 61: func_name(__VA_ARGS__, 61); break; \ - case 62: func_name(__VA_ARGS__, 62); break; \ - case 63: func_name(__VA_ARGS__, 63); break; \ - default: default_case; break; \ - } \ - } while (0) - -HEDLEY_DIAGNOSTIC_POP - -#endif diff --git a/ffi-deps/simde/simde/simde-detect-clang.h b/ffi-deps/simde/simde/simde-detect-clang.h deleted file mode 100644 index 15d695a..0000000 --- a/ffi-deps/simde/simde/simde-detect-clang.h +++ /dev/null @@ -1,125 +0,0 @@ -/* Detect Clang Version - * Created by Evan Nemerson - * - * To the extent possible under law, the author(s) have dedicated all - * copyright and related and neighboring rights to this software to - * the public domain worldwide. This software is distributed without - * any warranty. - * - * For details, see . - * SPDX-License-Identifier: CC0-1.0 - */ - -/* This file was originally part of SIMDe - * (). You're free to do with it as - * you please, but I do have a few small requests: - * - * * If you make improvements, please submit them back to SIMDe - * (at ) so others can - * benefit from them. - * * Please keep a link to SIMDe intact so people know where to submit - * improvements. - * * If you expose it publicly, please change the SIMDE_ prefix to - * something specific to your project. - * - * The version numbers clang exposes (in the ___clang_major__, - * __clang_minor__, and __clang_patchlevel__ macros) are unreliable. - * Vendors such as Apple will define these values to their version - * numbers; for example, "Apple Clang 4.0" is really clang 3.1, but - * __clang_major__ and __clang_minor__ are defined to 4 and 0 - * respectively, instead of 3 and 1. - * - * The solution is *usually* to use clang's feature detection macros - * () - * to determine if the feature you're interested in is available. This - * generally works well, and it should probably be the first thing you - * try. Unfortunately, it's not possible to check for everything. In - * particular, compiler bugs. - * - * This file just uses the feature checking macros to detect features - * added in specific versions of clang to identify which version of - * clang the compiler is based on. - * - * Right now it only goes back to 3.6, but I'm happy to accept patches - * to go back further. And, of course, newer versions are welcome if - * they're not already present, and if you find a way to detect a point - * release that would be great, too! 
- */ - -#if !defined(SIMDE_DETECT_CLANG_H) -#define SIMDE_DETECT_CLANG_H 1 - -/* Attempt to detect the upstream clang version number. I usually only - * worry about major version numbers (at least for 4.0+), but if you - * need more resolution I'm happy to accept patches that are able to - * detect minor versions as well. That said, you'll probably have a - * hard time with detection since AFAIK most minor releases don't add - * anything we can detect. Updated based on - * https://github.com/google/highway/blob/438c705a295176b96a50336527bb3e7ea365ffac/hwy/detect_compiler_arch.h#L73 - * - would welcome patches/updates there as well. - */ - -#if defined(__clang__) && !defined(SIMDE_DETECT_CLANG_VERSION) -# if __has_attribute(nouwtable) // no new warnings in 16.0 -# define SIMDE_DETECT_CLANG_VERSION 160000 -# elif __has_warning("-Warray-parameter") -# define SIMDE_DETECT_CLANG_VERSION 150000 -# elif __has_warning("-Wbitwise-instead-of-logical") -# define SIMDE_DETECT_CLANG_VERSION 140000 -# elif __has_warning("-Wwaix-compat") -# define SIMDE_DETECT_CLANG_VERSION 130000 -# elif __has_warning("-Wformat-insufficient-args") -# define SIMDE_DETECT_CLANG_VERSION 120000 -# elif __has_warning("-Wimplicit-const-int-float-conversion") -# define SIMDE_DETECT_CLANG_VERSION 110000 -# elif __has_warning("-Wmisleading-indentation") -# define SIMDE_DETECT_CLANG_VERSION 100000 -# elif defined(__FILE_NAME__) -# define SIMDE_DETECT_CLANG_VERSION 90000 -# elif __has_warning("-Wextra-semi-stmt") || __has_builtin(__builtin_rotateleft32) -# define SIMDE_DETECT_CLANG_VERSION 80000 -// For reasons unknown, XCode 10.3 (Apple LLVM version 10.0.1) is apparently -// based on Clang 7, but does not support the warning we test. -// See https://en.wikipedia.org/wiki/Xcode#Toolchain_versions and -// https://trac.macports.org/wiki/XcodeVersionInfo. -# elif __has_warning("-Wc++98-compat-extra-semi") || \ - (defined(__apple_build_version__) && __apple_build_version__ >= 10010000) -# define SIMDE_DETECT_CLANG_VERSION 70000 -# elif __has_warning("-Wpragma-pack") -# define SIMDE_DETECT_CLANG_VERSION 60000 -# elif __has_warning("-Wbitfield-enum-conversion") -# define SIMDE_DETECT_CLANG_VERSION 50000 -# elif __has_attribute(diagnose_if) -# define SIMDE_DETECT_CLANG_VERSION 40000 -# elif __has_warning("-Wcomma") -# define SIMDE_DETECT_CLANG_VERSION 39000 -# elif __has_warning("-Wdouble-promotion") -# define SIMDE_DETECT_CLANG_VERSION 38000 -# elif __has_warning("-Wshift-negative-value") -# define SIMDE_DETECT_CLANG_VERSION 37000 -# elif __has_warning("-Wambiguous-ellipsis") -# define SIMDE_DETECT_CLANG_VERSION 36000 -# else -# define SIMDE_DETECT_CLANG_VERSION 1 -# endif -#endif /* defined(__clang__) && !defined(SIMDE_DETECT_CLANG_VERSION) */ - -/* The SIMDE_DETECT_CLANG_VERSION_CHECK macro is pretty - * straightforward; it returns true if the compiler is a derivative - * of clang >= the specified version. - * - * Since this file is often (primarily?) useful for working around bugs - * it is also helpful to have a macro which returns true if only if the - * compiler is a version of clang *older* than the specified version to - * make it a bit easier to ifdef regions to add code for older versions, - * such as pragmas to disable a specific warning. 
*/ - -#if defined(SIMDE_DETECT_CLANG_VERSION) -# define SIMDE_DETECT_CLANG_VERSION_CHECK(major, minor, revision) (SIMDE_DETECT_CLANG_VERSION >= ((major * 10000) + (minor * 1000) + (revision))) -# define SIMDE_DETECT_CLANG_VERSION_NOT(major, minor, revision) (SIMDE_DETECT_CLANG_VERSION < ((major * 10000) + (minor * 1000) + (revision))) -#else -# define SIMDE_DETECT_CLANG_VERSION_CHECK(major, minor, revision) (0) -# define SIMDE_DETECT_CLANG_VERSION_NOT(major, minor, revision) (0) -#endif - -#endif /* !defined(SIMDE_DETECT_CLANG_H) */ diff --git a/ffi-deps/simde/simde/simde-diagnostic.h b/ffi-deps/simde/simde/simde-diagnostic.h deleted file mode 100644 index a525d3a..0000000 --- a/ffi-deps/simde/simde/simde-diagnostic.h +++ /dev/null @@ -1,456 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2017-2020 Evan Nemerson - */ - -/* SIMDe targets a very wide range of standards and compilers, and our - * goal is to compile cleanly even with extremely aggressive warnings - * (i.e., -Weverything in clang, -Wextra in GCC, /W4 for MSVC, etc.) - * treated as errors. - * - * While our preference is to resolve the underlying issue a given - * diagnostic is warning us about, sometimes that's not possible. - * Fixing a warning in one compiler may cause problems in another. - * Sometimes a warning doesn't really apply to us (false positives), - * and sometimes adhering to a warning would mean dropping a feature - * we *know* the compiler supports since we have tested specifically - * for the compiler or feature. - * - * When practical, warnings are only disabled for specific code. For - * a list of warnings which are enabled by default in all SIMDe code, - * see SIMDE_DISABLE_UNWANTED_DIAGNOSTICS. Note that we restore the - * warning stack when SIMDe is done parsing, so code which includes - * SIMDe is not deprived of these warnings. - */ - -#if !defined(SIMDE_DIAGNOSTIC_H) -#define SIMDE_DIAGNOSTIC_H - -#include "hedley.h" -#include "simde-detect-clang.h" -#include "simde-arch.h" - -/* This is only to help us implement functions like _mm_undefined_ps. 
*/ -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - #undef SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ -#endif -#if HEDLEY_HAS_WARNING("-Wuninitialized") - #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("clang diagnostic ignored \"-Wuninitialized\"") -#elif HEDLEY_GCC_VERSION_CHECK(4,2,0) - #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("GCC diagnostic ignored \"-Wuninitialized\"") -#elif HEDLEY_PGI_VERSION_CHECK(19,10,0) - #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("diag_suppress 549") -#elif HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) && defined(__cplusplus) - #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("error_messages(off,SEC_UNINITIALIZED_MEM_READ,SEC_UNDEFINED_RETURN_VALUE,unassigned)") -#elif HEDLEY_SUNPRO_VERSION_CHECK(5,14,0) - #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("error_messages(off,SEC_UNINITIALIZED_MEM_READ,SEC_UNDEFINED_RETURN_VALUE)") -#elif HEDLEY_SUNPRO_VERSION_CHECK(5,12,0) && defined(__cplusplus) - #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("error_messages(off,unassigned)") -#elif \ - HEDLEY_TI_VERSION_CHECK(16,9,9) || \ - HEDLEY_TI_CL6X_VERSION_CHECK(8,0,0) || \ - HEDLEY_TI_CL7X_VERSION_CHECK(1,2,0) || \ - HEDLEY_TI_CLPRU_VERSION_CHECK(2,3,2) - #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("diag_suppress 551") -#elif HEDLEY_INTEL_VERSION_CHECK(13,0,0) - #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ _Pragma("warning(disable:592)") -#elif HEDLEY_MSVC_VERSION_CHECK(19,0,0) && !defined(__MSVC_RUNTIME_CHECKS) - #define SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ __pragma(warning(disable:4700)) -#endif - -/* GCC emits a lot of "notes" about the ABI being different for things - * in newer versions of GCC. We don't really care because all our - * functions are inlined and don't generate ABI. */ -#if HEDLEY_GCC_VERSION_CHECK(7,0,0) - #define SIMDE_DIAGNOSTIC_DISABLE_PSABI_ _Pragma("GCC diagnostic ignored \"-Wpsabi\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_PSABI_ -#endif - -/* Since MMX uses x87 FP registers, you're supposed to call _mm_empty() - * after each MMX function before any floating point instructions. - * Some compilers warn about functions which use MMX functions but - * don't call _mm_empty(). However, since SIMDe is implementing the - * MMX API we shouldn't be calling _mm_empty(); we leave it to the - * caller to invoke simde_mm_empty(). */ -#if HEDLEY_INTEL_VERSION_CHECK(19,0,0) - #define SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_ _Pragma("warning(disable:13200 13203)") -#elif defined(HEDLEY_MSVC_VERSION) - #define SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_ __pragma(warning(disable:4799)) -#else - #define SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_ -#endif - -/* Intel is pushing people to use OpenMP SIMD instead of Cilk+, so they - * emit a diagnostic if you use #pragma simd instead of - * #pragma omp simd. SIMDe supports OpenMP SIMD, you just need to - * compile with -qopenmp or -qopenmp-simd and define - * SIMDE_ENABLE_OPENMP. Cilk+ is just a fallback. */ -#if HEDLEY_INTEL_VERSION_CHECK(18,0,0) - #define SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_ _Pragma("warning(disable:3948)") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_ -#endif - -/* MSVC emits a diagnostic when we call a function (like - * simde_mm_set_epi32) while initializing a struct. We currently do - * this a *lot* in the tests.
*/ -#if \ - defined(HEDLEY_MSVC_VERSION) - #define SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_ __pragma(warning(disable:4204)) -#else - #define SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_ -#endif - -/* This warning needs a lot of work. It is triggered if all you do is - * pass the value to memcpy/__builtin_memcpy, or if you initialize a - * member of the union, even if that member takes up the entire union. - * Last tested with clang-10, hopefully things will improve in the - * future; if clang fixes this I'd love to enable it. */ -#if \ - HEDLEY_HAS_WARNING("-Wconditional-uninitialized") - #define SIMDE_DIAGNOSTIC_DISABLE_CONDITIONAL_UNINITIALIZED_ _Pragma("clang diagnostic ignored \"-Wconditional-uninitialized\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_CONDITIONAL_UNINITIALIZED_ -#endif - -/* This warning is meant to catch things like `0.3 + 0.4 == 0.7`, which - * will be false. However, SIMDe uses these operations exclusively - * for things like _mm_cmpeq_ps, for which we really do want to check - * for equality (or inequality). - * - * If someone wants to put together a SIMDE_FLOAT_EQUAL(a, op, b) macro - * which just wraps a check in some code to disable this diagnostic I'd - * be happy to accept it. */ -#if \ - HEDLEY_HAS_WARNING("-Wfloat-equal") || \ - HEDLEY_GCC_VERSION_CHECK(3,0,0) - #define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ _Pragma("GCC diagnostic ignored \"-Wfloat-equal\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ -#endif - -/* This is because we use HEDLEY_STATIC_ASSERT for static assertions. - * If Hedley can't find an implementation it will preprocess to - * nothing, which means there will be a trailing semi-colon. */ -#if HEDLEY_HAS_WARNING("-Wextra-semi") - #define SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_ _Pragma("clang diagnostic ignored \"-Wextra-semi\"") -#elif HEDLEY_GCC_VERSION_CHECK(8,1,0) && defined(__cplusplus) - #define SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_ _Pragma("GCC diagnostic ignored \"-Wextra-semi\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_ -#endif - -/* We do use a few variadic macros, which technically aren't available - * until C99 and C++11, but every compiler I'm aware of has supported - * them for much longer. That said, usage is isolated to the test - * suite and compilers known to support them. */ -#if HEDLEY_HAS_WARNING("-Wvariadic-macros") || HEDLEY_GCC_VERSION_CHECK(4,0,0) - #if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") - #define SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_ \ - _Pragma("clang diagnostic ignored \"-Wvariadic-macros\"") \ - _Pragma("clang diagnostic ignored \"-Wc++98-compat-pedantic\"") - #else - #define SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_ _Pragma("GCC diagnostic ignored \"-Wvariadic-macros\"") - #endif -#else - #define SIMDE_DIAGNOSTIC_DISABLE_VARIADIC_MACROS_ -#endif - -/* emscripten requires us to use a __wasm_unimplemented_simd128__ macro - * before we can access certain SIMD intrinsics, but this diagnostic - * warns about it being a reserved name. It is a reserved name, but - * it's reserved for the compiler and we are using it to convey - * information to the compiler. - * - * This is also used when enabling native aliases since we don't get to - * choose the macro names.
*/ -#if HEDLEY_HAS_WARNING("-Wreserved-id-macro") - #define SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_MACRO_ _Pragma("clang diagnostic ignored \"-Wreserved-id-macro\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_MACRO_ -#endif - -/* Similar to above; types like simde__m128i are reserved due to the - * double underscore, but we didn't choose them, Intel did. */ -#if HEDLEY_HAS_WARNING("-Wreserved-identifier") - #define SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_ _Pragma("clang diagnostic ignored \"-Wreserved-identifier\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_ -#endif - -/* clang 3.8 warns about the packed attribute being unnecessary when - * used in the _mm_loadu_* functions. That *may* be true for version - * 3.8, but for later versions it is crucial in order to make unaligned - * access safe. */ -#if HEDLEY_HAS_WARNING("-Wpacked") - #define SIMDE_DIAGNOSTIC_DISABLE_PACKED_ _Pragma("clang diagnostic ignored \"-Wpacked\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_PACKED_ -#endif - -/* Triggered when assigning a float to a double implicitly. We use - * explicit casts in SIMDe, this is only used in the test suite. */ -#if HEDLEY_HAS_WARNING("-Wdouble-promotion") - #define SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_ _Pragma("clang diagnostic ignored \"-Wdouble-promotion\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_ -#endif - -/* Several compilers treat conformant array parameters as VLAs. We - * test to make sure we're in C mode (C++ doesn't support CAPs), and - * that the version of the standard supports CAPs. We also reject - * some buggy compilers like MSVC (the logic is in Hedley if you want - * to take a look), but with certain warnings enabled some compilers - * still like to emit a diagnostic. */ -#if HEDLEY_HAS_WARNING("-Wvla") - #define SIMDE_DIAGNOSTIC_DISABLE_VLA_ _Pragma("clang diagnostic ignored \"-Wvla\"") -#elif HEDLEY_GCC_VERSION_CHECK(4,3,0) - #define SIMDE_DIAGNOSTIC_DISABLE_VLA_ _Pragma("GCC diagnostic ignored \"-Wvla\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_VLA_ -#endif - -/* If you add an unused attribute to a function and don't use it, clang - * may emit this. 
*/ -#if HEDLEY_HAS_WARNING("-Wused-but-marked-unused") - #define SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ _Pragma("clang diagnostic ignored \"-Wused-but-marked-unused\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ -#endif - -#if HEDLEY_HAS_WARNING("-Wpass-failed") - #define SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_ _Pragma("clang diagnostic ignored \"-Wpass-failed\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_ -#endif - -#if HEDLEY_HAS_WARNING("-Wpadded") - #define SIMDE_DIAGNOSTIC_DISABLE_PADDED_ _Pragma("clang diagnostic ignored \"-Wpadded\"") -#elif HEDLEY_MSVC_VERSION_CHECK(19,0,0) /* Likely goes back further */ - #define SIMDE_DIAGNOSTIC_DISABLE_PADDED_ __pragma(warning(disable:4324)) -#else - #define SIMDE_DIAGNOSTIC_DISABLE_PADDED_ -#endif - -#if HEDLEY_HAS_WARNING("-Wzero-as-null-pointer-constant") - #define SIMDE_DIAGNOSTIC_DISABLE_ZERO_AS_NULL_POINTER_CONSTANT_ _Pragma("clang diagnostic ignored \"-Wzero-as-null-pointer-constant\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_ZERO_AS_NULL_POINTER_CONSTANT_ -#endif - -#if HEDLEY_HAS_WARNING("-Wold-style-cast") - #define SIMDE_DIAGNOSTIC_DISABLE_OLD_STYLE_CAST_ _Pragma("clang diagnostic ignored \"-Wold-style-cast\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_OLD_STYLE_CAST_ -#endif - -#if HEDLEY_HAS_WARNING("-Wcast-function-type") || HEDLEY_GCC_VERSION_CHECK(8,0,0) - #define SIMDE_DIAGNOSTIC_DISABLE_CAST_FUNCTION_TYPE_ _Pragma("GCC diagnostic ignored \"-Wcast-function-type\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_CAST_FUNCTION_TYPE_ -#endif - -/* clang will emit this warning when we use C99 extensions whan not in - * C99 mode, even though it does support this. In such cases we check - * the compiler and version first, so we know it's not a problem. */ -#if HEDLEY_HAS_WARNING("-Wc99-extensions") - #define SIMDE_DIAGNOSTIC_DISABLE_C99_EXTENSIONS_ _Pragma("clang diagnostic ignored \"-Wc99-extensions\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_C99_EXTENSIONS_ -#endif - -/* Similar problm as above; we rely on some basic C99 support, but clang - * has started warning obut this even in C17 mode with -Weverything. */ -#if HEDLEY_HAS_WARNING("-Wdeclaration-after-statement") - #define SIMDE_DIAGNOSTIC_DISABLE_DECLARATION_AFTER_STATEMENT_ _Pragma("clang diagnostic ignored \"-Wdeclaration-after-statement\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_DECLARATION_AFTER_STATEMENT_ -#endif - -/* https://github.com/simd-everywhere/simde/issues/277 */ -#if defined(HEDLEY_GCC_VERSION) && HEDLEY_GCC_VERSION_CHECK(4,6,0) && !HEDLEY_GCC_VERSION_CHECK(6,4,0) && defined(__cplusplus) - #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE_ _Pragma("GCC diagnostic ignored \"-Wunused-but-set-variable\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE_ -#endif - -/* This is the warning that you normally define _CRT_SECURE_NO_WARNINGS - * to silence, but you have to do that before including anything and - * that would require reordering includes. */ -#if defined(_MSC_VER) - #define SIMDE_DIAGNOSTIC_DISABLE_ANNEX_K_ __pragma(warning(disable:4996)) -#else - #define SIMDE_DIAGNOSTIC_DISABLE_ANNEX_K_ -#endif - -/* Some compilers, such as clang, may use `long long` for 64-bit - * integers, but `long long` triggers a diagnostic with - * -Wc++98-compat-pedantic which says 'long long' is incompatible with - * C++98. 
*/ -#if HEDLEY_HAS_WARNING("-Wc++98-compat-pedantic") - #if HEDLEY_HAS_WARNING("-Wc++11-long-long") - #define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ \ - _Pragma("clang diagnostic ignored \"-Wc++98-compat-pedantic\"") \ - _Pragma("clang diagnostic ignored \"-Wc++11-long-long\"") - #else - #define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ _Pragma("clang diagnostic ignored \"-Wc++98-compat-pedantic\"") - #endif -#else - #define SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ -#endif - -/* Some problem as above */ -#if HEDLEY_HAS_WARNING("-Wc++11-long-long") - #define SIMDE_DIAGNOSTIC_DISABLE_CPP11_LONG_LONG_ _Pragma("clang diagnostic ignored \"-Wc++11-long-long\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_CPP11_LONG_LONG_ -#endif - -/* emscripten emits this whenever stdin/stdout/stderr is used in a - * macro. */ -#if HEDLEY_HAS_WARNING("-Wdisabled-macro-expansion") - #define SIMDE_DIAGNOSTIC_DISABLE_DISABLED_MACRO_EXPANSION_ _Pragma("clang diagnostic ignored \"-Wdisabled-macro-expansion\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_DISABLED_MACRO_EXPANSION_ -#endif - -/* Clang uses C11 generic selections to implement some AltiVec - * functions, which triggers this diagnostic when not compiling - * in C11 mode */ -#if HEDLEY_HAS_WARNING("-Wc11-extensions") - #define SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_ _Pragma("clang diagnostic ignored \"-Wc11-extensions\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_ -#endif - -/* Clang sometimes triggers this warning in macros in the AltiVec and - * NEON headers, or due to missing functions. */ -#if HEDLEY_HAS_WARNING("-Wvector-conversion") - #define SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_ _Pragma("clang diagnostic ignored \"-Wvector-conversion\"") - /* For NEON, the situation with -Wvector-conversion in clang < 10 is - * bad enough that we just disable the warning altogether. On x86, - * clang has similar issues on several sse4.2+ intrinsics before 3.8. */ - #if \ - (defined(SIMDE_ARCH_ARM) && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0)) || \ - SIMDE_DETECT_CLANG_VERSION_NOT(3,8,0) - #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_ SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_ - #endif -#else - #define SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_ -#endif -#if !defined(SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_) - #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_ -#endif - -/* Prior to 5.0, clang didn't support disabling diagnostics in - * statement exprs. As a result, some macros we use don't - * properly silence warnings. 
*/ -#if SIMDE_DETECT_CLANG_VERSION_NOT(5,0,0) && HEDLEY_HAS_WARNING("-Wcast-qual") && HEDLEY_HAS_WARNING("-Wcast-align") - #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_CASTS_ _Pragma("clang diagnostic ignored \"-Wcast-qual\"") _Pragma("clang diagnostic ignored \"-Wcast-align\"") -#elif SIMDE_DETECT_CLANG_VERSION_NOT(5,0,0) && HEDLEY_HAS_WARNING("-Wcast-qual") - #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_CASTS_ _Pragma("clang diagnostic ignored \"-Wcast-qual\"") -#elif SIMDE_DETECT_CLANG_VERSION_NOT(5,0,0) && HEDLEY_HAS_WARNING("-Wcast-align") - #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_CASTS_ _Pragma("clang diagnostic ignored \"-Wcast-align\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_BUGGY_CASTS_ -#endif - -/* SLEEF triggers this a *lot* in their headers */ -#if HEDLEY_HAS_WARNING("-Wignored-qualifiers") - #define SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_ _Pragma("clang diagnostic ignored \"-Wignored-qualifiers\"") -#elif HEDLEY_GCC_VERSION_CHECK(4,3,0) - #define SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_ _Pragma("GCC diagnostic ignored \"-Wignored-qualifiers\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_ -#endif - -/* GCC emits this under some circumstances when using __int128 */ -#if HEDLEY_GCC_VERSION_CHECK(4,8,0) - #define SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_ _Pragma("GCC diagnostic ignored \"-Wpedantic\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_PEDANTIC_ -#endif - -/* MSVC doesn't like (__assume(0), code) and will warn about code being - * unreachable, but we want it there because not all compilers - * understand the unreachable macro and will complain if it is missing. - * I'm planning on adding a new macro to Hedley to handle this a bit - * more elegantly, but until then... */ -#if defined(HEDLEY_MSVC_VERSION) - #define SIMDE_DIAGNOSTIC_DISABLE_UNREACHABLE_ __pragma(warning(disable:4702)) -#elif defined(__clang__) - #define SIMDE_DIAGNOSTIC_DISABLE_UNREACHABLE_ HEDLEY_PRAGMA(clang diagnostic ignored "-Wunreachable-code") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_UNREACHABLE_ -#endif - -/* This is a false positive from GCC in a few places. */ -#if HEDLEY_GCC_VERSION_CHECK(4,7,0) - #define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ _Pragma("GCC diagnostic ignored \"-Wmaybe-uninitialized\"") -#else - #define SIMDE_DIAGNOSTIC_DISABLE_MAYBE_UNINITIAZILED_ -#endif - -#if defined(SIMDE_ENABLE_NATIVE_ALIASES) - #define SIMDE_DISABLE_UNWANTED_DIAGNOSTICS_NATIVE_ALIASES_ \ - SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_MACRO_ -#else - #define SIMDE_DISABLE_UNWANTED_DIAGNOSTICS_NATIVE_ALIASES_ -#endif - -/* Some native functions on E2K with instruction set < v6 are declared - * as deprecated due to inefficiency. Still they are more efficient - * than SIMDe implementation. So we're using them, and switching off - * these deprecation warnings. 
*/ -#if defined(HEDLEY_MCST_LCC_VERSION) -# define SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS _Pragma("diag_suppress 1215,1444") -# define SIMDE_LCC_REVERT_DEPRECATED_WARNINGS _Pragma("diag_default 1215,1444") -#else -# define SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS -# define SIMDE_LCC_REVERT_DEPRECATED_WARNINGS -#endif - -#define SIMDE_DISABLE_UNWANTED_DIAGNOSTICS \ - HEDLEY_DIAGNOSTIC_DISABLE_UNUSED_FUNCTION \ - SIMDE_DISABLE_UNWANTED_DIAGNOSTICS_NATIVE_ALIASES_ \ - SIMDE_DIAGNOSTIC_DISABLE_PSABI_ \ - SIMDE_DIAGNOSTIC_DISABLE_NO_EMMS_INSTRUCTION_ \ - SIMDE_DIAGNOSTIC_DISABLE_SIMD_PRAGMA_DEPRECATED_ \ - SIMDE_DIAGNOSTIC_DISABLE_CONDITIONAL_UNINITIALIZED_ \ - SIMDE_DIAGNOSTIC_DISABLE_DECLARATION_AFTER_STATEMENT_ \ - SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL_ \ - SIMDE_DIAGNOSTIC_DISABLE_NON_CONSTANT_AGGREGATE_INITIALIZER_ \ - SIMDE_DIAGNOSTIC_DISABLE_EXTRA_SEMI_ \ - SIMDE_DIAGNOSTIC_DISABLE_VLA_ \ - SIMDE_DIAGNOSTIC_DISABLE_USED_BUT_MARKED_UNUSED_ \ - SIMDE_DIAGNOSTIC_DISABLE_PASS_FAILED_ \ - SIMDE_DIAGNOSTIC_DISABLE_CPP98_COMPAT_PEDANTIC_ \ - SIMDE_DIAGNOSTIC_DISABLE_CPP11_LONG_LONG_ \ - SIMDE_DIAGNOSTIC_DISABLE_BUGGY_UNUSED_BUT_SET_VARIBALE_ \ - SIMDE_DIAGNOSTIC_DISABLE_BUGGY_CASTS_ \ - SIMDE_DIAGNOSTIC_DISABLE_BUGGY_VECTOR_CONVERSION_ \ - SIMDE_DIAGNOSTIC_DISABLE_RESERVED_ID_ - -#endif /* !defined(SIMDE_DIAGNOSTIC_H) */ diff --git a/ffi-deps/simde/simde/simde-f16.h b/ffi-deps/simde/simde/simde-f16.h deleted file mode 100644 index 632ef62..0000000 --- a/ffi-deps/simde/simde/simde-f16.h +++ /dev/null @@ -1,319 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2021 Evan Nemerson - */ - -#include "hedley.h" -#include "simde-common.h" -#include "simde-detect-clang.h" - -#if !defined(SIMDE_FLOAT16_H) -#define SIMDE_FLOAT16_H - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -/* Portable version which should work on pretty much any compiler. - * Obviously you can't rely on compiler support for things like - * conversion to/from 32-bit floats, so make sure you always use the - * functions and macros in this file! - * - * The portable implementations are (heavily) based on CC0 code by - * Fabian Giesen: (see also - * ). - * I have basically just modified it to get rid of some UB (lots of - * aliasing, right shifting a negative value), use fixed-width types, - * and work in C. */ -#define SIMDE_FLOAT16_API_PORTABLE 1 -/* _Float16, per C standard (TS 18661-3; - * ). 
*/ -#define SIMDE_FLOAT16_API_FLOAT16 2 -/* clang >= 6.0 supports __fp16 as an interchange format on all - * targets, but only allows you to use them for arguments and return - * values on targets which have defined an ABI. We get around the - * restriction by wrapping the __fp16 in a struct, but we can't do - * that on Arm since it would break compatibility with the NEON F16 - * functions. */ -#define SIMDE_FLOAT16_API_FP16_NO_ABI 3 -/* This is basically __fp16 as specified by Arm, where arugments and - * return values are raw __fp16 values not structs. */ -#define SIMDE_FLOAT16_API_FP16 4 - -/* Choosing an implementation. This is a bit rough, but I don't have - * any ideas on how to improve it. If you do, patches are definitely - * welcome. */ -#if !defined(SIMDE_FLOAT16_API) - #if defined(__ARM_FP16_FORMAT_IEEE) && (defined(SIMDE_ARM_NEON_FP16) || defined(__ARM_FP16_ARGS)) - #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_FP16 - #elif !defined(__EMSCRIPTEN__) && !(defined(__clang__) && defined(SIMDE_ARCH_POWER)) && \ - !(defined(HEDLEY_MSVC_VERSION) && defined(__clang__)) && \ - !(defined(SIMDE_ARCH_MIPS) && defined(__clang__)) && \ - !(defined(__clang__) && defined(SIMDE_ARCH_RISCV64)) && ( \ - defined(SIMDE_X86_AVX512FP16_NATIVE) || \ - (defined(SIMDE_ARCH_X86_SSE2) && HEDLEY_GCC_VERSION_CHECK(12,0,0)) || \ - (defined(SIMDE_ARCH_AARCH64) && HEDLEY_GCC_VERSION_CHECK(7,0,0) && !defined(__cplusplus)) || \ - ((defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)) && SIMDE_DETECT_CLANG_VERSION_CHECK(15,0,0)) || \ - (!(defined(SIMDE_ARCH_X86) || defined(SIMDE_ARCH_AMD64)) && SIMDE_DETECT_CLANG_VERSION_CHECK(6,0,0))) - /* We haven't found a better way to detect this. It seems like defining - * __STDC_WANT_IEC_60559_TYPES_EXT__, then including float.h, then - * checking for defined(FLT16_MAX) should work, but both gcc and - * clang will define the constants even if _Float16 is not - * supported. Ideas welcome. */ - #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_FLOAT16 - #elif defined(__FLT16_MIN__) && \ - (defined(__clang__) && \ - (!defined(SIMDE_ARCH_AARCH64) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) \ - && !defined(SIMDE_ARCH_RISCV64)) - #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_FP16_NO_ABI - #else - #define SIMDE_FLOAT16_API SIMDE_FLOAT16_API_PORTABLE - #endif -#endif - -#if SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16 - typedef _Float16 simde_float16; - #define SIMDE_FLOAT16_IS_SCALAR 1 - #if !defined(__cplusplus) - #define SIMDE_FLOAT16_C(value) value##f16 - #else - #define SIMDE_FLOAT16_C(value) HEDLEY_STATIC_CAST(_Float16, (value)) - #endif -#elif SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16_NO_ABI - typedef struct { __fp16 value; } simde_float16; - #if defined(SIMDE_STATEMENT_EXPR_) && !defined(SIMDE_TESTS_H) - #define SIMDE_FLOAT16_C(value) (__extension__({ ((simde_float16) { HEDLEY_DIAGNOSTIC_PUSH SIMDE_DIAGNOSTIC_DISABLE_C99_EXTENSIONS_ HEDLEY_STATIC_CAST(__fp16, (value)) }); HEDLEY_DIAGNOSTIC_POP })) - #else - #define SIMDE_FLOAT16_C(value) ((simde_float16) { HEDLEY_STATIC_CAST(__fp16, (value)) }) - #define SIMDE_FLOAT16_IS_SCALAR 1 - #endif -#elif SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16 - typedef __fp16 simde_float16; - #define SIMDE_FLOAT16_IS_SCALAR 1 - #define SIMDE_FLOAT16_C(value) HEDLEY_STATIC_CAST(__fp16, (value)) -#elif SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_PORTABLE - typedef struct { uint16_t value; } simde_float16; -#else - #error No 16-bit floating point API. 
-#endif - -#if \ - defined(SIMDE_VECTOR_OPS) && \ - (SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_PORTABLE) && \ - (SIMDE_FLOAT16_API != SIMDE_FLOAT16_API_FP16_NO_ABI) - #define SIMDE_FLOAT16_VECTOR -#endif - -/* Reinterpret -- you *generally* shouldn't need these, they're really - * intended for internal use. However, on x86 half-precision floats - * get stuffed into a __m128i/__m256i, so it may be useful. */ - -SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_float16_as_uint16, uint16_t, simde_float16) -SIMDE_DEFINE_CONVERSION_FUNCTION_(simde_uint16_as_float16, simde_float16, uint16_t) - -#if SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_PORTABLE - #define SIMDE_NANHF simde_uint16_as_float16(0x7E00) // a quiet Not-a-Number - #define SIMDE_INFINITYHF simde_uint16_as_float16(0x7C00) - #define SIMDE_NINFINITYHF simde_uint16_as_float16(0xFC00) -#else - #if SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16_NO_ABI - #if SIMDE_MATH_BUILTIN_LIBM(nanf16) - #define SIMDE_NANHF SIMDE_FLOAT16_C(__builtin_nanf16("")) - #elif defined(SIMDE_MATH_NAN) - #define SIMDE_NANHF SIMDE_FLOAT16_C(SIMDE_MATH_NAN) - #endif - #if SIMDE_MATH_BUILTIN_LIBM(inf16) - #define SIMDE_INFINITYHF SIMDE_FLOAT16_C(__builtin_inf16()) - #define SIMDE_NINFINITYHF SIMDE_FLOAT16_C(-__builtin_inf16()) - #else - #define SIMDE_INFINITYHF SIMDE_FLOAT16_C(SIMDE_MATH_INFINITY) - #define SIMDE_NINFINITYHF SIMDE_FLOAT16_C(-SIMDE_MATH_INFINITY) - #endif - #else - #if SIMDE_MATH_BUILTIN_LIBM(nanf16) - #define SIMDE_NANHF __builtin_nanf16("") - #elif defined(SIMDE_MATH_NAN) - #define SIMDE_NANHF SIMDE_MATH_NAN - #endif - #if SIMDE_MATH_BUILTIN_LIBM(inf16) - #define SIMDE_INFINITYHF __builtin_inf16() - #define SIMDE_NINFINITYHF -(__builtin_inf16()) - #else - #define SIMDE_INFINITYHF HEDLEY_STATIC_CAST(simde_float16, SIMDE_MATH_INFINITY) - #define SIMDE_NINFINITYHF HEDLEY_STATIC_CAST(simde_float16, -SIMDE_MATH_INFINITY) - #endif - #endif -#endif - -/* Conversion -- convert between single-precision and half-precision - * floats. */ -static HEDLEY_ALWAYS_INLINE HEDLEY_CONST -simde_float16 -simde_float16_from_float32 (simde_float32 value) { - simde_float16 res; - - #if \ - (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FLOAT16) || \ - (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16) - res = HEDLEY_STATIC_CAST(simde_float16, value); - #elif (SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16_NO_ABI) - res.value = HEDLEY_STATIC_CAST(__fp16, value); - #else - /* This code is CC0, based heavily on code by Fabian Giesen. */ - uint32_t f32u = simde_float32_as_uint32(value); - static const uint32_t f32u_infty = UINT32_C(255) << 23; - static const uint32_t f16u_max = (UINT32_C(127) + UINT32_C(16)) << 23; - static const uint32_t denorm_magic = - ((UINT32_C(127) - UINT32_C(15)) + (UINT32_C(23) - UINT32_C(10)) + UINT32_C(1)) << 23; - uint16_t f16u; - - uint32_t sign = f32u & (UINT32_C(1) << 31); - f32u ^= sign; - - /* NOTE all the integer compares in this function cast the operands - * to signed values to help compilers vectorize to SSE2, which lacks - * unsigned comparison instructions. This is fine since all - * operands are below 0x80000000 (we clear the sign bit). */ - - if (f32u > f16u_max) { /* result is Inf or NaN (all exponent bits set) */ - f16u = (f32u > f32u_infty) ? UINT32_C(0x7e00) : UINT32_C(0x7c00); /* NaN->qNaN and Inf->Inf */ - } else { /* (De)normalized number or zero */ - if (f32u < (UINT32_C(113) << 23)) { /* resulting FP16 is subnormal or zero */ - /* use a magic value to align our 10 mantissa bits at the bottom of - * the float. 
as long as FP addition is round-to-nearest-even this - * just works. */ - f32u = simde_float32_as_uint32(simde_uint32_as_float32(f32u) + simde_uint32_as_float32(denorm_magic)); - - /* and one integer subtract of the bias later, we have our final float! */ - f16u = HEDLEY_STATIC_CAST(uint16_t, f32u - denorm_magic); - } else { - uint32_t mant_odd = (f32u >> 13) & 1; - - /* update exponent, rounding bias part 1 */ - f32u += (HEDLEY_STATIC_CAST(uint32_t, 15 - 127) << 23) + UINT32_C(0xfff); - /* rounding bias part 2 */ - f32u += mant_odd; - /* take the bits! */ - f16u = HEDLEY_STATIC_CAST(uint16_t, f32u >> 13); - } - } - - f16u |= sign >> 16; - res = simde_uint16_as_float16(f16u); - #endif - - return res; -} - -static HEDLEY_ALWAYS_INLINE HEDLEY_CONST -simde_float32 -simde_float16_to_float32 (simde_float16 value) { - simde_float32 res; - - #if defined(SIMDE_FLOAT16_FLOAT16) || defined(SIMDE_FLOAT16_FP16) - res = HEDLEY_STATIC_CAST(simde_float32, value); - #else - /* This code is CC0, based heavily on code by Fabian Giesen. */ - uint16_t half = simde_float16_as_uint16(value); - const simde_float32 denorm_magic = simde_uint32_as_float32((UINT32_C(113) << 23)); - const uint32_t shifted_exp = UINT32_C(0x7c00) << 13; /* exponent mask after shift */ - uint32_t f32u; - - f32u = (half & UINT32_C(0x7fff)) << 13; /* exponent/mantissa bits */ - uint32_t exp = shifted_exp & f32u; /* just the exponent */ - f32u += (UINT32_C(127) - UINT32_C(15)) << 23; /* exponent adjust */ - - /* handle exponent special cases */ - if (exp == shifted_exp) /* Inf/NaN? */ - f32u += (UINT32_C(128) - UINT32_C(16)) << 23; /* extra exp adjust */ - else if (exp == 0) { /* Zero/Denormal? */ - f32u += (1) << 23; /* extra exp adjust */ - f32u = simde_float32_as_uint32(simde_uint32_as_float32(f32u) - denorm_magic); /* renormalize */ - } - - f32u |= (half & UINT32_C(0x8000)) << 16; /* sign bit */ - res = simde_uint32_as_float32(f32u); - #endif - - return res; -} - -#ifdef SIMDE_FLOAT16_C - #define SIMDE_FLOAT16_VALUE(value) SIMDE_FLOAT16_C(value) -#else - #define SIMDE_FLOAT16_VALUE(value) simde_float16_from_float32(SIMDE_FLOAT32_C(value)) -#endif - -#if !defined(simde_isinfhf) && defined(simde_math_isinff) - #define simde_isinfhf(a) simde_math_isinff(simde_float16_to_float32(a)) -#endif -#if !defined(simde_isnanhf) && defined(simde_math_isnanf) - #define simde_isnanhf(a) simde_math_isnanf(simde_float16_to_float32(a)) -#endif -#if !defined(simde_isnormalhf) && defined(simde_math_isnormalf) - #define simde_isnormalhf(a) simde_math_isnormalf(simde_float16_to_float32(a)) -#endif -#if !defined(simde_issubnormalhf) && defined(simde_math_issubnormalf) - #define simde_issubnormalhf(a) simde_math_issubnormalf(simde_float16_to_float32(a)) -#endif - -#define simde_fpclassifyhf(a) simde_math_fpclassifyf(simde_float16_to_float32(a)) - -static HEDLEY_INLINE -uint8_t -simde_fpclasshf(simde_float16 v, const int imm8) { - uint16_t bits = simde_float16_as_uint16(v); - uint8_t negative = (bits >> 15) & 1; - uint16_t const ExpMask = 0x7C00; // [14:10] - uint16_t const MantMask = 0x03FF; // [9:0] - uint8_t exponent_all_ones = ((bits & ExpMask) == ExpMask); - uint8_t exponent_all_zeros = ((bits & ExpMask) == 0); - uint8_t mantissa_all_zeros = ((bits & MantMask) == 0); - uint8_t zero = exponent_all_zeros & mantissa_all_zeros; - uint8_t signaling_bit = (bits >> 9) & 1; - - uint8_t result = 0; - uint8_t snan = exponent_all_ones & (!mantissa_all_zeros) & (!signaling_bit); - uint8_t qnan = exponent_all_ones & (!mantissa_all_zeros) & signaling_bit; - uint8_t 
positive_zero = (!negative) & zero; - uint8_t negative_zero = negative & zero; - uint8_t positive_infinity = (!negative) & exponent_all_ones & mantissa_all_zeros; - uint8_t negative_infinity = negative & exponent_all_ones & mantissa_all_zeros; - uint8_t denormal = exponent_all_zeros & (!mantissa_all_zeros); - uint8_t finite_negative = negative & (!exponent_all_ones) & (!zero); - result = (((imm8 >> 0) & qnan) | \ - ((imm8 >> 1) & positive_zero) | \ - ((imm8 >> 2) & negative_zero) | \ - ((imm8 >> 3) & positive_infinity) | \ - ((imm8 >> 4) & negative_infinity) | \ - ((imm8 >> 5) & denormal) | \ - ((imm8 >> 6) & finite_negative) | \ - ((imm8 >> 7) & snan)); - return result; -} - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_FLOAT16_H) */ diff --git a/ffi-deps/simde/simde/simde-features.h b/ffi-deps/simde/simde/simde-features.h deleted file mode 100644 index 622d129..0000000 --- a/ffi-deps/simde/simde/simde-features.h +++ /dev/null @@ -1,752 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - */ - -/* simde-arch.h is used to determine which features are available according - to the compiler. 
However, we want to make it possible to forcibly enable - or disable APIs */ - -#if !defined(SIMDE_FEATURES_H) -#define SIMDE_FEATURES_H - -#include "simde-arch.h" -#include "simde-diagnostic.h" - -#if !defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_SVML_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_SVML) - #define SIMDE_X86_SVML_NATIVE - #endif -#endif - -#if !defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) && !defined(SIMDE_X86_AVX512VP2INTERSECT_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512VP2INTERSECT) - #define SIMDE_X86_AVX512VP2INTERSECT_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && !defined(SIMDE_X86_AVX512VPOPCNTDQ_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512VPOPCNTDQ) - #define SIMDE_X86_AVX512VPOPCNTDQ_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512BITALG_NATIVE) && !defined(SIMDE_X86_AVX512BITALG_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512BITALG) - #define SIMDE_X86_AVX512BITALG_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512BITALG_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512VBMI_NATIVE) && !defined(SIMDE_X86_AVX512VBMI_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512VBMI) - #define SIMDE_X86_AVX512VBMI_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512VBMI_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512VBMI2_NATIVE) && !defined(SIMDE_X86_AVX512VBMI2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512VBMI2) - #define SIMDE_X86_AVX512VBMI2_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512VBMI2_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512VNNI_NATIVE) && !defined(SIMDE_X86_AVX512VNNI_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512VNNI) - #define SIMDE_X86_AVX512VNNI_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512VNNI_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX5124VNNIW_NATIVE) && !defined(SIMDE_X86_AVX5124VNNIW_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX5124VNNIW) - #define SIMDE_X86_AVX5124VNNIW_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX5124VNNIW_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512CD_NATIVE) && !defined(SIMDE_X86_AVX512CD_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512CD) - #define SIMDE_X86_AVX512CD_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512CD_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512DQ_NATIVE) && !defined(SIMDE_X86_AVX512DQ_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512DQ) - #define SIMDE_X86_AVX512DQ_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512VL_NATIVE) && 
!defined(SIMDE_X86_AVX512VL_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512VL) - #define SIMDE_X86_AVX512VL_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_X86_AVX512BW_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512BW) - #define SIMDE_X86_AVX512BW_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512FP16_NATIVE) && !defined(SIMDE_X86_AVX512FP16_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512FP16) - #define SIMDE_X86_AVX512FP16_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512BW_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512BF16_NATIVE) && !defined(SIMDE_X86_AVX512BF16_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512BF16) - #define SIMDE_X86_AVX512BF16_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512BF16_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_X86_AVX512F_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX512F) - #define SIMDE_X86_AVX512F_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_X86_AVX2_NATIVE) - #define SIMDE_X86_AVX2_NATIVE -#endif - -#if !defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_X86_FMA_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_FMA) - #define SIMDE_X86_FMA_NATIVE - #endif -#endif -#if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_X86_AVX_NATIVE) - #define SIMDE_X86_AVX_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_X86_AVX2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX2) - #define SIMDE_X86_AVX2_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_X86_AVX_NATIVE) - #define SIMDE_X86_AVX_NATIVE -#endif - -#if !defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_X86_AVX_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AVX) - #define SIMDE_X86_AVX_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_X86_SSE4_2_NATIVE) - #define SIMDE_X86_SSE4_2_NATIVE -#endif - -#if !defined(SIMDE_X86_XOP_NATIVE) && !defined(SIMDE_X86_XOP_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_XOP) - #define SIMDE_X86_XOP_NATIVE - #endif -#endif -#if defined(SIMDE_X86_XOP_NATIVE) && !defined(SIMDE_X86_SSE4_2_NATIVE) - #define SIMDE_X86_SSE4_2_NATIVE -#endif - -#if !defined(SIMDE_X86_SSE4_2_NATIVE) && !defined(SIMDE_X86_SSE4_2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_SSE4_2) - #define SIMDE_X86_SSE4_2_NATIVE - #endif -#endif -#if defined(SIMDE_X86_SSE4_2_NATIVE) && !defined(SIMDE_X86_SSE4_1_NATIVE) - #define SIMDE_X86_SSE4_1_NATIVE -#endif - -#if !defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_X86_SSE4_1_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_SSE4_1) - #define SIMDE_X86_SSE4_1_NATIVE - #endif -#endif -#if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_X86_SSSE3_NATIVE) - #define SIMDE_X86_SSSE3_NATIVE -#endif - -#if !defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_X86_SSSE3_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if 
defined(SIMDE_ARCH_X86_SSSE3) - #define SIMDE_X86_SSSE3_NATIVE - #endif -#endif -#if defined(SIMDE_X86_SSSE3_NATIVE) && !defined(SIMDE_X86_SSE3_NATIVE) - #define SIMDE_X86_SSE3_NATIVE -#endif - -#if !defined(SIMDE_X86_SSE3_NATIVE) && !defined(SIMDE_X86_SSE3_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_SSE3) - #define SIMDE_X86_SSE3_NATIVE - #endif -#endif -#if defined(SIMDE_X86_SSE3_NATIVE) && !defined(SIMDE_X86_SSE2_NATIVE) - #define SIMDE_X86_SSE2_NATIVE -#endif - -#if !defined(SIMDE_X86_AES_NATIVE) && !defined(SIMDE_X86_AES_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_AES) - #define SIMDE_X86_AES_NATIVE - #endif -#endif -#if defined(SIMDE_X86_AES_NATIVE) && !defined(SIMDE_X86_SSE2_NATIVE) - #define SIMDE_X86_SSE2_NATIVE -#endif - -#if !defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_X86_SSE2_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_SSE2) - #define SIMDE_X86_SSE2_NATIVE - #endif -#endif -#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_X86_SSE_NATIVE) - #define SIMDE_X86_SSE_NATIVE -#endif - -#if !defined(SIMDE_X86_SSE_NATIVE) && !defined(SIMDE_X86_SSE_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_SSE) - #define SIMDE_X86_SSE_NATIVE - #endif -#endif - -#if !defined(SIMDE_X86_MMX_NATIVE) && !defined(SIMDE_X86_MMX_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_MMX) - #define SIMDE_X86_MMX_NATIVE - #endif -#endif - -#if !defined(SIMDE_X86_GFNI_NATIVE) && !defined(SIMDE_X86_GFNI_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_GFNI) - #define SIMDE_X86_GFNI_NATIVE - #endif -#endif - -#if !defined(SIMDE_X86_PCLMUL_NATIVE) && !defined(SIMDE_X86_PCLMUL_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_PCLMUL) - #define SIMDE_X86_PCLMUL_NATIVE - #endif -#endif - -#if !defined(SIMDE_X86_VPCLMULQDQ_NATIVE) && !defined(SIMDE_X86_VPCLMULQDQ_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_VPCLMULQDQ) - #define SIMDE_X86_VPCLMULQDQ_NATIVE - #endif -#endif - -#if !defined(SIMDE_X86_F16C_NATIVE) && !defined(SIMDE_X86_F16C_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86_F16C) - #define SIMDE_X86_F16C_NATIVE - #endif -#endif - -#if !defined(SIMDE_X86_SVML_NATIVE) && !defined(SIMDE_X86_SVML_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_X86) && (defined(__INTEL_COMPILER) || (HEDLEY_MSVC_VERSION_CHECK(14, 20, 0) && !defined(__clang__))) - #define SIMDE_X86_SVML_NATIVE - #endif -#endif - -#if defined(HEDLEY_MSVC_VERSION) - #pragma warning(push) - #pragma warning(disable:4799) -#endif - -#if \ - defined(SIMDE_X86_AVX_NATIVE) || defined(SIMDE_X86_GFNI_NATIVE) || defined(SIMDE_X86_SVML_NATIVE) - #include -#elif defined(SIMDE_X86_SSE4_2_NATIVE) - #include -#elif defined(SIMDE_X86_SSE4_1_NATIVE) - #include -#elif defined(SIMDE_X86_SSSE3_NATIVE) - #include -#elif defined(SIMDE_X86_SSE3_NATIVE) - #include -#elif defined(SIMDE_X86_SSE2_NATIVE) - #include -#elif defined(SIMDE_X86_SSE_NATIVE) - #include -#elif defined(SIMDE_X86_MMX_NATIVE) - #include -#endif - -#if defined(SIMDE_X86_XOP_NATIVE) - #if defined(_MSC_VER) - #include - #else - #include - #endif -#endif - -#if defined(SIMDE_X86_AES_NATIVE) - #include -#endif - -#if defined(HEDLEY_MSVC_VERSION) - #pragma warning(pop) -#endif - -#if !defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_ARM_NEON_A64V8_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_ARM_NEON) && defined(SIMDE_ARCH_AARCH64) && SIMDE_ARCH_ARM_CHECK(8,0) - 
#define SIMDE_ARM_NEON_A64V8_NATIVE - #endif -#endif -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && !defined(SIMDE_ARM_NEON_A32V8_NATIVE) - #define SIMDE_ARM_NEON_A32V8_NATIVE -#endif - -#if !defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_ARM_NEON_A32V8_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_ARM_NEON) && SIMDE_ARCH_ARM_CHECK(8,0) && (__ARM_NEON_FP & 0x02) - #define SIMDE_ARM_NEON_A32V8_NATIVE - #endif -#endif -#if defined(__ARM_ACLE) - #include -#endif -#if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define SIMDE_ARM_NEON_A32V7_NATIVE -#endif - -#if !defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_ARM_NEON_A32V7_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_ARM_NEON) && SIMDE_ARCH_ARM_CHECK(7,0) - #define SIMDE_ARM_NEON_A32V7_NATIVE - #endif -#endif -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #include - #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - #include - #endif -#endif - -#if !defined(SIMDE_ARM_SVE_NATIVE) && !defined(SIMDE_ARM_SVE_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_ARM_SVE) - #define SIMDE_ARM_SVE_NATIVE - #include - #endif -#endif - -#if !defined(SIMDE_WASM_SIMD128_NATIVE) && !defined(SIMDE_WASM_SIMD128_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_WASM_SIMD128) - #define SIMDE_WASM_SIMD128_NATIVE - #endif -#endif - -#if !defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) && !defined(SIMDE_WASM_RELAXED_SIMD_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_WASM_RELAXED_SIMD) - #define SIMDE_WASM_RELAXED_SIMD_NATIVE - #endif -#endif -#if defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_WASM_RELAXED_SIMD_NATIVE) - #include -#endif - -#if !defined(SIMDE_POWER_ALTIVEC_P9_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P9_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if SIMDE_ARCH_POWER_ALTIVEC_CHECK(900) - #define SIMDE_POWER_ALTIVEC_P9_NATIVE - #endif -#endif -#if defined(SIMDE_POWER_ALTIVEC_P9_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P8) - #define SIMDE_POWER_ALTIVEC_P8_NATIVE -#endif - -#if !defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P8_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if SIMDE_ARCH_POWER_ALTIVEC_CHECK(800) - #define SIMDE_POWER_ALTIVEC_P8_NATIVE - #endif -#endif -#if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P7) - #define SIMDE_POWER_ALTIVEC_P7_NATIVE -#endif - -#if !defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P7_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if SIMDE_ARCH_POWER_ALTIVEC_CHECK(700) - #define SIMDE_POWER_ALTIVEC_P7_NATIVE - #endif -#endif -#if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P6) - #define SIMDE_POWER_ALTIVEC_P6_NATIVE -#endif - -#if !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P6_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if SIMDE_ARCH_POWER_ALTIVEC_CHECK(600) - #define SIMDE_POWER_ALTIVEC_P6_NATIVE - #endif -#endif -#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P5) - #define SIMDE_POWER_ALTIVEC_P5_NATIVE -#endif - -#if !defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) && !defined(SIMDE_POWER_ALTIVEC_P5_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if SIMDE_ARCH_POWER_ALTIVEC_CHECK(500) - #define SIMDE_POWER_ALTIVEC_P5_NATIVE - #endif -#endif - -#if !defined(SIMDE_ZARCH_ZVECTOR_15_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_15_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if SIMDE_ARCH_ZARCH_CHECK(13) && defined(SIMDE_ARCH_ZARCH_ZVECTOR) - #define 
SIMDE_ZARCH_ZVECTOR_15_NATIVE - #endif -#endif - -#if !defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_14_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if SIMDE_ARCH_ZARCH_CHECK(12) && defined(SIMDE_ARCH_ZARCH_ZVECTOR) - #define SIMDE_ZARCH_ZVECTOR_14_NATIVE - #endif -#endif - -#if !defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_13_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if SIMDE_ARCH_ZARCH_CHECK(11) && defined(SIMDE_ARCH_ZARCH_ZVECTOR) - #define SIMDE_ZARCH_ZVECTOR_13_NATIVE - #endif -#endif - -#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - /* AltiVec conflicts with lots of stuff. The bool keyword conflicts - * with the bool keyword in C++ and the bool macro in C99+ (defined - * in stdbool.h). The vector keyword conflicts with std::vector in - * C++ if you are `using std;`. - * - * Luckily AltiVec allows you to use `__vector`/`__bool`/`__pixel` - * instead, but altivec.h will unconditionally define - * `vector`/`bool`/`pixel` so we need to work around that. - * - * Unfortunately this means that if your code uses AltiVec directly - * it may break. If this is the case you'll want to define - * `SIMDE_POWER_ALTIVEC_NO_UNDEF` before including SIMDe. Or, even - * better, port your code to use the double-underscore versions. */ - #if defined(bool) - #undef bool - #endif - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #include - - #if !defined(SIMDE_POWER_ALTIVEC_NO_UNDEF) - #if defined(vector) - #undef vector - #endif - #if defined(pixel) - #undef pixel - #endif - #if defined(bool) - #undef bool - #endif - #endif /* !defined(SIMDE_POWER_ALTIVEC_NO_UNDEF) */ - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - #include - #endif - - /* Use these intsead of vector/pixel/bool in SIMDe. */ - #define SIMDE_POWER_ALTIVEC_VECTOR(T) __vector T - #define SIMDE_POWER_ALTIVEC_PIXEL __pixel - #define SIMDE_POWER_ALTIVEC_BOOL __bool - - /* Re-define bool if we're using stdbool.h */ - #if !defined(__cplusplus) && defined(__bool_true_false_are_defined) && !defined(SIMDE_POWER_ALTIVEC_NO_UNDEF) - #define bool _Bool - #endif -#endif - -#if !defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) && !defined(SIMDE_MIPS_LOONGSON_MMI_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_MIPS_LOONGSON_MMI) - #define SIMDE_MIPS_LOONGSON_MMI_NATIVE 1 - #endif -#endif -#if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - #include -#endif - -#if !defined(SIMDE_MIPS_MSA_NATIVE) && !defined(SIMDE_MIPS_MSA_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_MIPS_MSA) - #define SIMDE_MIPS_MSA_NATIVE 1 - #endif -#endif -#if defined(SIMDE_MIPS_MSA_NATIVE) - #include -#endif - -/* This is used to determine whether or not to fall back on a vector - * function in an earlier ISA extensions, as well as whether - * we expected any attempts at vectorization to be fruitful or if we - * expect to always be running serial code. - * - * Note that, for some architectures (okay, *one* architecture) there - * can be a split where some types are supported for one vector length - * but others only for a shorter length. Therefore, it is possible to - * provide separate values for float/int/double types. 
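For context, the natural-vector-size machinery defined just below is consumed through the SIMDE_NATURAL_VECTOR_SIZE_LE/GE helpers; a minimal sketch of that usage pattern (illustrative only, not code from this patch):

    /* Sketch: fall back to narrower operations when the natural vector
     * size is too small, keep wide ops when it is large enough. */
    #if SIMDE_NATURAL_VECTOR_SIZE_LE(128)
      /* e.g. implement a 256-bit operation as two 128-bit halves */
    #endif
    #if SIMDE_NATURAL_VECTOR_SIZE_GE(256)
      /* e.g. keep the whole 256-bit operation in one register */
    #endif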
*/ - -#if !defined(SIMDE_NATURAL_VECTOR_SIZE) - #if defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_NATURAL_VECTOR_SIZE (512) - #elif defined(SIMDE_X86_AVX2_NATIVE) - #define SIMDE_NATURAL_VECTOR_SIZE (256) - #elif defined(SIMDE_X86_AVX_NATIVE) - #define SIMDE_NATURAL_FLOAT_VECTOR_SIZE (256) - #define SIMDE_NATURAL_INT_VECTOR_SIZE (128) - #define SIMDE_NATURAL_DOUBLE_VECTOR_SIZE (128) - #elif \ - defined(SIMDE_X86_SSE2_NATIVE) || \ - defined(SIMDE_ARM_NEON_A32V7_NATIVE) || \ - defined(SIMDE_WASM_SIMD128_NATIVE) || \ - defined(SIMDE_POWER_ALTIVEC_P5_NATIVE) || \ - defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) || \ - defined(SIMDE_MIPS_MSA_NATIVE) - #define SIMDE_NATURAL_VECTOR_SIZE (128) - #elif defined(SIMDE_X86_SSE_NATIVE) - #define SIMDE_NATURAL_FLOAT_VECTOR_SIZE (128) - #define SIMDE_NATURAL_INT_VECTOR_SIZE (64) - #define SIMDE_NATURAL_DOUBLE_VECTOR_SIZE (0) - #endif - - #if !defined(SIMDE_NATURAL_VECTOR_SIZE) - #if defined(SIMDE_NATURAL_FLOAT_VECTOR_SIZE) - #define SIMDE_NATURAL_VECTOR_SIZE SIMDE_NATURAL_FLOAT_VECTOR_SIZE - #elif defined(SIMDE_NATURAL_INT_VECTOR_SIZE) - #define SIMDE_NATURAL_VECTOR_SIZE SIMDE_NATURAL_INT_VECTOR_SIZE - #elif defined(SIMDE_NATURAL_DOUBLE_VECTOR_SIZE) - #define SIMDE_NATURAL_VECTOR_SIZE SIMDE_NATURAL_DOUBLE_VECTOR_SIZE - #else - #define SIMDE_NATURAL_VECTOR_SIZE (0) - #endif - #endif - - #if !defined(SIMDE_NATURAL_FLOAT_VECTOR_SIZE) - #define SIMDE_NATURAL_FLOAT_VECTOR_SIZE SIMDE_NATURAL_VECTOR_SIZE - #endif - #if !defined(SIMDE_NATURAL_INT_VECTOR_SIZE) - #define SIMDE_NATURAL_INT_VECTOR_SIZE SIMDE_NATURAL_VECTOR_SIZE - #endif - #if !defined(SIMDE_NATURAL_DOUBLE_VECTOR_SIZE) - #define SIMDE_NATURAL_DOUBLE_VECTOR_SIZE SIMDE_NATURAL_VECTOR_SIZE - #endif -#endif - -#define SIMDE_NATURAL_VECTOR_SIZE_LE(x) ((SIMDE_NATURAL_VECTOR_SIZE > 0) && (SIMDE_NATURAL_VECTOR_SIZE <= (x))) -#define SIMDE_NATURAL_VECTOR_SIZE_GE(x) ((SIMDE_NATURAL_VECTOR_SIZE > 0) && (SIMDE_NATURAL_VECTOR_SIZE >= (x))) -#define SIMDE_NATURAL_FLOAT_VECTOR_SIZE_LE(x) ((SIMDE_NATURAL_FLOAT_VECTOR_SIZE > 0) && (SIMDE_NATURAL_FLOAT_VECTOR_SIZE <= (x))) -#define SIMDE_NATURAL_FLOAT_VECTOR_SIZE_GE(x) ((SIMDE_NATURAL_FLOAT_VECTOR_SIZE > 0) && (SIMDE_NATURAL_FLOAT_VECTOR_SIZE >= (x))) -#define SIMDE_NATURAL_INT_VECTOR_SIZE_LE(x) ((SIMDE_NATURAL_INT_VECTOR_SIZE > 0) && (SIMDE_NATURAL_INT_VECTOR_SIZE <= (x))) -#define SIMDE_NATURAL_INT_VECTOR_SIZE_GE(x) ((SIMDE_NATURAL_INT_VECTOR_SIZE > 0) && (SIMDE_NATURAL_INT_VECTOR_SIZE >= (x))) -#define SIMDE_NATURAL_DOUBLE_VECTOR_SIZE_LE(x) ((SIMDE_NATURAL_DOUBLE_VECTOR_SIZE > 0) && (SIMDE_NATURAL_DOUBLE_VECTOR_SIZE <= (x))) -#define SIMDE_NATURAL_DOUBLE_VECTOR_SIZE_GE(x) ((SIMDE_NATURAL_DOUBLE_VECTOR_SIZE > 0) && (SIMDE_NATURAL_DOUBLE_VECTOR_SIZE >= (x))) - -/* Native aliases */ -#if defined(SIMDE_ENABLE_NATIVE_ALIASES) - #if !defined(SIMDE_X86_MMX_NATIVE) - #define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_SSE_NATIVE) - #define SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_SSE2_NATIVE) - #define SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_SSE3_NATIVE) - #define SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_SSSE3_NATIVE) - #define SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_SSE4_1_NATIVE) - #define SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_SSE4_2_NATIVE) - #define SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX_NATIVE) - #define SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES - #endif - #if 
!defined(SIMDE_X86_AVX2_NATIVE) - #define SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_FMA_NATIVE) - #define SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512F_NATIVE) - #define SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512VL_NATIVE) - #define SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512VBMI_NATIVE) - #define SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512VBMI2_NATIVE) - #define SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512BW_NATIVE) - #define SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512VNNI_NATIVE) - #define SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX5124VNNIW_NATIVE) - #define SIMDE_X86_AVX5124VNNIW_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512BF16_NATIVE) - #define SIMDE_X86_AVX512BF16_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512BITALG_NATIVE) - #define SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) - #define SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) - #define SIMDE_X86_AVX512VP2INTERSECT_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512DQ_NATIVE) - #define SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512CD_NATIVE) - #define SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AVX512FP16_NATIVE) - #define SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_GFNI_NATIVE) - #define SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_PCLMUL_NATIVE) - #define SIMDE_X86_PCLMUL_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_VPCLMULQDQ_NATIVE) - #define SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_F16C_NATIVE) - #define SIMDE_X86_F16C_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_AES_NATIVE) - #define SIMDE_X86_AES_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_X86_SVML_NATIVE) - #define SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES - #endif - - #if !defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define SIMDE_ARM_NEON_A32V7_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_ARM_NEON_A32V8_NATIVE) - #define SIMDE_ARM_NEON_A32V8_ENABLE_NATIVE_ALIASES - #endif - #if !defined(SIMDE_ARM_NEON_A64V8_NATIVE) - #define SIMDE_ARM_NEON_A64V8_ENABLE_NATIVE_ALIASES - #endif - - #if !defined(SIMDE_ARM_SVE_NATIVE) - #define SIMDE_ARM_SVE_ENABLE_NATIVE_ALIASES - #endif - - #if !defined(SIMDE_MIPS_MSA_NATIVE) - #define SIMDE_MIPS_MSA_ENABLE_NATIVE_ALIASES - #endif - - #if !defined(SIMDE_WASM_SIMD128_NATIVE) - #define SIMDE_WASM_SIMD128_ENABLE_NATIVE_ALIASES - #endif -#endif - -/* Are floating point values stored using IEEE 754? Knowing - * this at during preprocessing is a bit tricky, mostly because what - * we're curious about is how values are stored and not whether the - * implementation is fully conformant in terms of rounding, NaN - * handling, etc. - * - * For example, if you use -ffast-math or -Ofast on - * GCC or clang IEEE 754 isn't strictly followed, therefore IEE 754 - * support is not advertised (by defining __STDC_IEC_559__). 
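The *_ENABLE_NATIVE_ALIASES switches above are what let unported intrinsics code build through SIMDe on any target; a minimal sketch (the helper name is made up):

    #define SIMDE_ENABLE_NATIVE_ALIASES
    #include "simde/x86/sse2.h"

    /* On a non-x86 target the alias layer maps _mm_*()/__m128i onto
     * simde_mm_*()/simde__m128i, so this compiles unchanged. */
    static __m128i make_zero(void) {
      return _mm_setzero_si128();
    }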
- * - * However, what we care about is whether it is safe to assume that - * floating point values are stored in IEEE 754 format, in which case - * we can provide faster implementations of some functions. - * - * Luckily every vaugely modern architecture I'm aware of uses IEEE 754- - * so we just assume IEEE 754 for now. There is a test which verifies - * this, if that test fails sowewhere please let us know and we'll add - * an exception for that platform. Meanwhile, you can define - * SIMDE_NO_IEEE754_STORAGE. */ -#if !defined(SIMDE_IEEE754_STORAGE) && !defined(SIMDE_NO_IEE754_STORAGE) - #define SIMDE_IEEE754_STORAGE -#endif - -#if defined(SIMDE_ARCH_ARM_NEON_FP16) - #define SIMDE_ARM_NEON_FP16 -#endif - -#if defined(SIMDE_ARCH_ARM_NEON_BF16) - #define SIMDE_ARM_NEON_BF16 -#endif - -#if !defined(SIMDE_LOONGARCH_LASX_NATIVE) && !defined(SIMDE_LOONGARCH_LASX_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_LOONGARCH_LASX) - #define SIMDE_LOONGARCH_LASX_NATIVE - #endif -#endif - -#if !defined(SIMDE_LOONGARCH_LSX_NATIVE) && !defined(SIMDE_LOONGARCH_LSX_NO_NATIVE) && !defined(SIMDE_NO_NATIVE) - #if defined(SIMDE_ARCH_LOONGARCH_LSX) - #define SIMDE_LOONGARCH_LSX_NATIVE - #endif -#endif - -#if defined(SIMDE_LOONGARCH_LASX_NATIVE) - #include -#endif -#if defined(SIMDE_LOONGARCH_LSX_NATIVE) - #include -#endif - -#endif /* !defined(SIMDE_FEATURES_H) */ diff --git a/ffi-deps/simde/simde/simde-math.h b/ffi-deps/simde/simde/simde-math.h deleted file mode 100644 index 02de568..0000000 --- a/ffi-deps/simde/simde/simde-math.h +++ /dev/null @@ -1,2065 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2017-2020 Evan Nemerson - * 2023 Yi-Yen Chung (Copyright owned by Andes Technology) - */ - -/* Attempt to find math functions. Functions may be in , - * , compiler built-ins/intrinsics, or platform/architecture - * specific headers. In some cases, especially those not built in to - * libm, we may need to define our own implementations. */ - -#if !defined(SIMDE_MATH_H) -#define SIMDE_MATH_H 1 - -#include "hedley.h" -#include "simde-features.h" - -#include -#if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - #include -#endif - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS - -/* SLEEF support - * https://sleef.org/ - * - * If you include prior to including SIMDe, SIMDe will use - * SLEEF. You can also define SIMDE_MATH_SLEEF_ENABLE prior to - * including SIMDe to force the issue. 
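As the comment notes, SLEEF support is strictly opt-in; roughly how a consumer enables it (a sketch, assuming SLEEF is installed and linked with -lsleef):

    /* Either include SLEEF before SIMDe ... */
    #include <sleef.h>
    #include "simde/simde-math.h"

    /* ... or force it explicitly before including SIMDe: */
    /* #define SIMDE_MATH_SLEEF_ENABLE */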
- * - * Note that SLEEF does requires linking to libsleef. - * - * By default, SIMDe will use the 1 ULP functions, but if you use - * SIMDE_ACCURACY_PREFERENCE of 0 we will use up to 4 ULP. This is - * only the case for the simde_math_* functions; for code in other - * SIMDe headers which calls SLEEF directly we may use functions with - * greater error if the API we're implementing is less precise (for - * example, SVML guarantees 4 ULP, so we will generally use the 3.5 - * ULP functions from SLEEF). */ -#if !defined(SIMDE_MATH_SLEEF_DISABLE) - #if defined(__SLEEF_H__) - #define SIMDE_MATH_SLEEF_ENABLE - #endif -#endif - -#if defined(SIMDE_MATH_SLEEF_ENABLE) && !defined(__SLEEF_H__) - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_IGNORED_QUALIFIERS_ - #include - HEDLEY_DIAGNOSTIC_POP -#endif - -#if defined(SIMDE_MATH_SLEEF_ENABLE) && defined(__SLEEF_H__) - #if defined(SLEEF_VERSION_MAJOR) - #define SIMDE_MATH_SLEEF_VERSION_CHECK(major, minor, patch) (HEDLEY_VERSION_ENCODE(SLEEF_VERSION_MAJOR, SLEEF_VERSION_MINOR, SLEEF_VERSION_PATCHLEVEL) >= HEDLEY_VERSION_ENCODE(major, minor, patch)) - #else - #define SIMDE_MATH_SLEEF_VERSION_CHECK(major, minor, patch) (HEDLEY_VERSION_ENCODE(3,0,0) >= HEDLEY_VERSION_ENCODE(major, minor, patch)) - #endif -#else - #define SIMDE_MATH_SLEEF_VERSION_CHECK(major, minor, patch) (0) -#endif - -#if defined(__has_builtin) - #define SIMDE_MATH_BUILTIN_LIBM(func) __has_builtin(__builtin_##func) -#elif \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_GCC_VERSION_CHECK(4,4,0) - #define SIMDE_MATH_BUILTIN_LIBM(func) (1) -#else - #define SIMDE_MATH_BUILTIN_LIBM(func) (0) -#endif - -#if defined(HUGE_VAL) - /* Looks like or has already been included. */ - - /* The math.h from libc++ (yes, the C header from the C++ standard - * library) will define an isnan function, but not an isnan macro - * like the C standard requires. So we detect the header guards - * macro libc++ uses. */ - #if defined(isnan) || (defined(_LIBCPP_MATH_H) && !defined(_LIBCPP_CMATH)) - #define SIMDE_MATH_HAVE_MATH_H - #elif defined(__cplusplus) - #define SIMDE_MATH_HAVE_CMATH - #endif -#elif defined(__has_include) - #if defined(__cplusplus) && (__cplusplus >= 201103L) && __has_include() - #define SIMDE_MATH_HAVE_CMATH - #include - #elif __has_include() - #define SIMDE_MATH_HAVE_MATH_H - #include - #elif !defined(SIMDE_MATH_NO_LIBM) - #define SIMDE_MATH_NO_LIBM - #endif -#elif !defined(SIMDE_MATH_NO_LIBM) - #if defined(__cplusplus) && (__cplusplus >= 201103L) - #define SIMDE_MATH_HAVE_CMATH - HEDLEY_DIAGNOSTIC_PUSH - #if defined(HEDLEY_MSVC_VERSION) - /* VS 14 emits this diagnostic about noexcept being used on a - * function, which we can't do anything about. 
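SIMDE_MATH_BUILTIN_LIBM and the header probes above feed the dispatch pattern repeated for every math wrapper below: prefer the compiler builtin, then C++ <cmath>, then C <math.h>. Its shape, shown with a hypothetical my_sqrt:

    #if SIMDE_MATH_BUILTIN_LIBM(sqrt)        /* compiler builtin available */
      #define my_sqrt(v) __builtin_sqrt(v)
    #elif defined(SIMDE_MATH_HAVE_CMATH)     /* C++ <cmath> was found */
      #define my_sqrt(v) std::sqrt(v)
    #elif defined(SIMDE_MATH_HAVE_MATH_H)    /* C <math.h> was found */
      #define my_sqrt(v) sqrt(v)
    #endif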
*/ - #pragma warning(disable:4996) - #endif - #include - HEDLEY_DIAGNOSTIC_POP - #else - #define SIMDE_MATH_HAVE_MATH_H - #include - #endif -#endif - -#if !defined(SIMDE_MATH_INFINITY) - #if \ - HEDLEY_HAS_BUILTIN(__builtin_inf) || \ - HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_CRAY_VERSION_CHECK(8,1,0) - #define SIMDE_MATH_INFINITY (__builtin_inf()) - #elif defined(INFINITY) - #define SIMDE_MATH_INFINITY INFINITY - #endif -#endif - -#if !defined(SIMDE_INFINITYF) - #if \ - HEDLEY_HAS_BUILTIN(__builtin_inff) || \ - HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,0) - #define SIMDE_MATH_INFINITYF (__builtin_inff()) - #elif defined(INFINITYF) - #define SIMDE_MATH_INFINITYF INFINITYF - #elif defined(SIMDE_MATH_INFINITY) - #define SIMDE_MATH_INFINITYF HEDLEY_STATIC_CAST(float, SIMDE_MATH_INFINITY) - #endif -#endif - -#if !defined(SIMDE_MATH_NAN) - #if \ - HEDLEY_HAS_BUILTIN(__builtin_nan) || \ - HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_CRAY_VERSION_CHECK(8,1,0) || \ - HEDLEY_IBM_VERSION_CHECK(13,1,0) - #define SIMDE_MATH_NAN (__builtin_nan("")) - #elif defined(NAN) - #define SIMDE_MATH_NAN NAN - #endif -#endif - -#if !defined(SIMDE_NANF) - #if \ - HEDLEY_HAS_BUILTIN(__builtin_nanf) || \ - HEDLEY_GCC_VERSION_CHECK(3,3,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_CRAY_VERSION_CHECK(8,1,0) - #define SIMDE_MATH_NANF (__builtin_nanf("")) - #elif defined(NANF) - #define SIMDE_MATH_NANF NANF - #elif defined(SIMDE_MATH_NAN) - #define SIMDE_MATH_NANF HEDLEY_STATIC_CAST(float, SIMDE_MATH_NAN) - #endif -#endif - -#if !defined(SIMDE_MATH_PI) - #if defined(M_PI) - #define SIMDE_MATH_PI M_PI - #else - #define SIMDE_MATH_PI 3.14159265358979323846 - #endif -#endif - -#if !defined(SIMDE_MATH_PIF) - #if defined(M_PI) - #define SIMDE_MATH_PIF HEDLEY_STATIC_CAST(float, M_PI) - #else - #define SIMDE_MATH_PIF 3.14159265358979323846f - #endif -#endif - -#if !defined(SIMDE_MATH_PI_OVER_180) - #define SIMDE_MATH_PI_OVER_180 0.0174532925199432957692369076848861271344287188854172545609719144 -#endif - -#if !defined(SIMDE_MATH_PI_OVER_180F) - #define SIMDE_MATH_PI_OVER_180F 0.0174532925199432957692369076848861271344287188854172545609719144f -#endif - -#if !defined(SIMDE_MATH_180_OVER_PI) - #define SIMDE_MATH_180_OVER_PI 57.295779513082320876798154814105170332405472466564321549160243861 -#endif - -#if !defined(SIMDE_MATH_180_OVER_PIF) - #define SIMDE_MATH_180_OVER_PIF 57.295779513082320876798154814105170332405472466564321549160243861f -#endif - -#if !defined(SIMDE_MATH_FLT_MIN) - #if defined(__FLT_MIN__) - #define SIMDE_MATH_FLT_MIN __FLT_MIN__ - #else - #if !defined(FLT_MIN) - #if defined(__cplusplus) - #include - #else - #include - #endif - #endif - #define SIMDE_MATH_FLT_MIN FLT_MIN - #endif -#endif - -#if !defined(SIMDE_MATH_FLT_MAX) - #if defined(__FLT_MAX__) - #define SIMDE_MATH_FLT_MAX __FLT_MAX__ - #else - #if !defined(FLT_MAX) - #if defined(__cplusplus) - #include - #else - #include - #endif - #endif - #define SIMDE_MATH_FLT_MAX FLT_MAX - #endif -#endif - -#if !defined(SIMDE_MATH_DBL_MIN) - #if defined(__DBL_MIN__) - #define SIMDE_MATH_DBL_MIN __DBL_MIN__ - #else - #if !defined(DBL_MIN) - #if defined(__cplusplus) - #include - #else - #include - #endif - #endif - #define SIMDE_MATH_DBL_MIN 
DBL_MIN - #endif -#endif - -#if !defined(SIMDE_MATH_DBL_MAX) - #if defined(__DBL_MAX__) - #define SIMDE_MATH_DBL_MAX __DBL_MAX__ - #else - #if !defined(DBL_MAX) - #if defined(__cplusplus) - #include - #else - #include - #endif - #endif - #define SIMDE_MATH_DBL_MAX DBL_MAX - #endif -#endif - -/*** Classification macros from C99 ***/ - -#if !defined(simde_math_isinf) - #if SIMDE_MATH_BUILTIN_LIBM(isinf) - #define simde_math_isinf(v) __builtin_isinf(v) - #elif defined(isinf) || defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_isinf(v) isinf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_isinf(v) std::isinf(v) - #endif -#endif - -#if !defined(simde_math_isinff) - #if HEDLEY_HAS_BUILTIN(__builtin_isinff) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) - #define simde_math_isinff(v) __builtin_isinff(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_isinff(v) std::isinf(v) - #elif defined(simde_math_isinf) - #define simde_math_isinff(v) simde_math_isinf(HEDLEY_STATIC_CAST(double, v)) - #endif -#endif - -#if !defined(simde_math_isnan) - #if SIMDE_MATH_BUILTIN_LIBM(isnan) - #define simde_math_isnan(v) __builtin_isnan(v) - #elif defined(isnan) || defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_isnan(v) isnan(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_isnan(v) std::isnan(v) - #endif -#endif - -#if !defined(simde_math_isnanf) - #if HEDLEY_HAS_BUILTIN(__builtin_isnanf) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) - /* XL C/C++ has __builtin_isnan but not __builtin_isnanf */ - #define simde_math_isnanf(v) __builtin_isnanf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_isnanf(v) std::isnan(v) - #elif defined(simde_math_isnan) - #define simde_math_isnanf(v) simde_math_isnan(HEDLEY_STATIC_CAST(double, v)) - #endif -#endif - -#if !defined(simde_math_isnormal) - #if SIMDE_MATH_BUILTIN_LIBM(isnormal) - #define simde_math_isnormal(v) __builtin_isnormal(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_isnormal(v) isnormal(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_isnormal(v) std::isnormal(v) - #endif -#endif - -#if !defined(simde_math_isnormalf) - #if HEDLEY_HAS_BUILTIN(__builtin_isnormalf) - #define simde_math_isnormalf(v) __builtin_isnormalf(v) - #elif SIMDE_MATH_BUILTIN_LIBM(isnormal) - #define simde_math_isnormalf(v) __builtin_isnormal(v) - #elif defined(isnormalf) - #define simde_math_isnormalf(v) isnormalf(v) - #elif defined(isnormal) || defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_isnormalf(v) isnormal(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_isnormalf(v) std::isnormal(v) - #elif defined(simde_math_isnormal) - #define simde_math_isnormalf(v) simde_math_isnormal(v) - #endif -#endif - -#if !defined(simde_math_issubnormalf) - #if SIMDE_MATH_BUILTIN_LIBM(fpclassify) - #define simde_math_issubnormalf(v) __builtin_fpclassify(0, 0, 0, 1, 0, v) - #elif defined(fpclassify) - #define simde_math_issubnormalf(v) (fpclassify(v) == FP_SUBNORMAL) - #elif defined(SIMDE_IEEE754_STORAGE) - #define simde_math_issubnormalf(v) (((simde_float32_as_uint32(v) & UINT32_C(0x7F800000)) == UINT32_C(0)) && ((simde_float32_as_uint32(v) & UINT32_C(0x007FFFFF)) != UINT32_C(0))) - #endif -#endif - -#if !defined(simde_math_issubnormal) - #if SIMDE_MATH_BUILTIN_LIBM(fpclassify) - #define simde_math_issubnormal(v) __builtin_fpclassify(0, 0, 0, 1, 0, v) - #elif defined(fpclassify) - #define simde_math_issubnormal(v) (fpclassify(v) == 
FP_SUBNORMAL) - #elif defined(SIMDE_IEEE754_STORAGE) - #define simde_math_issubnormal(v) (((simde_float64_as_uint64(v) & UINT64_C(0x7FF0000000000000)) == UINT64_C(0)) && ((simde_float64_as_uint64(v) & UINT64_C(0x00FFFFFFFFFFFFF)) != UINT64_C(0))) - #endif -#endif - -#if defined(FP_NAN) - #define SIMDE_MATH_FP_NAN FP_NAN -#else - #define SIMDE_MATH_FP_NAN 0 -#endif -#if defined(FP_INFINITE) - #define SIMDE_MATH_FP_INFINITE FP_INFINITE -#else - #define SIMDE_MATH_FP_INFINITE 1 -#endif -#if defined(FP_ZERO) - #define SIMDE_MATH_FP_ZERO FP_ZERO -#else - #define SIMDE_MATH_FP_ZERO 2 -#endif -#if defined(FP_SUBNORMAL) - #define SIMDE_MATH_FP_SUBNORMAL FP_SUBNORMAL -#else - #define SIMDE_MATH_FP_SUBNORMAL 3 -#endif -#if defined(FP_NORMAL) - #define SIMDE_MATH_FP_NORMAL FP_NORMAL -#else - #define SIMDE_MATH_FP_NORMAL 4 -#endif - -static HEDLEY_INLINE -int -simde_math_fpclassifyf(float v) { - #if SIMDE_MATH_BUILTIN_LIBM(fpclassify) - return __builtin_fpclassify(SIMDE_MATH_FP_NAN, SIMDE_MATH_FP_INFINITE, SIMDE_MATH_FP_NORMAL, SIMDE_MATH_FP_SUBNORMAL, SIMDE_MATH_FP_ZERO, v); - #elif defined(fpclassify) - return fpclassify(v); - #else - return - simde_math_isnormalf(v) ? SIMDE_MATH_FP_NORMAL : - (v == 0.0f) ? SIMDE_MATH_FP_ZERO : - simde_math_isnanf(v) ? SIMDE_MATH_FP_NAN : - simde_math_isinff(v) ? SIMDE_MATH_FP_INFINITE : - SIMDE_MATH_FP_SUBNORMAL; - #endif -} - -static HEDLEY_INLINE -int -simde_math_fpclassify(double v) { - #if SIMDE_MATH_BUILTIN_LIBM(fpclassify) - return __builtin_fpclassify(SIMDE_MATH_FP_NAN, SIMDE_MATH_FP_INFINITE, SIMDE_MATH_FP_NORMAL, SIMDE_MATH_FP_SUBNORMAL, SIMDE_MATH_FP_ZERO, v); - #elif defined(fpclassify) - return fpclassify(v); - #else - return - simde_math_isnormal(v) ? SIMDE_MATH_FP_NORMAL : - (v == 0.0) ? SIMDE_MATH_FP_ZERO : - simde_math_isnan(v) ? SIMDE_MATH_FP_NAN : - simde_math_isinf(v) ? 
SIMDE_MATH_FP_INFINITE : - SIMDE_MATH_FP_SUBNORMAL; - #endif -} - -#define SIMDE_MATH_FP_QNAN 0x01 -#define SIMDE_MATH_FP_PZERO 0x02 -#define SIMDE_MATH_FP_NZERO 0x04 -#define SIMDE_MATH_FP_PINF 0x08 -#define SIMDE_MATH_FP_NINF 0x10 -#define SIMDE_MATH_FP_DENORMAL 0x20 -#define SIMDE_MATH_FP_NEGATIVE 0x40 -#define SIMDE_MATH_FP_SNAN 0x80 - -static HEDLEY_INLINE -uint8_t -simde_math_fpclassf(float v, const int imm8) { - union { - float f; - uint32_t u; - } fu; - fu.f = v; - uint32_t bits = fu.u; - uint8_t NegNum = (bits >> 31) & 1; - uint32_t const ExpMask = 0x3F800000; // [30:23] - uint32_t const MantMask = 0x007FFFFF; // [22:0] - uint8_t ExpAllOnes = ((bits & ExpMask) == ExpMask); - uint8_t ExpAllZeros = ((bits & ExpMask) == 0); - uint8_t MantAllZeros = ((bits & MantMask) == 0); - uint8_t ZeroNumber = ExpAllZeros & MantAllZeros; - uint8_t SignalingBit = (bits >> 22) & 1; - - uint8_t result = 0; - uint8_t qNaN_res = ExpAllOnes & (!MantAllZeros) & SignalingBit; - uint8_t Pzero_res = (!NegNum) & ExpAllZeros & MantAllZeros; - uint8_t Nzero_res = NegNum & ExpAllZeros & MantAllZeros; - uint8_t Pinf_res = (!NegNum) & ExpAllOnes & MantAllZeros; - uint8_t Ninf_res = NegNum & ExpAllOnes & MantAllZeros; - uint8_t Denorm_res = ExpAllZeros & (!MantAllZeros); - uint8_t FinNeg_res = NegNum & (!ExpAllOnes) & (!ZeroNumber); - uint8_t sNaN_res = ExpAllOnes & (!MantAllZeros) & (!SignalingBit); - result = (((imm8 >> 0) & qNaN_res) | \ - ((imm8 >> 1) & Pzero_res) | \ - ((imm8 >> 2) & Nzero_res) | \ - ((imm8 >> 3) & Pinf_res) | \ - ((imm8 >> 4) & Ninf_res) | \ - ((imm8 >> 5) & Denorm_res) | \ - ((imm8 >> 6) & FinNeg_res) | \ - ((imm8 >> 7) & sNaN_res)); - return result; -} - -static HEDLEY_INLINE -uint8_t -simde_math_fpclass(double v, const int imm8) { - union { - double d; - uint64_t u; - } du; - du.d = v; - uint64_t bits = du.u; - uint8_t NegNum = (bits >> 63) & 1; - uint64_t const ExpMask = 0x3FF0000000000000; // [62:52] - uint64_t const MantMask = 0x000FFFFFFFFFFFFF; // [51:0] - uint8_t ExpAllOnes = ((bits & ExpMask) == ExpMask); - uint8_t ExpAllZeros = ((bits & ExpMask) == 0); - uint8_t MantAllZeros = ((bits & MantMask) == 0); - uint8_t ZeroNumber = ExpAllZeros & MantAllZeros; - uint8_t SignalingBit = (bits >> 51) & 1; - - uint8_t result = 0; - uint8_t qNaN_res = ExpAllOnes & (!MantAllZeros) & SignalingBit; - uint8_t Pzero_res = (!NegNum) & ExpAllZeros & MantAllZeros; - uint8_t Nzero_res = NegNum & ExpAllZeros & MantAllZeros; - uint8_t Pinf_res = (!NegNum) & ExpAllOnes & MantAllZeros; - uint8_t Ninf_res = NegNum & ExpAllOnes & MantAllZeros; - uint8_t Denorm_res = ExpAllZeros & (!MantAllZeros); - uint8_t FinNeg_res = NegNum & (!ExpAllOnes) & (!ZeroNumber); - uint8_t sNaN_res = ExpAllOnes & (!MantAllZeros) & (!SignalingBit); - result = (((imm8 >> 0) & qNaN_res) | \ - ((imm8 >> 1) & Pzero_res) | \ - ((imm8 >> 2) & Nzero_res) | \ - ((imm8 >> 3) & Pinf_res) | \ - ((imm8 >> 4) & Ninf_res) | \ - ((imm8 >> 5) & Denorm_res) | \ - ((imm8 >> 6) & FinNeg_res) | \ - ((imm8 >> 7) & sNaN_res)); - return result; -} - -/*** Manipulation functions ***/ - -#if !defined(simde_math_nextafter) - #if \ - (HEDLEY_HAS_BUILTIN(__builtin_nextafter) && !defined(HEDLEY_IBM_VERSION)) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) - #define simde_math_nextafter(x, y) __builtin_nextafter(x, y) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_nextafter(x, y) std::nextafter(x, y) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_nextafter(x, y) 
nextafter(x, y) - #endif -#endif - -#if !defined(simde_math_nextafterf) - #if \ - (HEDLEY_HAS_BUILTIN(__builtin_nextafterf) && !defined(HEDLEY_IBM_VERSION)) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) || \ - HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) - #define simde_math_nextafterf(x, y) __builtin_nextafterf(x, y) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_nextafterf(x, y) std::nextafter(x, y) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_nextafterf(x, y) nextafterf(x, y) - #endif -#endif - -/*** Functions from C99 ***/ - -#if !defined(simde_math_abs) - #if SIMDE_MATH_BUILTIN_LIBM(abs) - #define simde_math_abs(v) __builtin_abs(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_abs(v) std::abs(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_abs(v) abs(v) - #endif -#endif - -#if !defined(simde_math_labs) - #if SIMDE_MATH_BUILTIN_LIBM(labs) - #define simde_math_labs(v) __builtin_labs(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_labs(v) std::labs(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_labs(v) labs(v) - #endif -#endif - -#if !defined(simde_math_llabs) - #if SIMDE_MATH_BUILTIN_LIBM(llabs) - #define simde_math_llabs(v) __builtin_llabs(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_llabs(v) std::llabs(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_llabs(v) llabs(v) - #endif -#endif - -#if !defined(simde_math_fabsf) - #if SIMDE_MATH_BUILTIN_LIBM(fabsf) - #define simde_math_fabsf(v) __builtin_fabsf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_fabsf(v) std::abs(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_fabsf(v) fabsf(v) - #endif -#endif - -#if !defined(simde_math_acos) - #if SIMDE_MATH_BUILTIN_LIBM(acos) - #define simde_math_acos(v) __builtin_acos(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_acos(v) std::acos(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_acos(v) acos(v) - #endif -#endif - -#if !defined(simde_math_acosf) - #if SIMDE_MATH_BUILTIN_LIBM(acosf) - #define simde_math_acosf(v) __builtin_acosf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_acosf(v) std::acos(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_acosf(v) acosf(v) - #endif -#endif - -#if !defined(simde_math_acosh) - #if SIMDE_MATH_BUILTIN_LIBM(acosh) - #define simde_math_acosh(v) __builtin_acosh(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_acosh(v) std::acosh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_acosh(v) acosh(v) - #endif -#endif - -#if !defined(simde_math_acoshf) - #if SIMDE_MATH_BUILTIN_LIBM(acoshf) - #define simde_math_acoshf(v) __builtin_acoshf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_acoshf(v) std::acosh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_acoshf(v) acoshf(v) - #endif -#endif - -#if !defined(simde_math_asin) - #if SIMDE_MATH_BUILTIN_LIBM(asin) - #define simde_math_asin(v) __builtin_asin(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_asin(v) std::asin(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_asin(v) asin(v) - #endif -#endif - -#if !defined(simde_math_asinf) - #if SIMDE_MATH_BUILTIN_LIBM(asinf) - #define simde_math_asinf(v) __builtin_asinf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_asinf(v) std::asin(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_asinf(v) asinf(v) - #endif -#endif - -#if 
!defined(simde_math_asinh) - #if SIMDE_MATH_BUILTIN_LIBM(asinh) - #define simde_math_asinh(v) __builtin_asinh(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_asinh(v) std::asinh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_asinh(v) asinh(v) - #endif -#endif - -#if !defined(simde_math_asinhf) - #if SIMDE_MATH_BUILTIN_LIBM(asinhf) - #define simde_math_asinhf(v) __builtin_asinhf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_asinhf(v) std::asinh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_asinhf(v) asinhf(v) - #endif -#endif - -#if !defined(simde_math_atan) - #if SIMDE_MATH_BUILTIN_LIBM(atan) - #define simde_math_atan(v) __builtin_atan(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_atan(v) std::atan(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_atan(v) atan(v) - #endif -#endif - -#if !defined(simde_math_atan2) - #if SIMDE_MATH_BUILTIN_LIBM(atan2) - #define simde_math_atan2(y, x) __builtin_atan2(y, x) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_atan2(y, x) std::atan2(y, x) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_atan2(y, x) atan2(y, x) - #endif -#endif - -#if !defined(simde_math_atan2f) - #if SIMDE_MATH_BUILTIN_LIBM(atan2f) - #define simde_math_atan2f(y, x) __builtin_atan2f(y, x) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_atan2f(y, x) std::atan2(y, x) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_atan2f(y, x) atan2f(y, x) - #endif -#endif - -#if !defined(simde_math_atanf) - #if SIMDE_MATH_BUILTIN_LIBM(atanf) - #define simde_math_atanf(v) __builtin_atanf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_atanf(v) std::atan(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_atanf(v) atanf(v) - #endif -#endif - -#if !defined(simde_math_atanh) - #if SIMDE_MATH_BUILTIN_LIBM(atanh) - #define simde_math_atanh(v) __builtin_atanh(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_atanh(v) std::atanh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_atanh(v) atanh(v) - #endif -#endif - -#if !defined(simde_math_atanhf) - #if SIMDE_MATH_BUILTIN_LIBM(atanhf) - #define simde_math_atanhf(v) __builtin_atanhf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_atanhf(v) std::atanh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_atanhf(v) atanhf(v) - #endif -#endif - -#if !defined(simde_math_cbrt) - #if SIMDE_MATH_BUILTIN_LIBM(cbrt) - #define simde_math_cbrt(v) __builtin_cbrt(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_cbrt(v) std::cbrt(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_cbrt(v) cbrt(v) - #endif -#endif - -#if !defined(simde_math_cbrtf) - #if SIMDE_MATH_BUILTIN_LIBM(cbrtf) - #define simde_math_cbrtf(v) __builtin_cbrtf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_cbrtf(v) std::cbrt(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_cbrtf(v) cbrtf(v) - #endif -#endif - -#if !defined(simde_math_ceil) - #if SIMDE_MATH_BUILTIN_LIBM(ceil) - #define simde_math_ceil(v) __builtin_ceil(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_ceil(v) std::ceil(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_ceil(v) ceil(v) - #endif -#endif - -#if !defined(simde_math_ceilf) - #if SIMDE_MATH_BUILTIN_LIBM(ceilf) - #define simde_math_ceilf(v) __builtin_ceilf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_ceilf(v) std::ceil(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) 
- #define simde_math_ceilf(v) ceilf(v) - #endif -#endif - -#if !defined(simde_math_copysign) - #if SIMDE_MATH_BUILTIN_LIBM(copysign) - #define simde_math_copysign(x, y) __builtin_copysign(x, y) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_copysign(x, y) std::copysign(x, y) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_copysign(x, y) copysign(x, y) - #endif -#endif - -#if !defined(simde_math_copysignf) - #if SIMDE_MATH_BUILTIN_LIBM(copysignf) - #define simde_math_copysignf(x, y) __builtin_copysignf(x, y) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_copysignf(x, y) std::copysignf(x, y) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_copysignf(x, y) copysignf(x, y) - #endif -#endif - -#if !defined(simde_math_signbit) - #if SIMDE_MATH_BUILTIN_LIBM(signbit) - #if (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(7,0,0)) - #define simde_math_signbit(x) __builtin_signbit(x) - #else - #define simde_math_signbit(x) __builtin_signbit(HEDLEY_STATIC_CAST(double, (x))) - #endif - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_signbit(x) std::signbit(x) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_signbit(x) signbit(x) - #endif -#endif - -#if !defined(simde_math_cos) - #if SIMDE_MATH_BUILTIN_LIBM(cos) - #define simde_math_cos(v) __builtin_cos(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_cos(v) std::cos(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_cos(v) cos(v) - #endif -#endif - -#if !defined(simde_math_cosf) - #if defined(SIMDE_MATH_SLEEF_ENABLE) - #if SIMDE_ACCURACY_PREFERENCE < 1 - #define simde_math_cosf(v) Sleef_cosf_u35(v) - #else - #define simde_math_cosf(v) Sleef_cosf_u10(v) - #endif - #elif SIMDE_MATH_BUILTIN_LIBM(cosf) - #define simde_math_cosf(v) __builtin_cosf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_cosf(v) std::cos(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_cosf(v) cosf(v) - #endif -#endif - -#if !defined(simde_math_cosh) - #if SIMDE_MATH_BUILTIN_LIBM(cosh) - #define simde_math_cosh(v) __builtin_cosh(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_cosh(v) std::cosh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_cosh(v) cosh(v) - #endif -#endif - -#if !defined(simde_math_coshf) - #if SIMDE_MATH_BUILTIN_LIBM(coshf) - #define simde_math_coshf(v) __builtin_coshf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_coshf(v) std::cosh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_coshf(v) coshf(v) - #endif -#endif - -#if !defined(simde_math_erf) - #if SIMDE_MATH_BUILTIN_LIBM(erf) - #define simde_math_erf(v) __builtin_erf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_erf(v) std::erf(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_erf(v) erf(v) - #endif -#endif - -#if !defined(simde_math_erff) - #if SIMDE_MATH_BUILTIN_LIBM(erff) - #define simde_math_erff(v) __builtin_erff(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_erff(v) std::erf(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_erff(v) erff(v) - #endif -#endif - -#if !defined(simde_math_erfc) - #if SIMDE_MATH_BUILTIN_LIBM(erfc) - #define simde_math_erfc(v) __builtin_erfc(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_erfc(v) std::erfc(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_erfc(v) erfc(v) - #endif -#endif - -#if !defined(simde_math_erfcf) - #if SIMDE_MATH_BUILTIN_LIBM(erfcf) - #define simde_math_erfcf(v) 
__builtin_erfcf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_erfcf(v) std::erfc(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_erfcf(v) erfcf(v) - #endif -#endif - -#if !defined(simde_math_exp) - #if SIMDE_MATH_BUILTIN_LIBM(exp) - #define simde_math_exp(v) __builtin_exp(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_exp(v) std::exp(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_exp(v) exp(v) - #endif -#endif - -#if !defined(simde_math_expf) - #if SIMDE_MATH_BUILTIN_LIBM(expf) - #define simde_math_expf(v) __builtin_expf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_expf(v) std::exp(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_expf(v) expf(v) - #endif -#endif - -#if !defined(simde_math_expm1) - #if SIMDE_MATH_BUILTIN_LIBM(expm1) - #define simde_math_expm1(v) __builtin_expm1(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_expm1(v) std::expm1(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_expm1(v) expm1(v) - #endif -#endif - -#if !defined(simde_math_expm1f) - #if SIMDE_MATH_BUILTIN_LIBM(expm1f) - #define simde_math_expm1f(v) __builtin_expm1f(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_expm1f(v) std::expm1(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_expm1f(v) expm1f(v) - #endif -#endif - -#if !defined(simde_math_exp2) - #if SIMDE_MATH_BUILTIN_LIBM(exp2) - #define simde_math_exp2(v) __builtin_exp2(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_exp2(v) std::exp2(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_exp2(v) exp2(v) - #endif -#endif - -#if !defined(simde_math_exp2f) - #if SIMDE_MATH_BUILTIN_LIBM(exp2f) - #define simde_math_exp2f(v) __builtin_exp2f(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_exp2f(v) std::exp2(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_exp2f(v) exp2f(v) - #endif -#endif - -#if HEDLEY_HAS_BUILTIN(__builtin_exp10) || HEDLEY_GCC_VERSION_CHECK(3,4,0) - # define simde_math_exp10(v) __builtin_exp10(v) -#else -# define simde_math_exp10(v) pow(10.0, (v)) -#endif - -#if HEDLEY_HAS_BUILTIN(__builtin_exp10f) || HEDLEY_GCC_VERSION_CHECK(3,4,0) - # define simde_math_exp10f(v) __builtin_exp10f(v) -#else -# define simde_math_exp10f(v) powf(10.0f, (v)) -#endif - -#if !defined(simde_math_fabs) - #if SIMDE_MATH_BUILTIN_LIBM(fabs) - #define simde_math_fabs(v) __builtin_fabs(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_fabs(v) std::fabs(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_fabs(v) fabs(v) - #endif -#endif - -#if !defined(simde_math_fabsf) - #if SIMDE_MATH_BUILTIN_LIBM(fabsf) - #define simde_math_fabsf(v) __builtin_fabsf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_fabsf(v) std::fabs(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_fabsf(v) fabsf(v) - #endif -#endif - -#if !defined(simde_math_floor) - #if SIMDE_MATH_BUILTIN_LIBM(floor) - #define simde_math_floor(v) __builtin_floor(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_floor(v) std::floor(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_floor(v) floor(v) - #endif -#endif - -#if !defined(simde_math_floorf) - #if SIMDE_MATH_BUILTIN_LIBM(floorf) - #define simde_math_floorf(v) __builtin_floorf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_floorf(v) std::floor(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_floorf(v) floorf(v) - #endif -#endif - -#if 
!defined(simde_math_fma) - #if SIMDE_MATH_BUILTIN_LIBM(fma) - #define simde_math_fma(x, y, z) __builtin_fma(x, y, z) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_fma(x, y, z) std::fma(x, y, z) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_fma(x, y, z) fma(x, y, z) - #endif -#endif - -#if !defined(simde_math_fmaf) - #if SIMDE_MATH_BUILTIN_LIBM(fmaf) - #define simde_math_fmaf(x, y, z) __builtin_fmaf(x, y, z) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_fmaf(x, y, z) std::fma(x, y, z) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_fmaf(x, y, z) fmaf(x, y, z) - #endif -#endif - -#if !defined(simde_math_fmax) - #if SIMDE_MATH_BUILTIN_LIBM(fmax) - #define simde_math_fmax(x, y) __builtin_fmax(x, y) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_fmax(x, y) std::fmax(x, y) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_fmax(x, y) fmax(x, y) - #endif -#endif - -#if !defined(simde_math_fmaxf) - #if SIMDE_MATH_BUILTIN_LIBM(fmaxf) - #define simde_math_fmaxf(x, y) __builtin_fmaxf(x, y) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_fmaxf(x, y) std::fmax(x, y) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_fmaxf(x, y) fmaxf(x, y) - #endif -#endif - -#if !defined(simde_math_hypot) - #if SIMDE_MATH_BUILTIN_LIBM(hypot) - #define simde_math_hypot(y, x) __builtin_hypot(y, x) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_hypot(y, x) std::hypot(y, x) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_hypot(y, x) hypot(y, x) - #endif -#endif - -#if !defined(simde_math_hypotf) - #if SIMDE_MATH_BUILTIN_LIBM(hypotf) - #define simde_math_hypotf(y, x) __builtin_hypotf(y, x) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_hypotf(y, x) std::hypot(y, x) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_hypotf(y, x) hypotf(y, x) - #endif -#endif - -#if !defined(simde_math_log) - #if SIMDE_MATH_BUILTIN_LIBM(log) - #define simde_math_log(v) __builtin_log(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_log(v) std::log(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_log(v) log(v) - #endif -#endif - -#if !defined(simde_math_logf) - #if SIMDE_MATH_BUILTIN_LIBM(logf) - #define simde_math_logf(v) __builtin_logf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_logf(v) std::log(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_logf(v) logf(v) - #endif -#endif - -#if !defined(simde_math_logb) - #if SIMDE_MATH_BUILTIN_LIBM(logb) - #define simde_math_logb(v) __builtin_logb(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_logb(v) std::logb(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_logb(v) logb(v) - #endif -#endif - -#if !defined(simde_math_logbf) - #if SIMDE_MATH_BUILTIN_LIBM(logbf) - #define simde_math_logbf(v) __builtin_logbf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_logbf(v) std::logb(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_logbf(v) logbf(v) - #endif -#endif - -#if !defined(simde_math_log1p) - #if SIMDE_MATH_BUILTIN_LIBM(log1p) - #define simde_math_log1p(v) __builtin_log1p(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_log1p(v) std::log1p(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_log1p(v) log1p(v) - #endif -#endif - -#if !defined(simde_math_log1pf) - #if SIMDE_MATH_BUILTIN_LIBM(log1pf) - #define simde_math_log1pf(v) __builtin_log1pf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define 
simde_math_log1pf(v) std::log1p(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_log1pf(v) log1pf(v) - #endif -#endif - -#if !defined(simde_math_log2) - #if SIMDE_MATH_BUILTIN_LIBM(log2) - #define simde_math_log2(v) __builtin_log2(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_log2(v) std::log2(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_log2(v) log2(v) - #endif -#endif - -#if !defined(simde_math_log2f) - #if SIMDE_MATH_BUILTIN_LIBM(log2f) - #define simde_math_log2f(v) __builtin_log2f(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_log2f(v) std::log2(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_log2f(v) log2f(v) - #endif -#endif - -#if !defined(simde_math_log10) - #if SIMDE_MATH_BUILTIN_LIBM(log10) - #define simde_math_log10(v) __builtin_log10(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_log10(v) std::log10(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_log10(v) log10(v) - #endif -#endif - -#if !defined(simde_math_log10f) - #if SIMDE_MATH_BUILTIN_LIBM(log10f) - #define simde_math_log10f(v) __builtin_log10f(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_log10f(v) std::log10(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_log10f(v) log10f(v) - #endif -#endif - -#if !defined(simde_math_modf) - #if SIMDE_MATH_BUILTIN_LIBM(modf) - #define simde_math_modf(x, iptr) __builtin_modf(x, iptr) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_modf(x, iptr) std::modf(x, iptr) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_modf(x, iptr) modf(x, iptr) - #endif -#endif - -#if !defined(simde_math_modff) - #if SIMDE_MATH_BUILTIN_LIBM(modff) - #define simde_math_modff(x, iptr) __builtin_modff(x, iptr) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_modff(x, iptr) std::modf(x, iptr) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_modff(x, iptr) modff(x, iptr) - #endif -#endif - -#if !defined(simde_math_nearbyint) - #if SIMDE_MATH_BUILTIN_LIBM(nearbyint) - #define simde_math_nearbyint(v) __builtin_nearbyint(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_nearbyint(v) std::nearbyint(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_nearbyint(v) nearbyint(v) - #endif -#endif - -#if !defined(simde_math_nearbyintf) - #if SIMDE_MATH_BUILTIN_LIBM(nearbyintf) - #define simde_math_nearbyintf(v) __builtin_nearbyintf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_nearbyintf(v) std::nearbyint(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_nearbyintf(v) nearbyintf(v) - #endif -#endif - -#if !defined(simde_math_pow) - #if SIMDE_MATH_BUILTIN_LIBM(pow) - #define simde_math_pow(y, x) __builtin_pow(y, x) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_pow(y, x) std::pow(y, x) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_pow(y, x) pow(y, x) - #endif -#endif - -#if !defined(simde_math_powf) - #if SIMDE_MATH_BUILTIN_LIBM(powf) - #define simde_math_powf(y, x) __builtin_powf(y, x) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_powf(y, x) std::pow(y, x) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_powf(y, x) powf(y, x) - #endif -#endif - -#if !defined(simde_math_rint) - #if SIMDE_MATH_BUILTIN_LIBM(rint) - #define simde_math_rint(v) __builtin_rint(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_rint(v) std::rint(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_rint(v) rint(v) - #endif 
-#endif - -#if !defined(simde_math_rintf) - #if SIMDE_MATH_BUILTIN_LIBM(rintf) - #define simde_math_rintf(v) __builtin_rintf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_rintf(v) std::rint(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_rintf(v) rintf(v) - #endif -#endif - -#if !defined(simde_math_round) - #if SIMDE_MATH_BUILTIN_LIBM(round) - #define simde_math_round(v) __builtin_round(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_round(v) std::round(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_round(v) round(v) - #endif -#endif - -#if !defined(simde_math_roundf) - #if SIMDE_MATH_BUILTIN_LIBM(roundf) - #define simde_math_roundf(v) __builtin_roundf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_roundf(v) std::round(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_roundf(v) roundf(v) - #endif -#endif - -#if !defined(simde_math_roundeven) - #if \ - (!defined(HEDLEY_EMSCRIPTEN_VERSION) && HEDLEY_HAS_BUILTIN(__builtin_roundeven)) || \ - HEDLEY_GCC_VERSION_CHECK(10,0,0) - #define simde_math_roundeven(v) __builtin_roundeven(v) - #elif defined(simde_math_round) && defined(simde_math_fabs) - static HEDLEY_INLINE - double - simde_math_roundeven(double v) { - double rounded = simde_math_round(v); - double diff = rounded - v; - if (HEDLEY_UNLIKELY(simde_math_fabs(diff) == 0.5) && (HEDLEY_STATIC_CAST(int64_t, rounded) & 1)) { - rounded = v - diff; - } - return rounded; - } - #define simde_math_roundeven simde_math_roundeven - #endif -#endif - -#if !defined(simde_math_roundevenf) - #if \ - (!defined(HEDLEY_EMSCRIPTEN_VERSION) && HEDLEY_HAS_BUILTIN(__builtin_roundevenf)) || \ - HEDLEY_GCC_VERSION_CHECK(10,0,0) - #define simde_math_roundevenf(v) __builtin_roundevenf(v) - #elif defined(simde_math_roundf) && defined(simde_math_fabsf) - static HEDLEY_INLINE - float - simde_math_roundevenf(float v) { - float rounded = simde_math_roundf(v); - float diff = rounded - v; - if (HEDLEY_UNLIKELY(simde_math_fabsf(diff) == 0.5f) && (HEDLEY_STATIC_CAST(int32_t, rounded) & 1)) { - rounded = v - diff; - } - return rounded; - } - #define simde_math_roundevenf simde_math_roundevenf - #endif -#endif - -#if !defined(simde_math_sin) - #if SIMDE_MATH_BUILTIN_LIBM(sin) - #define simde_math_sin(v) __builtin_sin(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_sin(v) std::sin(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_sin(v) sin(v) - #endif -#endif - -#if !defined(simde_math_sinf) - #if SIMDE_MATH_BUILTIN_LIBM(sinf) - #define simde_math_sinf(v) __builtin_sinf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_sinf(v) std::sin(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_sinf(v) sinf(v) - #endif -#endif - -#if !defined(simde_math_sinh) - #if SIMDE_MATH_BUILTIN_LIBM(sinh) - #define simde_math_sinh(v) __builtin_sinh(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_sinh(v) std::sinh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_sinh(v) sinh(v) - #endif -#endif - -#if !defined(simde_math_sinhf) - #if SIMDE_MATH_BUILTIN_LIBM(sinhf) - #define simde_math_sinhf(v) __builtin_sinhf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_sinhf(v) std::sinh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_sinhf(v) sinhf(v) - #endif -#endif - -#if !defined(simde_math_sqrt) - #if SIMDE_MATH_BUILTIN_LIBM(sqrt) - #define simde_math_sqrt(v) __builtin_sqrt(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_sqrt(v) 
std::sqrt(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_sqrt(v) sqrt(v) - #endif -#endif - -#if !defined(simde_math_sqrtf) - #if SIMDE_MATH_BUILTIN_LIBM(sqrtf) - #define simde_math_sqrtf(v) __builtin_sqrtf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_sqrtf(v) std::sqrt(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_sqrtf(v) sqrtf(v) - #endif -#endif - -#if !defined(simde_math_sqrtl) - #if SIMDE_MATH_BUILTIN_LIBM(sqrtl) - #define simde_math_sqrtl(v) __builtin_sqrtl(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_sqrtl(v) std::sqrt(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_sqrtl(v) sqrtl(v) - #endif -#endif - -#if !defined(simde_math_tan) - #if SIMDE_MATH_BUILTIN_LIBM(tan) - #define simde_math_tan(v) __builtin_tan(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_tan(v) std::tan(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_tan(v) tan(v) - #endif -#endif - -#if !defined(simde_math_tanf) - #if SIMDE_MATH_BUILTIN_LIBM(tanf) - #define simde_math_tanf(v) __builtin_tanf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_tanf(v) std::tan(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_tanf(v) tanf(v) - #endif -#endif - -#if !defined(simde_math_tanh) - #if SIMDE_MATH_BUILTIN_LIBM(tanh) - #define simde_math_tanh(v) __builtin_tanh(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_tanh(v) std::tanh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_tanh(v) tanh(v) - #endif -#endif - -#if !defined(simde_math_tanhf) - #if SIMDE_MATH_BUILTIN_LIBM(tanhf) - #define simde_math_tanhf(v) __builtin_tanhf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_tanhf(v) std::tanh(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_tanhf(v) tanhf(v) - #endif -#endif - -#if !defined(simde_math_trunc) - #if SIMDE_MATH_BUILTIN_LIBM(trunc) - #define simde_math_trunc(v) __builtin_trunc(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_trunc(v) std::trunc(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_trunc(v) trunc(v) - #endif -#endif - -#if !defined(simde_math_truncf) - #if SIMDE_MATH_BUILTIN_LIBM(truncf) - #define simde_math_truncf(v) __builtin_truncf(v) - #elif defined(SIMDE_MATH_HAVE_CMATH) - #define simde_math_truncf(v) std::trunc(v) - #elif defined(SIMDE_MATH_HAVE_MATH_H) - #define simde_math_truncf(v) truncf(v) - #endif -#endif - -/*** Comparison macros (which don't raise invalid errors) ***/ - -#if defined(isunordered) - #define simde_math_isunordered(x, y) isunordered(x, y) -#elif HEDLEY_HAS_BUILTIN(__builtin_isunordered) - #define simde_math_isunordered(x, y) __builtin_isunordered(x, y) -#else - static HEDLEY_INLINE - int simde_math_isunordered(double x, double y) { - return (x != y) && (x != x || y != y); - } - #define simde_math_isunordered simde_math_isunordered - - static HEDLEY_INLINE - int simde_math_isunorderedf(float x, float y) { - return (x != y) && (x != x || y != y); - } - #define simde_math_isunorderedf simde_math_isunorderedf -#endif -#if !defined(simde_math_isunorderedf) - #define simde_math_isunorderedf simde_math_isunordered -#endif - -/*** Additional functions not in libm ***/ - -#if defined(simde_math_fabs) && defined(simde_math_sqrt) && defined(simde_math_exp) - static HEDLEY_INLINE - double - simde_math_cdfnorm(double x) { - /* https://www.johndcook.com/blog/cpp_phi/ - * Public Domain */ - static const double a1 = 0.254829592; - static const double a2 = -0.284496736; 
- static const double a3 = 1.421413741; - static const double a4 = -1.453152027; - static const double a5 = 1.061405429; - static const double p = 0.3275911; - - const int sign = x < 0; - x = simde_math_fabs(x) / simde_math_sqrt(2.0); - - /* A&S formula 7.1.26 */ - double t = 1.0 / (1.0 + p * x); - double y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * simde_math_exp(-x * x); - - return 0.5 * (1.0 + (sign ? -y : y)); - } - #define simde_math_cdfnorm simde_math_cdfnorm -#endif - -#if defined(simde_math_fabsf) && defined(simde_math_sqrtf) && defined(simde_math_expf) - static HEDLEY_INLINE - float - simde_math_cdfnormf(float x) { - /* https://www.johndcook.com/blog/cpp_phi/ - * Public Domain */ - static const float a1 = 0.254829592f; - static const float a2 = -0.284496736f; - static const float a3 = 1.421413741f; - static const float a4 = -1.453152027f; - static const float a5 = 1.061405429f; - static const float p = 0.3275911f; - - const int sign = x < 0; - x = simde_math_fabsf(x) / simde_math_sqrtf(2.0f); - - /* A&S formula 7.1.26 */ - float t = 1.0f / (1.0f + p * x); - float y = 1.0f - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * simde_math_expf(-x * x); - - return 0.5f * (1.0f + (sign ? -y : y)); - } - #define simde_math_cdfnormf simde_math_cdfnormf -#endif - -#if !defined(simde_math_cdfnorminv) && defined(simde_math_log) && defined(simde_math_sqrt) - /*https://web.archive.org/web/20150910081113/http://home.online.no/~pjacklam/notes/invnorm/impl/sprouse/ltqnorm.c*/ - static HEDLEY_INLINE - double - simde_math_cdfnorminv(double p) { - static const double a[6] = { - -3.969683028665376e+01, - 2.209460984245205e+02, - -2.759285104469687e+02, - 1.383577518672690e+02, - -3.066479806614716e+01, - 2.506628277459239e+00 - }; - - static const double b[5] = { - -5.447609879822406e+01, - 1.615858368580409e+02, - -1.556989798598866e+02, - 6.680131188771972e+01, - -1.328068155288572e+01 - }; - - static const double c[6] = { - -7.784894002430293e-03, - -3.223964580411365e-01, - -2.400758277161838e+00, - -2.549732539343734e+00, - 4.374664141464968e+00, - 2.938163982698783e+00 - }; - - static const double d[4] = { - 7.784695709041462e-03, - 3.224671290700398e-01, - 2.445134137142996e+00, - 3.754408661907416e+00 - }; - - static const double low = 0.02425; - static const double high = 0.97575; - double q, r; - - if (p < 0 || p > 1) { - return 0.0; - } else if (p == 0) { - return -SIMDE_MATH_INFINITY; - } else if (p == 1) { - return SIMDE_MATH_INFINITY; - } else if (p < low) { - q = simde_math_sqrt(-2.0 * simde_math_log(p)); - return - (((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / - (((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1)); - } else if (p > high) { - q = simde_math_sqrt(-2.0 * simde_math_log(1.0 - p)); - return - -(((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / - (((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1)); - } else { - q = p - 0.5; - r = q * q; - return (((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * - q / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1); - } -} -#define simde_math_cdfnorminv simde_math_cdfnorminv -#endif - -#if !defined(simde_math_cdfnorminvf) && defined(simde_math_logf) && defined(simde_math_sqrtf) - static HEDLEY_INLINE - float - simde_math_cdfnorminvf(float p) { - static const float a[6] = { - -3.969683028665376e+01f, - 2.209460984245205e+02f, - -2.759285104469687e+02f, - 1.383577518672690e+02f, - -3.066479806614716e+01f, - 
2.506628277459239e+00f - }; - static const float b[5] = { - -5.447609879822406e+01f, - 1.615858368580409e+02f, - -1.556989798598866e+02f, - 6.680131188771972e+01f, - -1.328068155288572e+01f - }; - static const float c[6] = { - -7.784894002430293e-03f, - -3.223964580411365e-01f, - -2.400758277161838e+00f, - -2.549732539343734e+00f, - 4.374664141464968e+00f, - 2.938163982698783e+00f - }; - static const float d[4] = { - 7.784695709041462e-03f, - 3.224671290700398e-01f, - 2.445134137142996e+00f, - 3.754408661907416e+00f - }; - static const float low = 0.02425f; - static const float high = 0.97575f; - float q, r; - - if (p < 0 || p > 1) { - return 0.0f; - } else if (p == 0) { - return -SIMDE_MATH_INFINITYF; - } else if (p == 1) { - return SIMDE_MATH_INFINITYF; - } else if (p < low) { - q = simde_math_sqrtf(-2.0f * simde_math_logf(p)); - return - (((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / - (((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1)); - } else if (p > high) { - q = simde_math_sqrtf(-2.0f * simde_math_logf(1.0f - p)); - return - -(((((c[0] * q + c[1]) * q + c[2]) * q + c[3]) * q + c[4]) * q + c[5]) / - (((((d[0] * q + d[1]) * q + d[2]) * q + d[3]) * q + 1)); - } else { - q = p - 0.5f; - r = q * q; - return (((((a[0] * r + a[1]) * r + a[2]) * r + a[3]) * r + a[4]) * r + a[5]) * - q / (((((b[0] * r + b[1]) * r + b[2]) * r + b[3]) * r + b[4]) * r + 1); - } - } - #define simde_math_cdfnorminvf simde_math_cdfnorminvf -#endif - -#if !defined(simde_math_erfinv) && defined(simde_math_log) && defined(simde_math_copysign) && defined(simde_math_sqrt) - static HEDLEY_INLINE - double - simde_math_erfinv(double x) { - /* https://stackoverflow.com/questions/27229371/inverse-error-function-in-c - * - * The original answer on SO uses a constant of 0.147, but in my - * testing 0.14829094707965850830078125 gives a lower average absolute error - * (0.0001410958211636170744895935 vs. 0.0001465479290345683693885803). - * That said, if your goal is to minimize the *maximum* absolute - * error, 0.15449436008930206298828125 provides significantly better - * results; 0.0009250640869140625000000000 vs ~ 0.005. 
*/ - double tt1, tt2, lnx; - double sgn = simde_math_copysign(1.0, x); - - x = (1.0 - x) * (1.0 + x); - lnx = simde_math_log(x); - - tt1 = 2.0 / (SIMDE_MATH_PI * 0.14829094707965850830078125) + 0.5 * lnx; - tt2 = (1.0 / 0.14829094707965850830078125) * lnx; - - return sgn * simde_math_sqrt(-tt1 + simde_math_sqrt(tt1 * tt1 - tt2)); - } - #define simde_math_erfinv simde_math_erfinv -#endif - -#if !defined(simde_math_erfinvf) && defined(simde_math_logf) && defined(simde_math_copysignf) && defined(simde_math_sqrtf) - static HEDLEY_INLINE - float - simde_math_erfinvf(float x) { - float tt1, tt2, lnx; - float sgn = simde_math_copysignf(1.0f, x); - - x = (1.0f - x) * (1.0f + x); - lnx = simde_math_logf(x); - - tt1 = 2.0f / (SIMDE_MATH_PIF * 0.14829094707965850830078125f) + 0.5f * lnx; - tt2 = (1.0f / 0.14829094707965850830078125f) * lnx; - - return sgn * simde_math_sqrtf(-tt1 + simde_math_sqrtf(tt1 * tt1 - tt2)); - } - #define simde_math_erfinvf simde_math_erfinvf -#endif - -#if !defined(simde_math_erfcinv) && defined(simde_math_erfinv) && defined(simde_math_log) && defined(simde_math_sqrt) - static HEDLEY_INLINE - double - simde_math_erfcinv(double x) { - if(x >= 0.0625 && x < 2.0) { - return simde_math_erfinv(1.0 - x); - } else if (x < 0.0625 && x >= 1.0e-100) { - static const double p[6] = { - 0.1550470003116, - 1.382719649631, - 0.690969348887, - -1.128081391617, - 0.680544246825, - -0.16444156791 - }; - static const double q[3] = { - 0.155024849822, - 1.385228141995, - 1.000000000000 - }; - - const double t = 1.0 / simde_math_sqrt(-simde_math_log(x)); - return (p[0] / t + p[1] + t * (p[2] + t * (p[3] + t * (p[4] + t * p[5])))) / - (q[0] + t * (q[1] + t * (q[2]))); - } else if (x < 1.0e-100 && x >= SIMDE_MATH_DBL_MIN) { - static const double p[4] = { - 0.00980456202915, - 0.363667889171, - 0.97302949837, - -0.5374947401 - }; - static const double q[3] = { - 0.00980451277802, - 0.363699971544, - 1.000000000000 - }; - - const double t = 1.0 / simde_math_sqrt(-simde_math_log(x)); - return (p[0] / t + p[1] + t * (p[2] + t * p[3])) / - (q[0] + t * (q[1] + t * (q[2]))); - } else if (!simde_math_isnormal(x)) { - return SIMDE_MATH_INFINITY; - } else { - return -SIMDE_MATH_INFINITY; - } - } - - #define simde_math_erfcinv simde_math_erfcinv -#endif - -#if !defined(simde_math_erfcinvf) && defined(simde_math_erfinvf) && defined(simde_math_logf) && defined(simde_math_sqrtf) - static HEDLEY_INLINE - float - simde_math_erfcinvf(float x) { - if(x >= 0.0625f && x < 2.0f) { - return simde_math_erfinvf(1.0f - x); - } else if (x < 0.0625f && x >= SIMDE_MATH_FLT_MIN) { - static const float p[6] = { - 0.1550470003116f, - 1.382719649631f, - 0.690969348887f, - -1.128081391617f, - 0.680544246825f - -0.164441567910f - }; - static const float q[3] = { - 0.155024849822f, - 1.385228141995f, - 1.000000000000f - }; - - const float t = 1.0f / simde_math_sqrtf(-simde_math_logf(x)); - return (p[0] / t + p[1] + t * (p[2] + t * (p[3] + t * (p[4] + t * p[5])))) / - (q[0] + t * (q[1] + t * (q[2]))); - } else if (x < SIMDE_MATH_FLT_MIN && simde_math_isnormalf(x)) { - static const float p[4] = { - 0.00980456202915f, - 0.36366788917100f, - 0.97302949837000f, - -0.5374947401000f - }; - static const float q[3] = { - 0.00980451277802f, - 0.36369997154400f, - 1.00000000000000f - }; - - const float t = 1.0f / simde_math_sqrtf(-simde_math_logf(x)); - return (p[0] / t + p[1] + t * (p[2] + t * p[3])) / - (q[0] + t * (q[1] + t * (q[2]))); - } else { - return simde_math_isnormalf(x) ? 
-SIMDE_MATH_INFINITYF : SIMDE_MATH_INFINITYF; - } - } - - #define simde_math_erfcinvf simde_math_erfcinvf -#endif - -static HEDLEY_INLINE -double -simde_math_rad2deg(double radians) { - return radians * SIMDE_MATH_180_OVER_PI; -} - -static HEDLEY_INLINE -float -simde_math_rad2degf(float radians) { - return radians * SIMDE_MATH_180_OVER_PIF; -} - -static HEDLEY_INLINE -double -simde_math_deg2rad(double degrees) { - return degrees * SIMDE_MATH_PI_OVER_180; -} - -static HEDLEY_INLINE -float -simde_math_deg2radf(float degrees) { - return degrees * (SIMDE_MATH_PI_OVER_180F); -} - -/*** Saturated arithmetic ***/ - -static HEDLEY_INLINE -int8_t -simde_math_adds_i8(int8_t a, int8_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqaddb_s8(a, b); - #else - uint8_t a_ = HEDLEY_STATIC_CAST(uint8_t, a); - uint8_t b_ = HEDLEY_STATIC_CAST(uint8_t, b); - uint8_t r_ = a_ + b_; - - a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT8_MAX; - if (HEDLEY_STATIC_CAST(int8_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) { - r_ = a_; - } - - return HEDLEY_STATIC_CAST(int8_t, r_); - #endif -} - -static HEDLEY_INLINE -int16_t -simde_math_adds_i16(int16_t a, int16_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqaddh_s16(a, b); - #else - uint16_t a_ = HEDLEY_STATIC_CAST(uint16_t, a); - uint16_t b_ = HEDLEY_STATIC_CAST(uint16_t, b); - uint16_t r_ = a_ + b_; - - a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT16_MAX; - if (HEDLEY_STATIC_CAST(int16_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) { - r_ = a_; - } - - return HEDLEY_STATIC_CAST(int16_t, r_); - #endif -} - -static HEDLEY_INLINE -int32_t -simde_math_adds_i32(int32_t a, int32_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqadds_s32(a, b); - #else - uint32_t a_ = HEDLEY_STATIC_CAST(uint32_t, a); - uint32_t b_ = HEDLEY_STATIC_CAST(uint32_t, b); - uint32_t r_ = a_ + b_; - - a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT32_MAX; - if (HEDLEY_STATIC_CAST(int32_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) { - r_ = a_; - } - - return HEDLEY_STATIC_CAST(int32_t, r_); - #endif -} - -static HEDLEY_INLINE -int64_t -simde_math_adds_i64(int64_t a, int64_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqaddd_s64(a, b); - #else - uint64_t a_ = HEDLEY_STATIC_CAST(uint64_t, a); - uint64_t b_ = HEDLEY_STATIC_CAST(uint64_t, b); - uint64_t r_ = a_ + b_; - - a_ = (a_ >> ((8 * sizeof(r_)) - 1)) + INT64_MAX; - if (HEDLEY_STATIC_CAST(int64_t, ((a_ ^ b_) | ~(b_ ^ r_))) >= 0) { - r_ = a_; - } - - return HEDLEY_STATIC_CAST(int64_t, r_); - #endif -} - -static HEDLEY_INLINE -uint8_t -simde_math_adds_u8(uint8_t a, uint8_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqaddb_u8(a, b); - #else - uint8_t r = a + b; - r |= -(r < a); - return r; - #endif -} - -static HEDLEY_INLINE -uint16_t -simde_math_adds_u16(uint16_t a, uint16_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqaddh_u16(a, b); - #else - uint16_t r = a + b; - r |= -(r < a); - return r; - #endif -} - -static HEDLEY_INLINE -uint32_t -simde_math_adds_u32(uint32_t a, uint32_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqadds_u32(a, b); - #else - uint32_t r = a + b; - r |= -(r < a); - return r; - #endif -} - -static HEDLEY_INLINE -uint64_t -simde_math_adds_u64(uint64_t a, uint64_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqaddd_u64(a, b); - #else - uint64_t r = a + b; - r |= -(r < a); - return r; - #endif -} - -static HEDLEY_INLINE -int8_t -simde_math_subs_i8(int8_t a, int8_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqsubb_s8(a, b); - #else - uint8_t a_ = 
HEDLEY_STATIC_CAST(uint8_t, a); - uint8_t b_ = HEDLEY_STATIC_CAST(uint8_t, b); - uint8_t r_ = a_ - b_; - - a_ = (a_ >> 7) + INT8_MAX; - - if (HEDLEY_STATIC_CAST(int8_t, (a_ ^ b_) & (a_ ^ r_)) < 0) { - r_ = a_; - } - - return HEDLEY_STATIC_CAST(int8_t, r_); - #endif -} - -static HEDLEY_INLINE -int16_t -simde_math_subs_i16(int16_t a, int16_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqsubh_s16(a, b); - #else - uint16_t a_ = HEDLEY_STATIC_CAST(uint16_t, a); - uint16_t b_ = HEDLEY_STATIC_CAST(uint16_t, b); - uint16_t r_ = a_ - b_; - - a_ = (a_ >> 15) + INT16_MAX; - - if (HEDLEY_STATIC_CAST(int16_t, (a_ ^ b_) & (a_ ^ r_)) < 0) { - r_ = a_; - } - - return HEDLEY_STATIC_CAST(int16_t, r_); - #endif -} - -static HEDLEY_INLINE -int32_t -simde_math_subs_i32(int32_t a, int32_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqsubs_s32(a, b); - #else - uint32_t a_ = HEDLEY_STATIC_CAST(uint32_t, a); - uint32_t b_ = HEDLEY_STATIC_CAST(uint32_t, b); - uint32_t r_ = a_ - b_; - - a_ = (a_ >> 31) + INT32_MAX; - - if (HEDLEY_STATIC_CAST(int32_t, (a_ ^ b_) & (a_ ^ r_)) < 0) { - r_ = a_; - } - - return HEDLEY_STATIC_CAST(int32_t, r_); - #endif -} - -static HEDLEY_INLINE -int64_t -simde_math_subs_i64(int64_t a, int64_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqsubd_s64(a, b); - #else - uint64_t a_ = HEDLEY_STATIC_CAST(uint64_t, a); - uint64_t b_ = HEDLEY_STATIC_CAST(uint64_t, b); - uint64_t r_ = a_ - b_; - - a_ = (a_ >> 63) + INT64_MAX; - - if (HEDLEY_STATIC_CAST(int64_t, (a_ ^ b_) & (a_ ^ r_)) < 0) { - r_ = a_; - } - - return HEDLEY_STATIC_CAST(int64_t, r_); - #endif -} - -static HEDLEY_INLINE -uint8_t -simde_math_subs_u8(uint8_t a, uint8_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqsubb_u8(a, b); - #else - uint8_t res = a - b; - res &= -(res <= a); - return res; - #endif -} - -static HEDLEY_INLINE -uint16_t -simde_math_subs_u16(uint16_t a, uint16_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqsubh_u16(a, b); - #else - uint16_t res = a - b; - res &= -(res <= a); - return res; - #endif -} - -static HEDLEY_INLINE -uint32_t -simde_math_subs_u32(uint32_t a, uint32_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqsubs_u32(a, b); - #else - uint32_t res = a - b; - res &= -(res <= a); - return res; - #endif -} - -static HEDLEY_INLINE -uint64_t -simde_math_subs_u64(uint64_t a, uint64_t b) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vqsubd_u64(a, b); - #else - uint64_t res = a - b; - res &= -(res <= a); - return res; - #endif -} - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_MATH_H) */ diff --git a/ffi-deps/simde/simde/x86/aes.h b/ffi-deps/simde/simde/x86/aes.h deleted file mode 100644 index 1d5b049..0000000 --- a/ffi-deps/simde/simde/x86/aes.h +++ /dev/null @@ -1,417 +0,0 @@ -/* MIT License - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - */ - -#if !defined(SIMDE_X86_AES_H) -#define SIMDE_X86_AES_H - -/* - * Advanced Encryption Standard - * @author Dani Huertas - * @email huertas.dani@gmail.com - * - * Based on the document FIPS PUB 197 - */ - -#include "sse2.h" - -/* - * Multiplication in GF(2^8) - * http://en.wikipedia.org/wiki/Finite_field_arithmetic - * Irreducible polynomial m(x) = x8 + x4 + x3 + x + 1 - * - * NOTE: This function can be easily replaced with a look up table for a speed - * boost, at the expense of an increase in memory size. - -SIMDE_FUNCTION_ATTRIBUTES -uint8_t gmult(uint8_t a, uint8_t b) { - uint8_t p = 0, i = 0, hbs = 0; - - for (i = 0; i < 8; i++) { - if (b & 1) { - p ^= a; - } - - hbs = a & 0x80; - a <<= 1; - if (hbs) a ^= 0x1b; // 0000 0001 0001 1011 - b >>= 1; - } - - return (uint8_t)p; -} - */ - -#if !(defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO)) - -#include "../simde-aes.h" - -/* - * Transformation in the Cipher and Inverse Cipher in which a Round - * Key is added to the State using an XOR operation. The length of a - * Round Key equals the size of the State (i.e., for Nb = 4, the Round - * Key length equals 128 bits/16 bytes). - */ -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_add_round_key(uint8_t *state, simde__m128i_private w, uint8_t r) { - - int Nb = simde_x_aes_Nb; - uint8_t c; - - for (c = 0; c < Nb; c++) { - state[Nb*0+c] = state[Nb*0+c]^w.u8[4*Nb*r+4*c+0]; - state[Nb*1+c] = state[Nb*1+c]^w.u8[4*Nb*r+4*c+1]; - state[Nb*2+c] = state[Nb*2+c]^w.u8[4*Nb*r+4*c+2]; - state[Nb*3+c] = state[Nb*3+c]^w.u8[4*Nb*r+4*c+3]; - } -} - -/* - * Transformation in the Cipher that takes all of the columns of the - * State and mixes their data (independently of one another) to - * produce new columns. - */ -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_mix_columns(uint8_t *state) { - - int Nb = simde_x_aes_Nb; - // uint8_t k[] = {0x02, 0x01, 0x01, 0x03}; // a(x) = {02} + {01}x + {01}x2 + {03}x3 - uint8_t i, j, col[4], res[4]; - - for (j = 0; j < Nb; j++) { - for (i = 0; i < 4; i++) { - col[i] = state[Nb*i+j]; - } - - //coef_mult(k, col, res); - simde_x_aes_coef_mult_lookup(0, col, res); - - for (i = 0; i < 4; i++) { - state[Nb*i+j] = res[i]; - } - } -} - -/* - * Transformation in the Inverse Cipher that is the inverse of - * MixColumns(). - */ -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_inv_mix_columns(uint8_t *state) { - - int Nb = simde_x_aes_Nb; - // uint8_t k[] = {0x0e, 0x09, 0x0d, 0x0b}; // a(x) = {0e} + {09}x + {0d}x2 + {0b}x3 - uint8_t i, j, col[4], res[4]; - - for (j = 0; j < Nb; j++) { - for (i = 0; i < 4; i++) { - col[i] = state[Nb*i+j]; - } - - //coef_mult(k, col, res); - simde_x_aes_coef_mult_lookup(4, col, res); - - for (i = 0; i < 4; i++) { - state[Nb*i+j] = res[i]; - } - } -} - -/* - * Transformation in the Cipher that processes the State by cyclically - * shifting the last three rows of the State by different offsets. 
- */ -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_shift_rows(uint8_t *state) { - - int Nb = simde_x_aes_Nb; - uint8_t i, k, s, tmp; - - for (i = 1; i < 4; i++) { - // shift(1,4)=1; shift(2,4)=2; shift(3,4)=3 - // shift(r, 4) = r; - s = 0; - while (s < i) { - tmp = state[Nb*i+0]; - - for (k = 1; k < Nb; k++) { - state[Nb*i+k-1] = state[Nb*i+k]; - } - - state[Nb*i+Nb-1] = tmp; - s++; - } - } -} - -/* - * Transformation in the Inverse Cipher that is the inverse of - * ShiftRows(). - */ -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_inv_shift_rows(uint8_t *state) { - - uint8_t Nb = simde_x_aes_Nb; - uint8_t i, k, s, tmp; - - for (i = 1; i < 4; i++) { - s = 0; - while (s < i) { - tmp = state[Nb*i+Nb-1]; - - for (k = Nb-1; k > 0; k--) { - state[Nb*i+k] = state[Nb*i+k-1]; - } - - state[Nb*i+0] = tmp; - s++; - } - } -} - -/* - * Transformation in the Cipher that processes the State using a non - * linear byte substitution table (S-box) that operates on each of the - * State bytes independently. - */ -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_sub_bytes(uint8_t *state) { - - int Nb = simde_x_aes_Nb; - uint8_t i, j; - - for (i = 0; i < 4; i++) { - for (j = 0; j < Nb; j++) { - // s_box row: yyyy ---- - // s_box col: ---- xxxx - // s_box[16*(yyyy) + xxxx] == s_box[yyyyxxxx] - state[Nb*i+j] = simde_x_aes_s_box[state[Nb*i+j]]; - } - } -} - -/* - * Transformation in the Inverse Cipher that is the inverse of - * SubBytes(). - */ -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_inv_sub_bytes(uint8_t *state) { - - int Nb = simde_x_aes_Nb; - uint8_t i, j; - - for (i = 0; i < 4; i++) { - for (j = 0; j < Nb; j++) { - state[Nb*i+j] = simde_x_aes_inv_s_box[state[Nb*i+j]]; - } - } -} - -/* - * Performs the AES cipher operation - */ -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_enc(simde__m128i_private in, simde__m128i_private *out, simde__m128i_private w, int is_last) { - - int Nb = simde_x_aes_Nb; - uint8_t state[4*simde_x_aes_Nb]; - uint8_t r = 0, i, j; - - for (i = 0; i < 4; i++) { - for (j = 0; j < Nb; j++) { - state[Nb*i+j] = in.u8[i+4*j]; - } - } - - simde_x_aes_sub_bytes(state); - simde_x_aes_shift_rows(state); - - if (!is_last) - simde_x_aes_mix_columns(state); - - simde_x_aes_add_round_key(state, w, r); - - for (i = 0; i < 4; i++) { - for (j = 0; j < Nb; j++) { - out->u8[i+4*j] = state[Nb*i+j]; - } - } -} - -/* - * Performs the AES inverse cipher operation - */ -SIMDE_FUNCTION_ATTRIBUTES -void simde_x_aes_dec(simde__m128i_private in, simde__m128i_private *out, simde__m128i_private w, int is_last) { - - int Nb = simde_x_aes_Nb; - uint8_t state[4*simde_x_aes_Nb]; - uint8_t r = 0, i, j; - - for (i = 0; i < 4; i++) { - for (j = 0; j < Nb; j++) { - state[Nb*i+j] = in.u8[i+4*j]; - } - } - - simde_x_aes_inv_shift_rows(state); - simde_x_aes_inv_sub_bytes(state); - - if (!is_last) - simde_x_aes_inv_mix_columns(state); - - simde_x_aes_add_round_key(state, w, r); - - for (i = 0; i < 4; i++) { - for (j = 0; j < Nb; j++) { - out->u8[i+4*j] = state[Nb*i+j]; - } - } -} -#endif // if !(defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO)) - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i simde_mm_aesenc_si128(simde__m128i a, simde__m128i round_key) { - #if defined(SIMDE_X86_AES_NATIVE) - return _mm_aesenc_si128(a, round_key); - #else - simde__m128i_private result_; - simde__m128i_private a_ = simde__m128i_to_private(a); - simde__m128i_private round_key_ = simde__m128i_to_private(round_key); - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) - result_.neon_u8 = veorq_u8( - 
vaesmcq_u8(vaeseq_u8(a_.neon_u8, vdupq_n_u8(0))), - round_key_.neon_u8); - #else - simde_x_aes_enc(a_, &result_, round_key_, 0); - #endif - return simde__m128i_from_private(result_); - #endif -} -#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) - #define _mm_aesenc_si128(a, b) simde_mm_aesenc_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i simde_mm_aesdec_si128(simde__m128i a, simde__m128i round_key) { - #if defined(SIMDE_X86_AES_NATIVE) - return _mm_aesdec_si128(a, round_key); - #else - simde__m128i_private result_; - simde__m128i_private a_ = simde__m128i_to_private(a); - simde__m128i_private round_key_ = simde__m128i_to_private(round_key); - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) - result_.neon_u8 = veorq_u8( - vaesimcq_u8(vaesdq_u8(a_.neon_u8, vdupq_n_u8(0))), - round_key_.neon_u8); - #else - simde_x_aes_dec(a_, &result_, round_key_, 0); - #endif - return simde__m128i_from_private(result_); - #endif -} -#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) - #define _mm_aesdec_si128(a, b) simde_mm_aesdec_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i simde_mm_aesenclast_si128(simde__m128i a, simde__m128i round_key) { - #if defined(SIMDE_X86_AES_NATIVE) - return _mm_aesenclast_si128(a, round_key); - #else - simde__m128i_private result_; - simde__m128i_private a_ = simde__m128i_to_private(a); - simde__m128i_private round_key_ = simde__m128i_to_private(round_key); - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) - result_.neon_u8 = vaeseq_u8(a_.neon_u8, vdupq_n_u8(0)); - result_.neon_i32 = veorq_s32(result_.neon_i32, round_key_.neon_i32); // _mm_xor_si128 - #else - simde_x_aes_enc(a_, &result_, round_key_, 1); - #endif - return simde__m128i_from_private(result_); - #endif -} -#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) - #define _mm_aesenclast_si128(a, b) simde_mm_aesenclast_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i simde_mm_aesdeclast_si128(simde__m128i a, simde__m128i round_key) { - #if defined(SIMDE_X86_AES_NATIVE) - return _mm_aesdeclast_si128(a, round_key); - #else - simde__m128i_private result_; - simde__m128i_private a_ = simde__m128i_to_private(a); - simde__m128i_private round_key_ = simde__m128i_to_private(round_key); - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) - result_.neon_u8 = veorq_u8( - vaesdq_u8(a_.neon_u8, vdupq_n_u8(0)), - round_key_.neon_u8); - #else - simde_x_aes_dec(a_, &result_, round_key_, 1); - #endif - return simde__m128i_from_private(result_); - #endif -} -#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) - #define _mm_aesdeclast_si128(a, b) simde_mm_aesdeclast_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i simde_mm_aesimc_si128(simde__m128i a) { - #if defined(SIMDE_X86_AES_NATIVE) - return _mm_aesimc_si128(a); - #else - simde__m128i_private result_ = simde__m128i_to_private(simde_mm_setzero_si128()); - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_ARCH_ARM_CRYPTO) - result_.neon_u8 = vaesimcq_u8(a_.neon_u8); - #else - int Nb = simde_x_aes_Nb; - // uint8_t k[] = {0x0e, 0x09, 0x0d, 0x0b}; // a(x) = {0e} + {09}x + {0d}x2 + {0b}x3 - uint8_t i, j, col[4], res[4]; - - for (j = 0; j < Nb; j++) { - for (i = 0; i < 4; i++) { - col[i] = a_.u8[Nb*j+i]; - } - - //coef_mult(k, col, res); - simde_x_aes_coef_mult_lookup(4, col, res); - - for (i = 0; i < 4; i++) { - result_.u8[Nb*j+i] = res[i]; - } - } - #endif - return 
simde__m128i_from_private(result_); - #endif -} -#if defined(SIMDE_X86_AES_ENABLE_NATIVE_ALIASES) - #define _mm_aesimc_si128(a) simde_mm_aesimc_si128(a) -#endif - -#undef simde_x_aes_Nb - -#endif /* !defined(SIMDE_X86_AES_H) */ diff --git a/ffi-deps/simde/simde/x86/avx.h b/ffi-deps/simde/simde/x86/avx.h deleted file mode 100644 index 2314f95..0000000 --- a/ffi-deps/simde/simde/x86/avx.h +++ /dev/null @@ -1,6267 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2018-2020 Evan Nemerson - * 2020 Michael R. Crusoe - */ - -#include "sse.h" -#if !defined(SIMDE_X86_AVX_H) -#define SIMDE_X86_AVX_H - -#include "sse4.2.h" -#include "../simde-f16.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_ALIGN_TO_32 int8_t i8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int16_t i16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int32_t i32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int64_t i64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint8_t u8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint16_t u16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint32_t u32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint64_t u64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_32 simde_int128 i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #endif - SIMDE_ALIGN_TO_32 simde_float32 f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 simde_float64 f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int_fast32_t i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint_fast32_t u32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_32 int8_t i8[32]; - SIMDE_ALIGN_TO_32 int16_t i16[16]; - SIMDE_ALIGN_TO_32 int32_t i32[8]; - SIMDE_ALIGN_TO_32 int64_t i64[4]; - SIMDE_ALIGN_TO_32 uint8_t u8[32]; - SIMDE_ALIGN_TO_32 uint16_t u16[16]; - SIMDE_ALIGN_TO_32 uint32_t u32[8]; - SIMDE_ALIGN_TO_32 uint64_t u64[4]; - SIMDE_ALIGN_TO_32 int_fast32_t i32f[32 / sizeof(int_fast32_t)]; - SIMDE_ALIGN_TO_32 uint_fast32_t u32f[32 / sizeof(uint_fast32_t)]; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_32 simde_int128 i128[2]; - SIMDE_ALIGN_TO_32 simde_uint128 u128[2]; - #endif - SIMDE_ALIGN_TO_32 simde_float32 f32[8]; - SIMDE_ALIGN_TO_32 simde_float64 f64[4]; - 
#endif - - SIMDE_ALIGN_TO_32 simde__m128_private m128_private[2]; - SIMDE_ALIGN_TO_32 simde__m128 m128[2]; - - #if defined(SIMDE_X86_AVX_NATIVE) - SIMDE_ALIGN_TO_32 __m256 n; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(int) altivec_i32[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32[2]; - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(long long) altivec_i64[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[2]; - #endif - #endif -} simde__m256_private; - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_ALIGN_TO_32 int8_t i8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int16_t i16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int32_t i32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int64_t i64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint8_t u8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint16_t u16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint32_t u32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint64_t u64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_32 simde_int128 i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #endif - SIMDE_ALIGN_TO_32 simde_float32 f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 simde_float64 f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int_fast32_t i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint_fast32_t u32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_32 int8_t i8[32]; - SIMDE_ALIGN_TO_32 int16_t i16[16]; - SIMDE_ALIGN_TO_32 int32_t i32[8]; - SIMDE_ALIGN_TO_32 int64_t i64[4]; - SIMDE_ALIGN_TO_32 uint8_t u8[32]; - SIMDE_ALIGN_TO_32 uint16_t u16[16]; - SIMDE_ALIGN_TO_32 uint32_t u32[8]; - SIMDE_ALIGN_TO_32 uint64_t u64[4]; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_32 simde_int128 i128[2]; - SIMDE_ALIGN_TO_32 simde_uint128 u128[2]; - #endif - SIMDE_ALIGN_TO_32 simde_float32 f32[8]; - SIMDE_ALIGN_TO_32 simde_float64 f64[4]; - SIMDE_ALIGN_TO_32 int_fast32_t i32f[32 / sizeof(int_fast32_t)]; - SIMDE_ALIGN_TO_32 uint_fast32_t u32f[32 / sizeof(uint_fast32_t)]; - #endif - - SIMDE_ALIGN_TO_32 simde__m128d_private m128d_private[2]; - SIMDE_ALIGN_TO_32 simde__m128d m128d[2]; - - #if defined(SIMDE_X86_AVX_NATIVE) - SIMDE_ALIGN_TO_32 __m256d n; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32[2]; - #if 
defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[2]; - #endif - #endif -} simde__m256d_private; - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_ALIGN_TO_32 int8_t i8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int16_t i16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int32_t i32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int64_t i64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint8_t u8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint16_t u16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint32_t u32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint64_t u64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_32 simde_int128 i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #endif - #if defined(SIMDE_FLOAT16_VECTOR) - SIMDE_ALIGN_TO_32 simde_float16 f16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_32 simde_float16 f16[16]; - #endif - SIMDE_ALIGN_TO_32 simde_float32 f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 simde_float64 f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int_fast32_t i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint_fast32_t u32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_32 int8_t i8[32]; - SIMDE_ALIGN_TO_32 int16_t i16[16]; - SIMDE_ALIGN_TO_32 int32_t i32[8]; - SIMDE_ALIGN_TO_32 int64_t i64[4]; - SIMDE_ALIGN_TO_32 uint8_t u8[32]; - SIMDE_ALIGN_TO_32 uint16_t u16[16]; - SIMDE_ALIGN_TO_32 uint32_t u32[8]; - SIMDE_ALIGN_TO_32 uint64_t u64[4]; - SIMDE_ALIGN_TO_32 int_fast32_t i32f[32 / sizeof(int_fast32_t)]; - SIMDE_ALIGN_TO_32 uint_fast32_t u32f[32 / sizeof(uint_fast32_t)]; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_32 simde_int128 i128[2]; - SIMDE_ALIGN_TO_32 simde_uint128 u128[2]; - #endif - SIMDE_ALIGN_TO_32 simde_float16 f16[16]; - SIMDE_ALIGN_TO_32 simde_float32 f32[8]; - SIMDE_ALIGN_TO_32 simde_float64 f64[4]; - #endif - - SIMDE_ALIGN_TO_32 simde__m128i_private m128i_private[2]; - SIMDE_ALIGN_TO_32 simde__m128i m128i[2]; - - #if defined(SIMDE_X86_AVX_NATIVE) - SIMDE_ALIGN_TO_32 __m256i n; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32[2]; - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[2]; - #endif - #endif -} simde__m256i_private; - -#if defined(SIMDE_X86_AVX_NATIVE) - typedef __m256 simde__m256; - typedef __m256i simde__m256i; - typedef __m256d simde__m256d; -#elif defined(SIMDE_VECTOR_SUBSCRIPT) - typedef simde_float32 simde__m256 SIMDE_ALIGN_TO_32 
SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - typedef int_fast32_t simde__m256i SIMDE_ALIGN_TO_32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - typedef simde_float64 simde__m256d SIMDE_ALIGN_TO_32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; -#else - typedef simde__m256_private simde__m256; - typedef simde__m256i_private simde__m256i; - typedef simde__m256d_private simde__m256d; -#endif - -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #if !defined(HEDLEY_INTEL_VERSION) && !defined(_AVXINTRIN_H_INCLUDED) && !defined(__AVXINTRIN_H) && !defined(_CMP_EQ_OQ) - typedef simde__m256 __m256; - typedef simde__m256i __m256i; - typedef simde__m256d __m256d; - #else - #undef __m256 - #define __m256 simde__m256 - #undef __m256i - #define __m256i simde__m256i - #undef __m256d - #define __m256d simde__m256d - #endif -#endif - -HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256), "simde__m256 size incorrect"); -HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256_private), "simde__m256_private size incorrect"); -HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256i), "simde__m256i size incorrect"); -HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256i_private), "simde__m256i_private size incorrect"); -HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256d), "simde__m256d size incorrect"); -HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256d_private), "simde__m256d_private size incorrect"); -#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF) -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256) == 32, "simde__m256 is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256_private) == 32, "simde__m256_private is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256i) == 32, "simde__m256i is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256i_private) == 32, "simde__m256i_private is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256d) == 32, "simde__m256d is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256d_private) == 32, "simde__m256d_private is not 32-byte aligned"); -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde__m256_from_private(simde__m256_private v) { - simde__m256 r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256_private -simde__m256_to_private(simde__m256 v) { - simde__m256_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde__m256i_from_private(simde__m256i_private v) { - simde__m256i r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i_private -simde__m256i_to_private(simde__m256i v) { - simde__m256i_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde__m256d_from_private(simde__m256d_private v) { - simde__m256d r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d_private -simde__m256d_to_private(simde__m256d v) { - simde__m256d_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -#define SIMDE_CMP_EQ_OQ 0 -#define SIMDE_CMP_LT_OS 1 -#define SIMDE_CMP_LE_OS 2 -#define SIMDE_CMP_UNORD_Q 3 -#define SIMDE_CMP_NEQ_UQ 4 -#define SIMDE_CMP_NLT_US 5 -#define SIMDE_CMP_NLE_US 6 -#define SIMDE_CMP_ORD_Q 7 -#define SIMDE_CMP_EQ_UQ 8 -#define SIMDE_CMP_NGE_US 9 -#define SIMDE_CMP_NGT_US 10 -#define SIMDE_CMP_FALSE_OQ 11 -#define SIMDE_CMP_NEQ_OQ 12 -#define SIMDE_CMP_GE_OS 13 -#define SIMDE_CMP_GT_OS 14 -#define SIMDE_CMP_TRUE_UQ 15 -#define SIMDE_CMP_EQ_OS 16 -#define SIMDE_CMP_LT_OQ 17 -#define 
SIMDE_CMP_LE_OQ 18 -#define SIMDE_CMP_UNORD_S 19 -#define SIMDE_CMP_NEQ_US 20 -#define SIMDE_CMP_NLT_UQ 21 -#define SIMDE_CMP_NLE_UQ 22 -#define SIMDE_CMP_ORD_S 23 -#define SIMDE_CMP_EQ_US 24 -#define SIMDE_CMP_NGE_UQ 25 -#define SIMDE_CMP_NGT_UQ 26 -#define SIMDE_CMP_FALSE_OS 27 -#define SIMDE_CMP_NEQ_OS 28 -#define SIMDE_CMP_GE_OQ 29 -#define SIMDE_CMP_GT_OQ 30 -#define SIMDE_CMP_TRUE_US 31 - -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) && !defined(_CMP_EQ_OQ) -#define _CMP_EQ_OQ SIMDE_CMP_EQ_OQ -#define _CMP_LT_OS SIMDE_CMP_LT_OS -#define _CMP_LE_OS SIMDE_CMP_LE_OS -#define _CMP_UNORD_Q SIMDE_CMP_UNORD_Q -#define _CMP_NEQ_UQ SIMDE_CMP_NEQ_UQ -#define _CMP_NLT_US SIMDE_CMP_NLT_US -#define _CMP_NLE_US SIMDE_CMP_NLE_US -#define _CMP_ORD_Q SIMDE_CMP_ORD_Q -#define _CMP_EQ_UQ SIMDE_CMP_EQ_UQ -#define _CMP_NGE_US SIMDE_CMP_NGE_US -#define _CMP_NGT_US SIMDE_CMP_NGT_US -#define _CMP_FALSE_OQ SIMDE_CMP_FALSE_OQ -#define _CMP_NEQ_OQ SIMDE_CMP_NEQ_OQ -#define _CMP_GE_OS SIMDE_CMP_GE_OS -#define _CMP_GT_OS SIMDE_CMP_GT_OS -#define _CMP_TRUE_UQ SIMDE_CMP_TRUE_UQ -#define _CMP_EQ_OS SIMDE_CMP_EQ_OS -#define _CMP_LT_OQ SIMDE_CMP_LT_OQ -#define _CMP_LE_OQ SIMDE_CMP_LE_OQ -#define _CMP_UNORD_S SIMDE_CMP_UNORD_S -#define _CMP_NEQ_US SIMDE_CMP_NEQ_US -#define _CMP_NLT_UQ SIMDE_CMP_NLT_UQ -#define _CMP_NLE_UQ SIMDE_CMP_NLE_UQ -#define _CMP_ORD_S SIMDE_CMP_ORD_S -#define _CMP_EQ_US SIMDE_CMP_EQ_US -#define _CMP_NGE_UQ SIMDE_CMP_NGE_UQ -#define _CMP_NGT_UQ SIMDE_CMP_NGT_UQ -#define _CMP_FALSE_OS SIMDE_CMP_FALSE_OS -#define _CMP_NEQ_OS SIMDE_CMP_NEQ_OS -#define _CMP_GE_OQ SIMDE_CMP_GE_OQ -#define _CMP_GT_OQ SIMDE_CMP_GT_OQ -#define _CMP_TRUE_US SIMDE_CMP_TRUE_US -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_castps_pd (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_castps_pd(a); - #else - return *HEDLEY_REINTERPRET_CAST(simde__m256d*, &a); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_castps_pd - #define _mm256_castps_pd(a) simde_mm256_castps_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_castps_si256 (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_castps_si256(a); - #else - return *HEDLEY_REINTERPRET_CAST(simde__m256i*, &a); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_castps_si256 - #define _mm256_castps_si256(a) simde_mm256_castps_si256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_castsi256_pd (simde__m256i a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_castsi256_pd(a); - #else - return *HEDLEY_REINTERPRET_CAST(simde__m256d*, &a); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_castsi256_pd - #define _mm256_castsi256_pd(a) simde_mm256_castsi256_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_castsi256_ps (simde__m256i a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_castsi256_ps(a); - #else - return *HEDLEY_REINTERPRET_CAST(simde__m256*, &a); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_castsi256_ps - #define _mm256_castsi256_ps(a) simde_mm256_castsi256_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_castpd_ps (simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_castpd_ps(a); - #else - return *HEDLEY_REINTERPRET_CAST(simde__m256*, &a); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_castpd_ps - #define _mm256_castpd_ps(a) simde_mm256_castpd_ps(a) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_castpd_si256 (simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_castpd_si256(a); - #else - return *HEDLEY_REINTERPRET_CAST(simde__m256i*, &a); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_castpd_si256 - #define _mm256_castpd_si256(a) simde_mm256_castpd_si256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_setzero_si256 (void) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_setzero_si256(); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_setzero_si128(); - r_.m128i[1] = simde_mm_setzero_si128(); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = 0; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setzero_si256 - #define _mm256_setzero_si256() simde_mm256_setzero_si256() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_setzero_ps (void) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_setzero_ps(); - #else - return simde_mm256_castsi256_ps(simde_mm256_setzero_si256()); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setzero_ps - #define _mm256_setzero_ps() simde_mm256_setzero_ps() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_setzero_pd (void) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_setzero_pd(); - #else - return simde_mm256_castsi256_pd(simde_mm256_setzero_si256()); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setzero_pd - #define _mm256_setzero_pd() simde_mm256_setzero_pd() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_x_mm256_not_ps(simde__m256 a) { - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = ~a_.i32; - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) - r_.m128[0] = simde_x_mm_not_ps(a_.m128[0]); - r_.m128[1] = simde_x_mm_not_ps(a_.m128[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ~(a_.i32[i]); - } - #endif - - return simde__m256_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_x_mm256_select_ps(simde__m256 a, simde__m256 b, simde__m256 mask) { - /* This function is for when you want to blend two elements together - * according to a mask. It is similar to _mm256_blendv_ps, except that - * it is undefined whether the blend is based on the highest bit in - * each lane (like blendv) or just bitwise operations. This allows - * us to implement the function efficiently everywhere. - * - * Basically, you promise that all the lanes in mask are either 0 or - * ~0. 
*/ - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_blendv_ps(a, b, mask); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b), - mask_ = simde__m256_to_private(mask); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 ^ ((a_.i32 ^ b_.i32) & mask_.i32); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) - r_.m128[0] = simde_x_mm_select_ps(a_.m128[0], b_.m128[0], mask_.m128[0]); - r_.m128[1] = simde_x_mm_select_ps(a_.m128[1], b_.m128[1], mask_.m128[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] ^ ((a_.i32[i] ^ b_.i32[i]) & mask_.i32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_x_mm256_not_pd(simde__m256d a) { - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = ~a_.i64; - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) - r_.m128d[0] = simde_x_mm_not_pd(a_.m128d[0]); - r_.m128d[1] = simde_x_mm_not_pd(a_.m128d[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = ~(a_.i64[i]); - } - #endif - - return simde__m256d_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_x_mm256_select_pd(simde__m256d a, simde__m256d b, simde__m256d mask) { - /* This function is for when you want to blend two elements together - * according to a mask. It is similar to _mm256_blendv_pd, except that - * it is undefined whether the blend is based on the highest bit in - * each lane (like blendv) or just bitwise operations. This allows - * us to implement the function efficiently everywhere. - * - * Basically, you promise that all the lanes in mask are either 0 or - * ~0. 
*/ - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_blendv_pd(a, b, mask); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b), - mask_ = simde__m256d_to_private(mask); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 ^ ((a_.i64 ^ b_.i64) & mask_.i64); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) - r_.m128d[0] = simde_x_mm_select_pd(a_.m128d[0], b_.m128d[0], mask_.m128d[0]); - r_.m128d[1] = simde_x_mm_select_pd(a_.m128d[1], b_.m128d[1], mask_.m128d[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] ^ ((a_.i64[i] ^ b_.i64[i]) & mask_.i64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_setone_si256 (void) { - simde__m256i_private r_; - -#if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - __typeof__(r_.i32f) rv = { 0, }; - r_.i32f = ~rv; -#elif defined(SIMDE_X86_AVX2_NATIVE) - __m256i t = _mm256_setzero_si256(); - r_.n = _mm256_cmpeq_epi32(t, t); -#else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = ~HEDLEY_STATIC_CAST(int_fast32_t, 0); - } -#endif - - return simde__m256i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_x_mm256_setone_ps (void) { - return simde_mm256_castsi256_ps(simde_x_mm256_setone_si256()); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_x_mm256_setone_pd (void) { - return simde_mm256_castsi256_pd(simde_x_mm256_setone_si256()); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_set_epi8 (int8_t e31, int8_t e30, int8_t e29, int8_t e28, - int8_t e27, int8_t e26, int8_t e25, int8_t e24, - int8_t e23, int8_t e22, int8_t e21, int8_t e20, - int8_t e19, int8_t e18, int8_t e17, int8_t e16, - int8_t e15, int8_t e14, int8_t e13, int8_t e12, - int8_t e11, int8_t e10, int8_t e9, int8_t e8, - int8_t e7, int8_t e6, int8_t e5, int8_t e4, - int8_t e3, int8_t e2, int8_t e1, int8_t e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set_epi8(e31, e30, e29, e28, e27, e26, e25, e24, - e23, e22, e21, e20, e19, e18, e17, e16, - e15, e14, e13, e12, e11, e10, e9, e8, - e7, e6, e5, e4, e3, e2, e1, e0); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_set_epi8( - e15, e14, e13, e12, e11, e10, e9, e8, - e7, e6, e5, e4, e3, e2, e1, e0); - r_.m128i[1] = simde_mm_set_epi8( - e31, e30, e29, e28, e27, e26, e25, e24, - e23, e22, e21, e20, e19, e18, e17, e16); - #else - r_.i8[ 0] = e0; - r_.i8[ 1] = e1; - r_.i8[ 2] = e2; - r_.i8[ 3] = e3; - r_.i8[ 4] = e4; - r_.i8[ 5] = e5; - r_.i8[ 6] = e6; - r_.i8[ 7] = e7; - r_.i8[ 8] = e8; - r_.i8[ 9] = e9; - r_.i8[10] = e10; - r_.i8[11] = e11; - r_.i8[12] = e12; - r_.i8[13] = e13; - r_.i8[14] = e14; - r_.i8[15] = e15; - r_.i8[16] = e16; - r_.i8[17] = e17; - r_.i8[18] = e18; - r_.i8[19] = e19; - r_.i8[20] = e20; - r_.i8[21] = e21; - r_.i8[22] = e22; - r_.i8[23] = e23; - r_.i8[24] = e24; - r_.i8[25] = e25; - r_.i8[26] = e26; - r_.i8[27] = e27; - r_.i8[28] = e28; - r_.i8[29] = e29; - r_.i8[30] = e30; - r_.i8[31] = e31; - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set_epi8 - #define _mm256_set_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \ - simde_mm256_set_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, 
e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_set_epi16 (int16_t e15, int16_t e14, int16_t e13, int16_t e12, - int16_t e11, int16_t e10, int16_t e9, int16_t e8, - int16_t e7, int16_t e6, int16_t e5, int16_t e4, - int16_t e3, int16_t e2, int16_t e1, int16_t e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8, - e7, e6, e5, e4, e3, e2, e1, e0); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_set_epi16( e7, e6, e5, e4, e3, e2, e1, e0); - r_.m128i[1] = simde_mm_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8); - #else - r_.i16[ 0] = e0; - r_.i16[ 1] = e1; - r_.i16[ 2] = e2; - r_.i16[ 3] = e3; - r_.i16[ 4] = e4; - r_.i16[ 5] = e5; - r_.i16[ 6] = e6; - r_.i16[ 7] = e7; - r_.i16[ 8] = e8; - r_.i16[ 9] = e9; - r_.i16[10] = e10; - r_.i16[11] = e11; - r_.i16[12] = e12; - r_.i16[13] = e13; - r_.i16[14] = e14; - r_.i16[15] = e15; - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set_epi16 - #define _mm256_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \ - simde_mm256_set_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_set_epi32 (int32_t e7, int32_t e6, int32_t e5, int32_t e4, - int32_t e3, int32_t e2, int32_t e1, int32_t e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_set_epi32(e3, e2, e1, e0); - r_.m128i[1] = simde_mm_set_epi32(e7, e6, e5, e4); - #else - r_.i32[ 0] = e0; - r_.i32[ 1] = e1; - r_.i32[ 2] = e2; - r_.i32[ 3] = e3; - r_.i32[ 4] = e4; - r_.i32[ 5] = e5; - r_.i32[ 6] = e6; - r_.i32[ 7] = e7; - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set_epi32 - #define _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0) \ - simde_mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_set_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set_epi64x(e3, e2, e1, e0); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_set_epi64x(e1, e0); - r_.m128i[1] = simde_mm_set_epi64x(e3, e2); - #else - r_.i64[0] = e0; - r_.i64[1] = e1; - r_.i64[2] = e2; - r_.i64[3] = e3; - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set_epi64x - #define _mm256_set_epi64x(e3, e2, e1, e0) simde_mm256_set_epi64x(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_set_epu8 (uint8_t e31, uint8_t e30, uint8_t e29, uint8_t e28, - uint8_t e27, uint8_t e26, uint8_t e25, uint8_t e24, - uint8_t e23, uint8_t e22, uint8_t e21, uint8_t e20, - uint8_t e19, uint8_t e18, uint8_t e17, uint8_t e16, - uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12, - uint8_t e11, uint8_t e10, uint8_t e9, uint8_t e8, - uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4, - uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0) { - simde__m256i_private r_; - - r_.u8[ 0] = e0; - r_.u8[ 1] = e1; - r_.u8[ 2] = e2; - r_.u8[ 3] = e3; - r_.u8[ 4] = e4; - r_.u8[ 5] = e5; 
- r_.u8[ 6] = e6; - r_.u8[ 7] = e7; - r_.u8[ 8] = e8; - r_.u8[ 9] = e9; - r_.u8[10] = e10; - r_.u8[11] = e11; - r_.u8[12] = e12; - r_.u8[13] = e13; - r_.u8[14] = e14; - r_.u8[15] = e15; - r_.u8[16] = e16; - r_.u8[17] = e17; - r_.u8[18] = e18; - r_.u8[19] = e19; - r_.u8[20] = e20; - r_.u8[20] = e20; - r_.u8[21] = e21; - r_.u8[22] = e22; - r_.u8[23] = e23; - r_.u8[24] = e24; - r_.u8[25] = e25; - r_.u8[26] = e26; - r_.u8[27] = e27; - r_.u8[28] = e28; - r_.u8[29] = e29; - r_.u8[30] = e30; - r_.u8[31] = e31; - - return simde__m256i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_set_epu16 (uint16_t e15, uint16_t e14, uint16_t e13, uint16_t e12, - uint16_t e11, uint16_t e10, uint16_t e9, uint16_t e8, - uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4, - uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) { - simde__m256i_private r_; - - r_.u16[ 0] = e0; - r_.u16[ 1] = e1; - r_.u16[ 2] = e2; - r_.u16[ 3] = e3; - r_.u16[ 4] = e4; - r_.u16[ 5] = e5; - r_.u16[ 6] = e6; - r_.u16[ 7] = e7; - r_.u16[ 8] = e8; - r_.u16[ 9] = e9; - r_.u16[10] = e10; - r_.u16[11] = e11; - r_.u16[12] = e12; - r_.u16[13] = e13; - r_.u16[14] = e14; - r_.u16[15] = e15; - - return simde__m256i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_set_epu32 (uint32_t e7, uint32_t e6, uint32_t e5, uint32_t e4, - uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set_epi32(HEDLEY_STATIC_CAST(int32_t, e7), HEDLEY_STATIC_CAST(int32_t, e6), HEDLEY_STATIC_CAST(int32_t, e5), HEDLEY_STATIC_CAST(int32_t, e4), - HEDLEY_STATIC_CAST(int32_t, e3), HEDLEY_STATIC_CAST(int32_t, e2), HEDLEY_STATIC_CAST(int32_t, e1), HEDLEY_STATIC_CAST(int32_t, e0)); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_set_epi32(HEDLEY_STATIC_CAST(int32_t, e3), HEDLEY_STATIC_CAST(int32_t, e2), HEDLEY_STATIC_CAST(int32_t, e1), HEDLEY_STATIC_CAST(int32_t, e0)); - r_.m128i[1] = simde_mm_set_epi32(HEDLEY_STATIC_CAST(int32_t, e7), HEDLEY_STATIC_CAST(int32_t, e6), HEDLEY_STATIC_CAST(int32_t, e5), HEDLEY_STATIC_CAST(int32_t, e4)); - #else - r_.u32[ 0] = e0; - r_.u32[ 1] = e1; - r_.u32[ 2] = e2; - r_.u32[ 3] = e3; - r_.u32[ 4] = e4; - r_.u32[ 5] = e5; - r_.u32[ 6] = e6; - r_.u32[ 7] = e7; - #endif - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_set_epu64x (uint64_t e3, uint64_t e2, uint64_t e1, uint64_t e0) { - simde__m256i_private r_; - - r_.u64[0] = e0; - r_.u64[1] = e1; - r_.u64[2] = e2; - r_.u64[3] = e3; - - return simde__m256i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_set_ps (simde_float32 e7, simde_float32 e6, simde_float32 e5, simde_float32 e4, - simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0); - #else - simde__m256_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_set_ps(e3, e2, e1, e0); - r_.m128[1] = simde_mm_set_ps(e7, e6, e5, e4); - #else - r_.f32[0] = e0; - r_.f32[1] = e1; - r_.f32[2] = e2; - r_.f32[3] = e3; - r_.f32[4] = e4; - r_.f32[5] = e5; - r_.f32[6] = e6; - r_.f32[7] = e7; - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set_ps - #define _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0) \ - simde_mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES 
-simde__m256d -simde_mm256_set_pd (simde_float64 e3, simde_float64 e2, simde_float64 e1, simde_float64 e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set_pd(e3, e2, e1, e0); - #else - simde__m256d_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_set_pd(e1, e0); - r_.m128d[1] = simde_mm_set_pd(e3, e2); - #else - r_.f64[0] = e0; - r_.f64[1] = e1; - r_.f64[2] = e2; - r_.f64[3] = e3; - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set_pd - #define _mm256_set_pd(e3, e2, e1, e0) \ - simde_mm256_set_pd(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_set_m128 (simde__m128 e1, simde__m128 e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_insertf128_ps(_mm256_castps128_ps256(e0), e1, 1); - #else - simde__m256_private r_; - simde__m128_private - e1_ = simde__m128_to_private(e1), - e0_ = simde__m128_to_private(e0); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128_private[0] = e0_; - r_.m128_private[1] = e1_; - #elif defined(SIMDE_HAVE_INT128_) - r_.i128[0] = e0_.i128[0]; - r_.i128[1] = e1_.i128[0]; - #else - r_.i64[0] = e0_.i64[0]; - r_.i64[1] = e0_.i64[1]; - r_.i64[2] = e1_.i64[0]; - r_.i64[3] = e1_.i64[1]; - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set_m128 - #define _mm256_set_m128(e1, e0) simde_mm256_set_m128(e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_set_m128d (simde__m128d e1, simde__m128d e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_insertf128_pd(_mm256_castpd128_pd256(e0), e1, 1); - #else - simde__m256d_private r_; - simde__m128d_private - e1_ = simde__m128d_to_private(e1), - e0_ = simde__m128d_to_private(e0); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d_private[0] = e0_; - r_.m128d_private[1] = e1_; - #else - r_.i64[0] = e0_.i64[0]; - r_.i64[1] = e0_.i64[1]; - r_.i64[2] = e1_.i64[0]; - r_.i64[3] = e1_.i64[1]; - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set_m128d - #define _mm256_set_m128d(e1, e0) simde_mm256_set_m128d(e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_set_m128i (simde__m128i e1, simde__m128i e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_insertf128_si256(_mm256_castsi128_si256(e0), e1, 1); - #else - simde__m256i_private r_; - simde__m128i_private - e1_ = simde__m128i_to_private(e1), - e0_ = simde__m128i_to_private(e0); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i_private[0] = e0_; - r_.m128i_private[1] = e1_; - #else - r_.i64[0] = e0_.i64[0]; - r_.i64[1] = e0_.i64[1]; - r_.i64[2] = e1_.i64[0]; - r_.i64[3] = e1_.i64[1]; - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set_m128i - #define _mm256_set_m128i(e1, e0) simde_mm256_set_m128i(e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_set1_epi8 (int8_t a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set1_epi8(a); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_set1_epi8(a); - r_.m128i[1] = simde_mm_set1_epi8(a); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef 
_mm256_set1_epi8 - #define _mm256_set1_epi8(a) simde_mm256_set1_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_set1_epi16 (int16_t a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set1_epi16(a); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_set1_epi16(a); - r_.m128i[1] = simde_mm_set1_epi16(a); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set1_epi16 - #define _mm256_set1_epi16(a) simde_mm256_set1_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_set1_epi32 (int32_t a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set1_epi32(a); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_set1_epi32(a); - r_.m128i[1] = simde_mm_set1_epi32(a); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set1_epi32 - #define _mm256_set1_epi32(a) simde_mm256_set1_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_set1_epi64x (int64_t a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set1_epi64x(a); - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_set1_epi64x(a); - r_.m128i[1] = simde_mm_set1_epi64x(a); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set1_epi64x - #define _mm256_set1_epi64x(a) simde_mm256_set1_epi64x(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_set1_ps (simde_float32 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set1_ps(a); - #else - simde__m256_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_set1_ps(a); - r_.m128[1] = simde_mm_set1_ps(a); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set1_ps - #define _mm256_set1_ps(a) simde_mm256_set1_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_set1_pd (simde_float64 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_set1_pd(a); - #else - simde__m256d_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_set1_pd(a); - r_.m128d[1] = simde_mm_set1_pd(a); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_set1_pd - #define _mm256_set1_pd(a) simde_mm256_set1_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_deinterleaveeven_epi16 (simde__m256i a, simde__m256i b) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_x_mm_deinterleaveeven_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = 
simde_x_mm_deinterleaveeven_epi16(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30); - #else - const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2; - const size_t quarter_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 4; - for (size_t i = 0 ; i < quarter_point ; i++) { - r_.i16[i] = a_.i16[2 * i]; - r_.i16[i + quarter_point] = b_.i16[2 * i]; - r_.i16[halfway_point + i] = a_.i16[halfway_point + 2 * i]; - r_.i16[halfway_point + i + quarter_point] = b_.i16[halfway_point + 2 * i]; - } - #endif - - return simde__m256i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_deinterleaveodd_epi16 (simde__m256i a, simde__m256i b) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_x_mm_deinterleaveodd_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_x_mm_deinterleaveodd_epi16(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31); - #else - const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2; - const size_t quarter_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 4; - for (size_t i = 0 ; i < quarter_point ; i++) { - r_.i16[i] = a_.i16[2 * i + 1]; - r_.i16[i + quarter_point] = b_.i16[2 * i + 1]; - r_.i16[halfway_point + i] = a_.i16[halfway_point + 2 * i + 1]; - r_.i16[halfway_point + i + quarter_point] = b_.i16[halfway_point + 2 * i + 1]; - } - #endif - - return simde__m256i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_deinterleaveeven_epi32 (simde__m256i a, simde__m256i b) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_x_mm_deinterleaveeven_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_x_mm_deinterleaveeven_epi32(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 0, 2, 8, 10, 4, 6, 12, 14); - #else - const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2; - const size_t quarter_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 4; - for (size_t i = 0 ; i < quarter_point ; i++) { - r_.i32[i] = a_.i32[2 * i]; - r_.i32[i + quarter_point] = b_.i32[2 * i]; - r_.i32[halfway_point + i] = a_.i32[halfway_point + 2 * i]; - r_.i32[halfway_point + i + quarter_point] = b_.i32[halfway_point + 2 * i]; - } - #endif - - return simde__m256i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_deinterleaveodd_epi32 (simde__m256i a, simde__m256i b) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_x_mm_deinterleaveodd_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_x_mm_deinterleaveodd_epi32(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 1, 3, 9, 11, 5, 7, 13, 15); - #else - const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2; - const size_t quarter_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 4; - for (size_t i = 0 ; i < quarter_point ; i++) { - r_.i32[i] = a_.i32[2 * i + 1]; - r_.i32[i + quarter_point] = 
b_.i32[2 * i + 1]; - r_.i32[halfway_point + i] = a_.i32[halfway_point + 2 * i + 1]; - r_.i32[halfway_point + i + quarter_point] = b_.i32[halfway_point + 2 * i + 1]; - } - #endif - - return simde__m256i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_x_mm256_deinterleaveeven_ps (simde__m256 a, simde__m256 b) { - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_x_mm_deinterleaveeven_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_x_mm_deinterleaveeven_ps(a_.m128[1], b_.m128[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, b_.f32, 0, 2, 8, 10, 4, 6, 12, 14); - #else - const size_t halfway_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 2; - const size_t quarter_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 4; - for (size_t i = 0 ; i < quarter_point ; i++) { - r_.f32[i] = a_.f32[2 * i]; - r_.f32[i + quarter_point] = b_.f32[2 * i]; - r_.f32[halfway_point + i] = a_.f32[halfway_point + 2 * i]; - r_.f32[halfway_point + i + quarter_point] = b_.f32[halfway_point + 2 * i]; - } - #endif - - return simde__m256_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_x_mm256_deinterleaveodd_ps (simde__m256 a, simde__m256 b) { - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_x_mm_deinterleaveodd_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_x_mm_deinterleaveodd_ps(a_.m128[1], b_.m128[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, b_.f32, 1, 3, 9, 11, 5, 7, 13, 15); - #else - const size_t halfway_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 2; - const size_t quarter_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 4; - for (size_t i = 0 ; i < quarter_point ; i++) { - r_.f32[i] = a_.f32[2 * i + 1]; - r_.f32[i + quarter_point] = b_.f32[2 * i + 1]; - r_.f32[halfway_point + i] = a_.f32[halfway_point + 2 * i + 1]; - r_.f32[halfway_point + i + quarter_point] = b_.f32[halfway_point + 2 * i + 1]; - } - #endif - - return simde__m256_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_x_mm256_deinterleaveeven_pd (simde__m256d a, simde__m256d b) { - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_x_mm_deinterleaveeven_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = simde_x_mm_deinterleaveeven_pd(a_.m128d[1], b_.m128d[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, b_.f64, 0, 4, 2, 6); - #else - const size_t halfway_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 2; - const size_t quarter_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 4; - for (size_t i = 0 ; i < quarter_point ; i++) { - r_.f64[i] = a_.f64[2 * i]; - r_.f64[i + quarter_point] = b_.f64[2 * i]; - r_.f64[halfway_point + i] = a_.f64[halfway_point + 2 * i]; - r_.f64[halfway_point + i + quarter_point] = b_.f64[halfway_point + 2 * i]; - } - #endif - - return simde__m256d_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_x_mm256_deinterleaveodd_pd (simde__m256d a, simde__m256d b) { - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_x_mm_deinterleaveodd_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = 
simde_x_mm_deinterleaveodd_pd(a_.m128d[1], b_.m128d[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, b_.f64, 1, 5, 3, 7); - #else - const size_t halfway_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 2; - const size_t quarter_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 4; - for (size_t i = 0 ; i < quarter_point ; i++) { - r_.f64[i] = a_.f64[2 * i + 1]; - r_.f64[i + quarter_point] = b_.f64[2 * i + 1]; - r_.f64[halfway_point + i] = a_.f64[halfway_point + 2 * i + 1]; - r_.f64[halfway_point + i + quarter_point] = b_.f64[halfway_point + 2 * i + 1]; - } - #endif - - return simde__m256d_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_x_mm256_abs_ps(simde__m256 a) { - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_fabsf(a_.f32[i]); - } - return simde__m256_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_x_mm256_abs_pd(simde__m256d a) { - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_fabs(a_.f64[i]); - } - return simde__m256d_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_add_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_add_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_add_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_mm_add_ps(a_.m128[1], b_.m128[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 + b_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i] + b_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_add_ps - #define _mm256_add_ps(a, b) simde_mm256_add_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_hadd_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_hadd_ps(a, b); - #else - return simde_mm256_add_ps(simde_x_mm256_deinterleaveeven_ps(a, b), simde_x_mm256_deinterleaveodd_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_hadd_ps - #define _mm256_hadd_ps(a, b) simde_mm256_hadd_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_add_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_add_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_add_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = simde_mm_add_pd(a_.m128d[1], b_.m128d[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 + b_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[i] + b_.f64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_add_pd - #define _mm256_add_pd(a, b) simde_mm256_add_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_hadd_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return 
_mm256_hadd_pd(a, b); - #else - return simde_mm256_add_pd(simde_x_mm256_deinterleaveeven_pd(a, b), simde_x_mm256_deinterleaveodd_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_hadd_pd - #define _mm256_hadd_pd(a, b) simde_mm256_hadd_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_addsub_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_addsub_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_addsub_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_mm_addsub_ps(a_.m128[1], b_.m128[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) { - r_.f32[ i ] = a_.f32[ i ] - b_.f32[ i ]; - r_.f32[i + 1] = a_.f32[i + 1] + b_.f32[i + 1]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_addsub_ps - #define _mm256_addsub_ps(a, b) simde_mm256_addsub_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_addsub_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_addsub_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_addsub_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = simde_mm_addsub_pd(a_.m128d[1], b_.m128d[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) { - r_.f64[ i ] = a_.f64[ i ] - b_.f64[ i ]; - r_.f64[i + 1] = a_.f64[i + 1] + b_.f64[i + 1]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_addsub_pd - #define _mm256_addsub_pd(a, b) simde_mm256_addsub_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_and_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_and_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_and_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_mm_and_ps(a_.m128[1], b_.m128[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f & b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] & b_.i32f[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_and_ps - #define _mm256_and_ps(a, b) simde_mm256_and_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_and_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_and_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_and_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = simde_mm_and_pd(a_.m128d[1], b_.m128d[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f & b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] & b_.i32f[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if 
defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_and_pd - #define _mm256_and_pd(a, b) simde_mm256_and_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_andnot_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_andnot_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_andnot_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_mm_andnot_ps(a_.m128[1], b_.m128[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = ~a_.i32f & b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = ~a_.i32f[i] & b_.i32f[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_andnot_ps - #define _mm256_andnot_ps(a, b) simde_mm256_andnot_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_andnot_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_andnot_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_andnot_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = simde_mm_andnot_pd(a_.m128d[1], b_.m128d[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = ~a_.i32f & b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = ~a_.i32f[i] & b_.i32f[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_andnot_pd - #define _mm256_andnot_pd(a, b) simde_mm256_andnot_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_blend_ps (simde__m256 a, simde__m256 b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = ((imm8 >> i) & 1) ? b_.f32[i] : a_.f32[i]; - } - - return simde__m256_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm256_blend_ps(a, b, imm8) _mm256_blend_ps(a, b, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) -# define simde_mm256_blend_ps(a, b, imm8) \ - simde_mm256_set_m128( \ - simde_mm_blend_ps(simde_mm256_extractf128_ps(a, 1), simde_mm256_extractf128_ps(b, 1), (imm8) >> 4), \ - simde_mm_blend_ps(simde_mm256_extractf128_ps(a, 0), simde_mm256_extractf128_ps(b, 0), (imm8) & 0x0F)) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_blend_ps - #define _mm256_blend_ps(a, b, imm8) simde_mm256_blend_ps(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_blend_pd (simde__m256d a, simde__m256d b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = ((imm8 >> i) & 1) ? 
b_.f64[i] : a_.f64[i]; - } - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm256_blend_pd(a, b, imm8) _mm256_blend_pd(a, b, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) -# define simde_mm256_blend_pd(a, b, imm8) \ - simde_mm256_set_m128d( \ - simde_mm_blend_pd(simde_mm256_extractf128_pd(a, 1), simde_mm256_extractf128_pd(b, 1), (imm8) >> 2), \ - simde_mm_blend_pd(simde_mm256_extractf128_pd(a, 0), simde_mm256_extractf128_pd(b, 0), (imm8) & 3)) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_blend_pd - #define _mm256_blend_pd(a, b, imm8) simde_mm256_blend_pd(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_blendv_ps (simde__m256 a, simde__m256 b, simde__m256 mask) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_blendv_ps(a, b, mask); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b), - mask_ = simde__m256_to_private(mask); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_blendv_ps(a_.m128[0], b_.m128[0], mask_.m128[0]); - r_.m128[1] = simde_mm_blendv_ps(a_.m128[1], b_.m128[1], mask_.m128[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.f32[i] = (mask_.u32[i] & (UINT32_C(1) << 31)) ? b_.f32[i] : a_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_blendv_ps - #define _mm256_blendv_ps(a, b, imm8) simde_mm256_blendv_ps(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_blendv_pd (simde__m256d a, simde__m256d b, simde__m256d mask) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_blendv_pd(a, b, mask); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b), - mask_ = simde__m256d_to_private(mask); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_blendv_pd(a_.m128d[0], b_.m128d[0], mask_.m128d[0]); - r_.m128d[1] = simde_mm_blendv_pd(a_.m128d[1], b_.m128d[1], mask_.m128d[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.f64[i] = (mask_.u64[i] & (UINT64_C(1) << 63)) ? 
b_.f64[i] : a_.f64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_blendv_pd - #define _mm256_blendv_pd(a, b, imm8) simde_mm256_blendv_pd(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_broadcast_pd (simde__m128d const * mem_addr) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_broadcast_pd(mem_addr); - #else - simde__m256d_private r_; - - simde__m128d tmp = simde_mm_loadu_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, mem_addr)); - r_.m128d[0] = tmp; - r_.m128d[1] = tmp; - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcast_pd - #define _mm256_broadcast_pd(mem_addr) simde_mm256_broadcast_pd(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_broadcast_ps (simde__m128 const * mem_addr) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_broadcast_ps(mem_addr); - #else - simde__m256_private r_; - - simde__m128 tmp = simde_mm_loadu_ps(HEDLEY_REINTERPRET_CAST(simde_float32 const*, mem_addr)); - r_.m128[0] = tmp; - r_.m128[1] = tmp; - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcast_ps - #define _mm256_broadcast_ps(mem_addr) simde_mm256_broadcast_ps(HEDLEY_REINTERPRET_CAST(simde__m128 const*, mem_addr)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_broadcast_sd (simde_float64 const * a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_broadcast_sd(a); - #else - return simde_mm256_set1_pd(*a); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcast_sd - #define _mm256_broadcast_sd(mem_addr) simde_mm256_broadcast_sd(HEDLEY_REINTERPRET_CAST(double const*, mem_addr)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_broadcast_ss (simde_float32 const * a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm_broadcast_ss(a); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128_from_wasm_v128(wasm_v128_load32_splat(a)); - #else - return simde_mm_set1_ps(*a); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_broadcast_ss - #define _mm_broadcast_ss(mem_addr) simde_mm_broadcast_ss(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_broadcast_ss (simde_float32 const * a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_broadcast_ss(a); - #else - return simde_mm256_set1_ps(*a); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcast_ss - #define _mm256_broadcast_ss(mem_addr) simde_mm256_broadcast_ss(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_castpd128_pd256 (simde__m128d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_castpd128_pd256(a); - #else - simde__m256d_private r_; - simde__m128d_private a_ = simde__m128d_to_private(a); - - r_.m128d_private[0] = a_; - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_castpd128_pd256 - #define _mm256_castpd128_pd256(a) simde_mm256_castpd128_pd256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm256_castpd256_pd128 (simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_castpd256_pd128(a); - #else - simde__m256d_private a_ = simde__m256d_to_private(a); - return a_.m128d[0]; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_castpd256_pd128 - #define 
_mm256_castpd256_pd128(a) simde_mm256_castpd256_pd128(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_castps128_ps256 (simde__m128 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_castps128_ps256(a); - #else - simde__m256_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - r_.m128_private[0] = a_; - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_castps128_ps256 - #define _mm256_castps128_ps256(a) simde_mm256_castps128_ps256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm256_castps256_ps128 (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_castps256_ps128(a); - #else - simde__m256_private a_ = simde__m256_to_private(a); - return a_.m128[0]; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_castps256_ps128 - #define _mm256_castps256_ps128(a) simde_mm256_castps256_ps128(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_castsi128_si256 (simde__m128i a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_castsi128_si256(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - r_.m128i_private[0] = a_; - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_castsi128_si256 - #define _mm256_castsi128_si256(a) simde_mm256_castsi128_si256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm256_castsi256_si128 (simde__m256i a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_castsi256_si128(a); - #else - simde__m256i_private a_ = simde__m256i_to_private(a); - return a_.m128i[0]; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_castsi256_si128 - #define _mm256_castsi256_si128(a) simde_mm256_castsi256_si128(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_round_ps (simde__m256 a, const int rounding) { - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { - #if defined(simde_math_nearbyintf) - case SIMDE_MM_FROUND_CUR_DIRECTION: - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_nearbyintf(a_.f32[i]); - } - break; - #endif - - #if defined(simde_math_roundf) - case SIMDE_MM_FROUND_TO_NEAREST_INT: - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_roundf(a_.f32[i]); - } - break; - #endif - - #if defined(simde_math_floorf) - case SIMDE_MM_FROUND_TO_NEG_INF: - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_floorf(a_.f32[i]); - } - break; - #endif - - #if defined(simde_math_ceilf) - case SIMDE_MM_FROUND_TO_POS_INF: - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_ceilf(a_.f32[i]); - } - break; - #endif - - #if defined(simde_math_truncf) - case SIMDE_MM_FROUND_TO_ZERO: - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_truncf(a_.f32[i]); - } - break; - #endif - - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm256_undefined_ps()); - } - - return simde__m256_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) - #define simde_mm256_round_ps(a, rounding) _mm256_round_ps(a, rounding) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm256_round_ps(a, rounding) SIMDE_STATEMENT_EXPR_(({ \ - simde__m256_private \ - 
simde_mm256_round_ps_r_ = simde__m256_to_private(simde_mm256_setzero_ps()), \ - simde_mm256_round_ps_a_ = simde__m256_to_private(a); \ - \ - for (size_t simde_mm256_round_ps_i = 0 ; simde_mm256_round_ps_i < (sizeof(simde_mm256_round_ps_r_.m128) / sizeof(simde_mm256_round_ps_r_.m128[0])) ; simde_mm256_round_ps_i++) { \ - simde_mm256_round_ps_r_.m128[simde_mm256_round_ps_i] = simde_mm_round_ps(simde_mm256_round_ps_a_.m128[simde_mm256_round_ps_i], rounding); \ - } \ - \ - simde__m256_from_private(simde_mm256_round_ps_r_); \ - })) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_round_ps - #define _mm256_round_ps(a, rounding) simde_mm256_round_ps(a, rounding) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_round_pd (simde__m256d a, const int rounding) { - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { - #if defined(simde_math_nearbyint) - case SIMDE_MM_FROUND_CUR_DIRECTION: - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_nearbyint(a_.f64[i]); - } - break; - #endif - - #if defined(simde_math_round) - case SIMDE_MM_FROUND_TO_NEAREST_INT: - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_round(a_.f64[i]); - } - break; - #endif - - #if defined(simde_math_floor) - case SIMDE_MM_FROUND_TO_NEG_INF: - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_floor(a_.f64[i]); - } - break; - #endif - - #if defined(simde_math_ceil) - case SIMDE_MM_FROUND_TO_POS_INF: - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_ceil(a_.f64[i]); - } - break; - #endif - - #if defined(simde_math_trunc) - case SIMDE_MM_FROUND_TO_ZERO: - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_trunc(a_.f64[i]); - } - break; - #endif - - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm256_undefined_pd()); - } - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) - #define simde_mm256_round_pd(a, rounding) _mm256_round_pd(a, rounding) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm256_round_pd(a, rounding) SIMDE_STATEMENT_EXPR_(({ \ - simde__m256d_private \ - simde_mm256_round_pd_r_ = simde__m256d_to_private(simde_mm256_setzero_pd()), \ - simde_mm256_round_pd_a_ = simde__m256d_to_private(a); \ - \ - for (size_t simde_mm256_round_pd_i = 0 ; simde_mm256_round_pd_i < (sizeof(simde_mm256_round_pd_r_.m128d) / sizeof(simde_mm256_round_pd_r_.m128d[0])) ; simde_mm256_round_pd_i++) { \ - simde_mm256_round_pd_r_.m128d[simde_mm256_round_pd_i] = simde_mm_round_pd(simde_mm256_round_pd_a_.m128d[simde_mm256_round_pd_i], rounding); \ - } \ - \ - simde__m256d_from_private(simde_mm256_round_pd_r_); \ - })) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_round_pd - #define _mm256_round_pd(a, rounding) simde_mm256_round_pd(a, rounding) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_ceil_pd (simde__m256d a) { - return simde_mm256_round_pd(a, SIMDE_MM_FROUND_TO_POS_INF); -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_ceil_pd - #define _mm256_ceil_pd(a) simde_mm256_ceil_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_ceil_ps (simde__m256 a) { - return simde_mm256_round_ps(a, SIMDE_MM_FROUND_TO_POS_INF); -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef 
_mm256_ceil_ps - #define _mm256_ceil_ps(a) simde_mm256_ceil_ps(a) -#endif - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DIAGNOSTIC_DISABLE_FLOAT_EQUAL - -/* This implementation does not support signaling NaNs (yet?) */ -SIMDE_HUGE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmp_pd (simde__m128d a, simde__m128d b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) { - switch (imm8) { - case SIMDE_CMP_EQ_UQ: - case SIMDE_CMP_EQ_US: - return simde_mm_or_pd(simde_mm_cmpunord_pd(a, b), simde_mm_cmpeq_pd(a, b)); - break; - case SIMDE_CMP_EQ_OQ: - case SIMDE_CMP_EQ_OS: - return simde_mm_cmpeq_pd(a, b); - break; - case SIMDE_CMP_NGE_US: - case SIMDE_CMP_NGE_UQ: - return simde_x_mm_not_pd(simde_mm_cmpge_pd(a, b)); - break; - case SIMDE_CMP_LT_OS: - case SIMDE_CMP_LT_OQ: - return simde_mm_cmplt_pd(a, b); - break; - case SIMDE_CMP_NGT_US: - case SIMDE_CMP_NGT_UQ: - return simde_x_mm_not_pd(simde_mm_cmpgt_pd(a, b)); - break; - case SIMDE_CMP_LE_OS: - case SIMDE_CMP_LE_OQ: - return simde_mm_cmple_pd(a, b); - break; - case SIMDE_CMP_NEQ_UQ: - case SIMDE_CMP_NEQ_US: - return simde_mm_cmpneq_pd(a, b); - break; - case SIMDE_CMP_NEQ_OQ: - case SIMDE_CMP_NEQ_OS: - return simde_mm_and_pd(simde_mm_cmpord_pd(a, b), simde_mm_cmpneq_pd(a, b)); - break; - case SIMDE_CMP_NLT_US: - case SIMDE_CMP_NLT_UQ: - return simde_x_mm_not_pd(simde_mm_cmplt_pd(a, b)); - break; - case SIMDE_CMP_GE_OS: - case SIMDE_CMP_GE_OQ: - return simde_mm_cmpge_pd(a, b); - break; - case SIMDE_CMP_NLE_US: - case SIMDE_CMP_NLE_UQ: - return simde_x_mm_not_pd(simde_mm_cmple_pd(a, b)); - break; - case SIMDE_CMP_GT_OS: - case SIMDE_CMP_GT_OQ: - return simde_mm_cmpgt_pd(a, b); - break; - case SIMDE_CMP_FALSE_OQ: - case SIMDE_CMP_FALSE_OS: - return simde_mm_setzero_pd(); - break; - case SIMDE_CMP_TRUE_UQ: - case SIMDE_CMP_TRUE_US: - return simde_x_mm_setone_pd(); - break; - case SIMDE_CMP_UNORD_Q: - case SIMDE_CMP_UNORD_S: - return simde_mm_cmpunord_pd(a, b); - break; - case SIMDE_CMP_ORD_Q: - case SIMDE_CMP_ORD_S: - return simde_mm_cmpord_pd(a, b); - break; - } - - HEDLEY_UNREACHABLE_RETURN(simde_mm_setzero_pd()); -} -#if defined(__clang__) && defined(__AVX512DQ__) - #define simde_mm_cmp_pd(a, b, imm8) (__extension__ ({ \ - simde__m128d simde_mm_cmp_pd_r; \ - switch (imm8) { \ - case SIMDE_CMP_FALSE_OQ: \ - case SIMDE_CMP_FALSE_OS: \ - simde_mm_cmp_pd_r = simde_mm_setzero_pd(); \ - break; \ - case SIMDE_CMP_TRUE_UQ: \ - case SIMDE_CMP_TRUE_US: \ - simde_mm_cmp_pd_r = simde_x_mm_setone_pd(); \ - break; \ - default: \ - simde_mm_cmp_pd_r = simde_mm_cmp_pd(a, b, imm8); \ - break; \ - } \ - simde_mm_cmp_pd_r; \ - })) -#elif defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm_cmp_pd(a, b, imm8) _mm_cmp_pd(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_cmp_pd - #define _mm_cmp_pd(a, b, imm8) simde_mm_cmp_pd(a, b, imm8) -#endif - -SIMDE_HUGE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmp_ps (simde__m128 a, simde__m128 b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) { - switch (imm8) { - case SIMDE_CMP_EQ_UQ: - case SIMDE_CMP_EQ_US: - return simde_mm_or_ps(simde_mm_cmpunord_ps(a, b), simde_mm_cmpeq_ps(a, b)); - break; - case SIMDE_CMP_EQ_OQ: - case SIMDE_CMP_EQ_OS: - return simde_mm_cmpeq_ps(a, b); - break; - case SIMDE_CMP_NGE_US: - case SIMDE_CMP_NGE_UQ: - return simde_x_mm_not_ps(simde_mm_cmpge_ps(a, b)); - break; - case SIMDE_CMP_LT_OS: - case SIMDE_CMP_LT_OQ: - return simde_mm_cmplt_ps(a, b); - break; - case SIMDE_CMP_NGT_US: - case SIMDE_CMP_NGT_UQ: - return simde_x_mm_not_ps(simde_mm_cmpgt_ps(a, b)); - 
break; - case SIMDE_CMP_LE_OS: - case SIMDE_CMP_LE_OQ: - return simde_mm_cmple_ps(a, b); - break; - case SIMDE_CMP_NEQ_UQ: - case SIMDE_CMP_NEQ_US: - return simde_mm_cmpneq_ps(a, b); - break; - case SIMDE_CMP_NEQ_OQ: - case SIMDE_CMP_NEQ_OS: - return simde_mm_and_ps(simde_mm_cmpord_ps(a, b), simde_mm_cmpneq_ps(a, b)); - break; - case SIMDE_CMP_NLT_US: - case SIMDE_CMP_NLT_UQ: - return simde_x_mm_not_ps(simde_mm_cmplt_ps(a, b)); - break; - case SIMDE_CMP_GE_OS: - case SIMDE_CMP_GE_OQ: - return simde_mm_cmpge_ps(a, b); - break; - case SIMDE_CMP_NLE_US: - case SIMDE_CMP_NLE_UQ: - return simde_x_mm_not_ps(simde_mm_cmple_ps(a, b)); - break; - case SIMDE_CMP_GT_OS: - case SIMDE_CMP_GT_OQ: - return simde_mm_cmpgt_ps(a, b); - break; - case SIMDE_CMP_FALSE_OQ: - case SIMDE_CMP_FALSE_OS: - return simde_mm_setzero_ps(); - break; - case SIMDE_CMP_TRUE_UQ: - case SIMDE_CMP_TRUE_US: - return simde_x_mm_setone_ps(); - break; - case SIMDE_CMP_UNORD_Q: - case SIMDE_CMP_UNORD_S: - return simde_mm_cmpunord_ps(a, b); - break; - case SIMDE_CMP_ORD_Q: - case SIMDE_CMP_ORD_S: - return simde_mm_cmpord_ps(a, b); - break; - } - - HEDLEY_UNREACHABLE_RETURN(simde_mm_setzero_ps()); -} -/* Prior to 9.0 clang has problems with _mm{,256}_cmp_{ps,pd} for all four of the true/false - * comparisons, but only when AVX-512 is enabled. */ -#if defined(__clang__) && defined(__AVX512DQ__) - #define simde_mm_cmp_ps(a, b, imm8) (__extension__ ({ \ - simde__m128 simde_mm_cmp_ps_r; \ - switch (imm8) { \ - case SIMDE_CMP_FALSE_OQ: \ - case SIMDE_CMP_FALSE_OS: \ - simde_mm_cmp_ps_r = simde_mm_setzero_ps(); \ - break; \ - case SIMDE_CMP_TRUE_UQ: \ - case SIMDE_CMP_TRUE_US: \ - simde_mm_cmp_ps_r = simde_x_mm_setone_ps(); \ - break; \ - default: \ - simde_mm_cmp_ps_r = simde_mm_cmp_ps(a, b, imm8); \ - break; \ - } \ - simde_mm_cmp_ps_r; \ - })) -#elif defined(SIMDE_X86_AVX_NATIVE) - #define simde_mm_cmp_ps(a, b, imm8) _mm_cmp_ps(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_cmp_ps - #define _mm_cmp_ps(a, b, imm8) simde_mm_cmp_ps(a, b, imm8) -#endif - -SIMDE_HUGE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmp_sd (simde__m128d a, simde__m128d b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) { - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - switch (imm8) { - case SIMDE_CMP_EQ_OQ: - case SIMDE_CMP_EQ_OS: - a_.i64[0] = (a_.f64[0] == b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); - break; - - case SIMDE_CMP_LT_OQ: - case SIMDE_CMP_LT_OS: - a_.i64[0] = (a_.f64[0] < b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); - break; - - case SIMDE_CMP_LE_OQ: - case SIMDE_CMP_LE_OS: - a_.i64[0] = (a_.f64[0] <= b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); - break; - - case SIMDE_CMP_UNORD_Q: - case SIMDE_CMP_UNORD_S: - a_.i64[0] = ((a_.f64[0] != a_.f64[0]) || (b_.f64[0] != b_.f64[0])) ? ~INT64_C(0) : INT64_C(0); - break; - - case SIMDE_CMP_NEQ_UQ: - case SIMDE_CMP_NEQ_US: - a_.i64[0] = ((a_.f64[0] == a_.f64[0]) & (b_.f64[0] == b_.f64[0]) & (a_.f64[0] != b_.f64[0])) ? ~INT64_C(0) : INT64_C(0); - break; - - case SIMDE_CMP_NEQ_OQ: - case SIMDE_CMP_NEQ_OS: - a_.i64[0] = ((a_.f64[0] == a_.f64[0]) & (b_.f64[0] == b_.f64[0]) & (a_.f64[0] != b_.f64[0])) ? ~INT64_C(0) : INT64_C(0); - break; - - case SIMDE_CMP_NLT_UQ: - case SIMDE_CMP_NLT_US: - a_.i64[0] = !(a_.f64[0] < b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); - break; - - case SIMDE_CMP_NLE_UQ: - case SIMDE_CMP_NLE_US: - a_.i64[0] = !(a_.f64[0] <= b_.f64[0]) ? 
~INT64_C(0) : INT64_C(0); - break; - - case SIMDE_CMP_ORD_Q: - case SIMDE_CMP_ORD_S: - a_.i64[0] = ((a_.f64[0] == a_.f64[0]) & (b_.f64[0] == b_.f64[0])) ? ~INT64_C(0) : INT64_C(0); - break; - - case SIMDE_CMP_EQ_UQ: - case SIMDE_CMP_EQ_US: - a_.i64[0] = ((a_.f64[0] != a_.f64[0]) | (b_.f64[0] != b_.f64[0]) | (a_.f64[0] == b_.f64[0])) ? ~INT64_C(0) : INT64_C(0); - break; - - case SIMDE_CMP_NGE_UQ: - case SIMDE_CMP_NGE_US: - a_.i64[0] = !(a_.f64[0] >= b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); - break; - - case SIMDE_CMP_NGT_UQ: - case SIMDE_CMP_NGT_US: - a_.i64[0] = !(a_.f64[0] > b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); - break; - - case SIMDE_CMP_FALSE_OQ: - case SIMDE_CMP_FALSE_OS: - a_.i64[0] = INT64_C(0); - break; - - case SIMDE_CMP_GE_OQ: - case SIMDE_CMP_GE_OS: - a_.i64[0] = (a_.f64[0] >= b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); - break; - - case SIMDE_CMP_GT_OQ: - case SIMDE_CMP_GT_OS: - a_.i64[0] = (a_.f64[0] > b_.f64[0]) ? ~INT64_C(0) : INT64_C(0); - break; - - case SIMDE_CMP_TRUE_UQ: - case SIMDE_CMP_TRUE_US: - a_.i64[0] = ~INT64_C(0); - break; - - default: - HEDLEY_UNREACHABLE(); - } - - return simde__m128d_from_private(a_); -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm_cmp_sd(a, b, imm8) _mm_cmp_sd(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_cmp_sd - #define _mm_cmp_sd(a, b, imm8) simde_mm_cmp_sd(a, b, imm8) -#endif - -SIMDE_HUGE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmp_ss (simde__m128 a, simde__m128 b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) { - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - switch (imm8) { - case SIMDE_CMP_EQ_OQ: - case SIMDE_CMP_EQ_OS: - a_.i32[0] = (a_.f32[0] == b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); - break; - - case SIMDE_CMP_LT_OQ: - case SIMDE_CMP_LT_OS: - a_.i32[0] = (a_.f32[0] < b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); - break; - - case SIMDE_CMP_LE_OQ: - case SIMDE_CMP_LE_OS: - a_.i32[0] = (a_.f32[0] <= b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); - break; - - case SIMDE_CMP_UNORD_Q: - case SIMDE_CMP_UNORD_S: - a_.i32[0] = ((a_.f32[0] != a_.f32[0]) || (b_.f32[0] != b_.f32[0])) ? ~INT32_C(0) : INT32_C(0); - break; - - case SIMDE_CMP_NEQ_UQ: - case SIMDE_CMP_NEQ_US: - a_.i32[0] = ((a_.f32[0] == a_.f32[0]) & (b_.f32[0] == b_.f32[0]) & (a_.f32[0] != b_.f32[0])) ? ~INT32_C(0) : INT32_C(0); - break; - - case SIMDE_CMP_NEQ_OQ: - case SIMDE_CMP_NEQ_OS: - a_.i32[0] = ((a_.f32[0] == a_.f32[0]) & (b_.f32[0] == b_.f32[0]) & (a_.f32[0] != b_.f32[0])) ? ~INT32_C(0) : INT32_C(0); - break; - - case SIMDE_CMP_NLT_UQ: - case SIMDE_CMP_NLT_US: - a_.i32[0] = !(a_.f32[0] < b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); - break; - - case SIMDE_CMP_NLE_UQ: - case SIMDE_CMP_NLE_US: - a_.i32[0] = !(a_.f32[0] <= b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); - break; - - case SIMDE_CMP_ORD_Q: - case SIMDE_CMP_ORD_S: - a_.i32[0] = ((a_.f32[0] == a_.f32[0]) & (b_.f32[0] == b_.f32[0])) ? ~INT32_C(0) : INT32_C(0); - break; - - case SIMDE_CMP_EQ_UQ: - case SIMDE_CMP_EQ_US: - a_.i32[0] = ((a_.f32[0] != a_.f32[0]) | (b_.f32[0] != b_.f32[0]) | (a_.f32[0] == b_.f32[0])) ? ~INT32_C(0) : INT32_C(0); - break; - - case SIMDE_CMP_NGE_UQ: - case SIMDE_CMP_NGE_US: - a_.i32[0] = !(a_.f32[0] >= b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); - break; - - case SIMDE_CMP_NGT_UQ: - case SIMDE_CMP_NGT_US: - a_.i32[0] = !(a_.f32[0] > b_.f32[0]) ? 
~INT32_C(0) : INT32_C(0); - break; - - case SIMDE_CMP_FALSE_OQ: - case SIMDE_CMP_FALSE_OS: - a_.i32[0] = INT32_C(0); - break; - - case SIMDE_CMP_GE_OQ: - case SIMDE_CMP_GE_OS: - a_.i32[0] = (a_.f32[0] >= b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); - break; - - case SIMDE_CMP_GT_OQ: - case SIMDE_CMP_GT_OS: - a_.i32[0] = (a_.f32[0] > b_.f32[0]) ? ~INT32_C(0) : INT32_C(0); - break; - - case SIMDE_CMP_TRUE_UQ: - case SIMDE_CMP_TRUE_US: - a_.i32[0] = ~INT32_C(0); - break; - - default: - HEDLEY_UNREACHABLE(); - } - - return simde__m128_from_private(a_); -} -#if defined(SIMDE_X86_AVX_NATIVE) - #define simde_mm_cmp_ss(a, b, imm8) _mm_cmp_ss(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_cmp_ss - #define _mm_cmp_ss(a, b, imm8) simde_mm_cmp_ss(a, b, imm8) -#endif - -SIMDE_HUGE_FUNCTION_ATTRIBUTES -simde__m256d -#if defined(__clang__) && defined(__AVX512DQ__) -simde_mm256_cmp_pd_internal_ -#else -simde_mm256_cmp_pd -#endif -(simde__m256d a, simde__m256d b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) { - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - switch (imm8) { - case SIMDE_CMP_EQ_OQ: - case SIMDE_CMP_EQ_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = (a_.f64[i] == b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_LT_OQ: - case SIMDE_CMP_LT_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = (a_.f64[i] < b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_LE_OQ: - case SIMDE_CMP_LE_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = (a_.f64[i] <= b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_UNORD_Q: - case SIMDE_CMP_UNORD_S: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 != a_.f64) | (b_.f64 != b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = ((a_.f64[i] != a_.f64[i]) || (b_.f64[i] != b_.f64[i])) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_NEQ_UQ: - case SIMDE_CMP_NEQ_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = (a_.f64[i] != b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_NEQ_OQ: - case SIMDE_CMP_NEQ_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 == a_.f64) & (b_.f64 == b_.f64) & (a_.f64 != b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = ((a_.f64[i] == a_.f64[i]) & (b_.f64[i] == b_.f64[i]) & (a_.f64[i] != b_.f64[i])) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_NLT_UQ: - case SIMDE_CMP_NLT_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.f64 < b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = !(a_.f64[i] < b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_NLE_UQ: - case SIMDE_CMP_NLE_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.f64 <= b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = !(a_.f64[i] <= b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_ORD_Q: - case SIMDE_CMP_ORD_S: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ((a_.f64 == a_.f64) & (b_.f64 == b_.f64))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = ((a_.f64[i] == a_.f64[i]) & (b_.f64[i] == b_.f64[i])) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_EQ_UQ: - case SIMDE_CMP_EQ_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 != a_.f64) | (b_.f64 != b_.f64) | (a_.f64 == b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = ((a_.f64[i] != a_.f64[i]) | (b_.f64[i] != b_.f64[i]) | (a_.f64[i] == b_.f64[i])) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_NGE_UQ: - case SIMDE_CMP_NGE_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.f64 >= b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = !(a_.f64[i] >= b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_NGT_UQ: - case SIMDE_CMP_NGT_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.f64 > b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = !(a_.f64[i] > b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_FALSE_OQ: - case SIMDE_CMP_FALSE_OS: - r_ = simde__m256d_to_private(simde_mm256_setzero_pd()); - break; - - case SIMDE_CMP_GE_OQ: - case SIMDE_CMP_GE_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = (a_.f64[i] >= b_.f64[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_GT_OQ: - case SIMDE_CMP_GT_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = (a_.f64[i] > b_.f64[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_TRUE_UQ: - case SIMDE_CMP_TRUE_US: - r_ = simde__m256d_to_private(simde_x_mm256_setone_pd()); - break; - - default: - HEDLEY_UNREACHABLE(); - } - - return simde__m256d_from_private(r_); -} -#if defined(__clang__) && defined(__AVX512DQ__) - #define simde_mm256_cmp_pd(a, b, imm8) (__extension__ ({ \ - simde__m256d simde_mm256_cmp_pd_r; \ - switch (imm8) { \ - case SIMDE_CMP_FALSE_OQ: \ - case SIMDE_CMP_FALSE_OS: \ - simde_mm256_cmp_pd_r = simde_mm256_setzero_pd(); \ - break; \ - case SIMDE_CMP_TRUE_UQ: \ - case SIMDE_CMP_TRUE_US: \ - simde_mm256_cmp_pd_r = simde_x_mm256_setone_pd(); \ - break; \ - default: \ - simde_mm256_cmp_pd_r = simde_mm256_cmp_pd_internal_(a, b, imm8); \ - break; \ - } \ - simde_mm256_cmp_pd_r; \ - })) -#elif defined(SIMDE_X86_AVX_NATIVE) - #define simde_mm256_cmp_pd(a, b, imm8) _mm256_cmp_pd(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmp_pd - #define _mm256_cmp_pd(a, b, imm8) simde_mm256_cmp_pd(a, b, imm8) -#endif - -SIMDE_HUGE_FUNCTION_ATTRIBUTES -simde__m256 -#if defined(__clang__) && defined(__AVX512DQ__) -simde_mm256_cmp_ps_internal_ -#else -simde_mm256_cmp_ps -#endif -(simde__m256 a, simde__m256 b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) { - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - switch (imm8) { - case SIMDE_CMP_EQ_OQ: - case SIMDE_CMP_EQ_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 == b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = (a_.f32[i] == b_.f32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_LT_OQ: - case SIMDE_CMP_LT_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 < b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = (a_.f32[i] < b_.f32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_LE_OQ: - case SIMDE_CMP_LE_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 <= b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = (a_.f32[i] <= b_.f32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_UNORD_Q: - case SIMDE_CMP_UNORD_S: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 != a_.f32) | (b_.f32 != b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = ((a_.f32[i] != a_.f32[i]) || (b_.f32[i] != b_.f32[i])) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_NEQ_UQ: - case SIMDE_CMP_NEQ_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 != b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = (a_.f32[i] != b_.f32[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_NEQ_OQ: - case SIMDE_CMP_NEQ_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 == a_.f32) & (b_.f32 == b_.f32) & (a_.f32 != b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = ((a_.f32[i] == a_.f32[i]) & (b_.f32[i] == b_.f32[i]) & (a_.f32[i] != b_.f32[i])) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_NLT_UQ: - case SIMDE_CMP_NLT_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.f32 < b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = !(a_.f32[i] < b_.f32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_NLE_UQ: - case SIMDE_CMP_NLE_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.f32 <= b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = !(a_.f32[i] <= b_.f32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_ORD_Q: - case SIMDE_CMP_ORD_S: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ((a_.f32 == a_.f32) & (b_.f32 == b_.f32))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = ((a_.f32[i] == a_.f32[i]) & (b_.f32[i] == b_.f32[i])) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_EQ_UQ: - case SIMDE_CMP_EQ_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 != a_.f32) | (b_.f32 != b_.f32) | (a_.f32 == b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = ((a_.f32[i] != a_.f32[i]) | (b_.f32[i] != b_.f32[i]) | (a_.f32[i] == b_.f32[i])) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_NGE_UQ: - case SIMDE_CMP_NGE_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.f32 >= b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = !(a_.f32[i] >= b_.f32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_NGT_UQ: - case SIMDE_CMP_NGT_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.f32 > b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = !(a_.f32[i] > b_.f32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_FALSE_OQ: - case SIMDE_CMP_FALSE_OS: - r_ = simde__m256_to_private(simde_mm256_setzero_ps()); - break; - - case SIMDE_CMP_GE_OQ: - case SIMDE_CMP_GE_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 >= b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = (a_.f32[i] >= b_.f32[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_GT_OQ: - case SIMDE_CMP_GT_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 > b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = (a_.f32[i] > b_.f32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_TRUE_UQ: - case SIMDE_CMP_TRUE_US: - r_ = simde__m256_to_private(simde_x_mm256_setone_ps()); - break; - - default: - HEDLEY_UNREACHABLE(); - } - - return simde__m256_from_private(r_); -} -#if defined(__clang__) && defined(__AVX512DQ__) - #define simde_mm256_cmp_ps(a, b, imm8) (__extension__ ({ \ - simde__m256 simde_mm256_cmp_ps_r; \ - switch (imm8) { \ - case SIMDE_CMP_FALSE_OQ: \ - case SIMDE_CMP_FALSE_OS: \ - simde_mm256_cmp_ps_r = simde_mm256_setzero_ps(); \ - break; \ - case SIMDE_CMP_TRUE_UQ: \ - case SIMDE_CMP_TRUE_US: \ - simde_mm256_cmp_ps_r = simde_x_mm256_setone_ps(); \ - break; \ - default: \ - simde_mm256_cmp_ps_r = simde_mm256_cmp_ps_internal_(a, b, imm8); \ - break; \ - } \ - simde_mm256_cmp_ps_r; \ - })) -#elif defined(SIMDE_X86_AVX_NATIVE) - #define simde_mm256_cmp_ps(a, b, imm8) _mm256_cmp_ps(a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(128) - #define simde_mm256_cmp_ps(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m256_private \ - simde_mm256_cmp_ps_r_ = simde__m256_to_private(simde_mm256_setzero_ps()), \ - simde_mm256_cmp_ps_a_ = simde__m256_to_private((a)), \ - simde_mm256_cmp_ps_b_ = simde__m256_to_private((b)); \ - \ - for (size_t i = 0 ; i < (sizeof(simde_mm256_cmp_ps_r_.m128) / sizeof(simde_mm256_cmp_ps_r_.m128[0])) ; i++) { \ - simde_mm256_cmp_ps_r_.m128[i] = simde_mm_cmp_ps(simde_mm256_cmp_ps_a_.m128[i], simde_mm256_cmp_ps_b_.m128[i], (imm8)); \ - } \ - \ - simde__m256_from_private(simde_mm256_cmp_ps_r_); \ - })) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmp_ps - #define _mm256_cmp_ps(a, b, imm8) simde_mm256_cmp_ps(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_x_mm256_copysign_ps(simde__m256 dest, simde__m256 src) { - simde__m256_private - r_, - dest_ = simde__m256_to_private(dest), - src_ = simde__m256_to_private(src); - - #if defined(simde_math_copysignf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_copysignf(dest_.f32[i], src_.f32[i]); - } - #else - simde__m256 sgnbit = simde_mm256_xor_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C(0.0)), simde_mm256_set1_ps(-SIMDE_FLOAT32_C(0.0))); - return simde_mm256_xor_ps(simde_mm256_and_ps(sgnbit, src), simde_mm256_andnot_ps(sgnbit, dest)); - #endif - - return simde__m256_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_x_mm256_copysign_pd(simde__m256d dest, simde__m256d src) { - simde__m256d_private - r_, - dest_ = simde__m256d_to_private(dest), - src_ = simde__m256d_to_private(src); - - #if defined(simde_math_copysign) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_copysign(dest_.f64[i], src_.f64[i]); - } - #else - simde__m256d sgnbit = simde_mm256_xor_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(0.0)), simde_mm256_set1_pd(-SIMDE_FLOAT64_C(0.0))); - return simde_mm256_xor_pd(simde_mm256_and_pd(sgnbit, src), simde_mm256_andnot_pd(sgnbit, dest)); - #endif - - return simde__m256d_from_private(r_); -} - -HEDLEY_DIAGNOSTIC_POP /* -Wfloat-equal */ - -SIMDE_FUNCTION_ATTRIBUTES 
-simde__m256d -simde_mm256_cvtepi32_pd (simde__m128i a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cvtepi32_pd(a); - #else - simde__m256d_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = HEDLEY_STATIC_CAST(simde_float64, a_.i32[i]); - } - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepi32_pd - #define _mm256_cvtepi32_pd(a) simde_mm256_cvtepi32_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 - simde_mm256_cvtepi32_ps (simde__m256i a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cvtepi32_ps(a); - #else - simde__m256_private r_; - simde__m256i_private a_ = simde__m256i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = HEDLEY_STATIC_CAST(simde_float32, a_.i32[i]); - } - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepi32_ps - #define _mm256_cvtepi32_ps(a) simde_mm256_cvtepi32_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm256_cvtpd_epi32 (simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cvtpd_epi32(a); - #else - simde__m128i_private r_; - simde__m256d_private a_ = simde__m256d_to_private(a); - - #if defined(simde_math_nearbyint) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyint(a_.f64[i])); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtpd_epi32 - #define _mm256_cvtpd_epi32(a) simde_mm256_cvtpd_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm256_cvtpd_ps (simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cvtpd_ps(a); - #else - simde__m128_private r_; - simde__m256d_private a_ = simde__m256d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = HEDLEY_STATIC_CAST(simde_float32, a_.f64[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtpd_ps - #define _mm256_cvtpd_ps(a) simde_mm256_cvtpd_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtps_epi32 (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cvtps_epi32(a); - #else - simde__m256i_private r_; - simde__m256_private a_ = simde__m256_to_private(a); - - #if defined(simde_math_nearbyintf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyintf(a_.f32[i])); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtps_epi32 - #define _mm256_cvtps_epi32(a) simde_mm256_cvtps_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_cvtps_pd (simde__m128 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cvtps_pd(a); - #else - simde__m256d_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.f64[i] = HEDLEY_STATIC_CAST(double, a_.f32[i]); - } - - return 
simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtps_pd - #define _mm256_cvtps_pd(a) simde_mm256_cvtps_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde_float64 -simde_mm256_cvtsd_f64 (simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) && ( \ - SIMDE_DETECT_CLANG_VERSION_CHECK(3,9,0) || \ - HEDLEY_GCC_VERSION_CHECK(7,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_MSVC_VERSION_CHECK(19,14,0)) - return _mm256_cvtsd_f64(a); - #else - simde__m256d_private a_ = simde__m256d_to_private(a); - return a_.f64[0]; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtsd_f64 - #define _mm256_cvtsd_f64(a) simde_mm256_cvtsd_f64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm256_cvtsi256_si32 (simde__m256i a) { - #if defined(SIMDE_X86_AVX_NATIVE) && ( \ - SIMDE_DETECT_CLANG_VERSION_CHECK(3,9,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_MSVC_VERSION_CHECK(19,14,0)) - return _mm256_cvtsi256_si32(a); - #else - simde__m256i_private a_ = simde__m256i_to_private(a); - return a_.i32[0]; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtsi256_si32 - #define _mm256_cvtsi256_si32(a) simde_mm256_cvtsi256_si32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde_float32 -simde_mm256_cvtss_f32 (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) && ( \ - SIMDE_DETECT_CLANG_VERSION_CHECK(3,9,0) || \ - HEDLEY_GCC_VERSION_CHECK(7,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) || \ - HEDLEY_MSVC_VERSION_CHECK(19,14,0)) - return _mm256_cvtss_f32(a); - #else - simde__m256_private a_ = simde__m256_to_private(a); - return a_.f32[0]; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtss_f32 - #define _mm256_cvtss_f32(a) simde_mm256_cvtss_f32(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm256_cvttpd_epi32 (simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cvttpd_epi32(a); - #else - simde__m128i_private r_; - simde__m256d_private a_ = simde__m256d_to_private(a); - - #if defined(simde_math_trunc) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_trunc(a_.f64[i])); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvttpd_epi32 - #define _mm256_cvttpd_epi32(a) simde_mm256_cvttpd_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvttps_epi32 (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cvttps_epi32(a); - #else - simde__m256i_private r_; - simde__m256_private a_ = simde__m256_to_private(a); - - #if defined(simde_math_truncf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_truncf(a_.f32[i])); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvttps_epi32 - #define _mm256_cvttps_epi32(a) simde_mm256_cvttps_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_div_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_div_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = 
simde_mm_div_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_mm_div_ps(a_.m128[1], b_.m128[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 / b_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i] / b_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_div_ps - #define _mm256_div_ps(a, b) simde_mm256_div_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_div_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_div_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_div_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = simde_mm_div_pd(a_.m128d[1], b_.m128d[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 / b_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[i] / b_.f64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_div_pd - #define _mm256_div_pd(a, b) simde_mm256_div_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm256_extractf128_pd (simde__m256d a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m256d_private a_ = simde__m256d_to_private(a); - return a_.m128d[imm8]; -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm256_extractf128_pd(a, imm8) _mm256_extractf128_pd(a, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_extractf128_pd - #define _mm256_extractf128_pd(a, imm8) simde_mm256_extractf128_pd(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm256_extractf128_ps (simde__m256 a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m256_private a_ = simde__m256_to_private(a); - return a_.m128[imm8]; -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm256_extractf128_ps(a, imm8) _mm256_extractf128_ps(a, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_extractf128_ps - #define _mm256_extractf128_ps(a, imm8) simde_mm256_extractf128_ps(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm256_extractf128_si256 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m256i_private a_ = simde__m256i_to_private(a); - return a_.m128i[imm8]; -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm256_extractf128_si256(a, imm8) _mm256_extractf128_si256(a, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_extractf128_si256 - #define _mm256_extractf128_si256(a, imm8) simde_mm256_extractf128_si256(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_floor_pd (simde__m256d a) { - return simde_mm256_round_pd(a, SIMDE_MM_FROUND_TO_NEG_INF); -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_floor_pd - #define _mm256_floor_pd(a) simde_mm256_floor_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_floor_ps (simde__m256 a) { - return simde_mm256_round_ps(a, SIMDE_MM_FROUND_TO_NEG_INF); -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_floor_ps - #define _mm256_floor_ps(a) simde_mm256_floor_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_insert_epi8 
(simde__m256i a, int8_t i, const int index) - SIMDE_REQUIRE_RANGE(index, 0, 31) { - simde__m256i_private a_ = simde__m256i_to_private(a); - - a_.i8[index] = i; - - return simde__m256i_from_private(a_); -} -#if defined(SIMDE_X86_AVX_NATIVE) && \ - (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) - #define simde_mm256_insert_epi8(a, i, index) _mm256_insert_epi8(a, i, index) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_insert_epi8 - #define _mm256_insert_epi8(a, i, index) simde_mm256_insert_epi8(a, i, index) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_insert_epi16 (simde__m256i a, int16_t i, const int index) - SIMDE_REQUIRE_RANGE(index, 0, 15) { - simde__m256i_private a_ = simde__m256i_to_private(a); - - a_.i16[index] = i; - - return simde__m256i_from_private(a_); -} -#if defined(SIMDE_X86_AVX_NATIVE) && \ - (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) - #define simde_mm256_insert_epi16(a, i, index) _mm256_insert_epi16(a, i, index) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_insert_epi16 - #define _mm256_insert_epi16(a, i, imm8) simde_mm256_insert_epi16(a, i, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_insert_epi32 (simde__m256i a, int32_t i, const int index) - SIMDE_REQUIRE_RANGE(index, 0, 7) { - simde__m256i_private a_ = simde__m256i_to_private(a); - - a_.i32[index] = i; - - return simde__m256i_from_private(a_); -} -#if defined(SIMDE_X86_AVX_NATIVE) && \ - (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) - #define simde_mm256_insert_epi32(a, i, index) _mm256_insert_epi32(a, i, index) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_insert_epi32 - #define _mm256_insert_epi32(a, i, index) simde_mm256_insert_epi32(a, i, index) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_insert_epi64 (simde__m256i a, int64_t i, const int index) - SIMDE_REQUIRE_RANGE(index, 0, 3) { - simde__m256i_private a_ = simde__m256i_to_private(a); - - a_.i64[index] = i; - - return simde__m256i_from_private(a_); -} -#if defined(SIMDE_X86_AVX_NATIVE) && defined(SIMDE_ARCH_AMD64) && \ - (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) && \ - SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0) - #define simde_mm256_insert_epi64(a, i, index) _mm256_insert_epi64(a, i, index) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #undef _mm256_insert_epi64 - #define _mm256_insert_epi64(a, i, index) simde_mm256_insert_epi64(a, i, index) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d simde_mm256_insertf128_pd(simde__m256d a, simde__m128d b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m256d_private a_ = simde__m256d_to_private(a); - simde__m128d_private b_ = simde__m128d_to_private(b); - - a_.m128d_private[imm8] = b_; - - return simde__m256d_from_private(a_); -} -#if defined(SIMDE_X86_AVX_NATIVE) - #define simde_mm256_insertf128_pd(a, b, imm8) _mm256_insertf128_pd(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_insertf128_pd - #define _mm256_insertf128_pd(a, b, imm8) simde_mm256_insertf128_pd(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 simde_mm256_insertf128_ps(simde__m256 a, simde__m128 b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m256_private a_ = simde__m256_to_private(a); - simde__m128_private b_ = simde__m128_to_private(b); - - 
a_.m128_private[imm8] = b_; - - return simde__m256_from_private(a_); -} -#if defined(SIMDE_X86_AVX_NATIVE) - #define simde_mm256_insertf128_ps(a, b, imm8) _mm256_insertf128_ps(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_insertf128_ps - #define _mm256_insertf128_ps(a, b, imm8) simde_mm256_insertf128_ps(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i simde_mm256_insertf128_si256(simde__m256i a, simde__m128i b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m256i_private a_ = simde__m256i_to_private(a); - simde__m128i_private b_ = simde__m128i_to_private(b); - - a_.m128i_private[imm8] = b_; - - return simde__m256i_from_private(a_); -} -#if defined(SIMDE_X86_AVX_NATIVE) - #define simde_mm256_insertf128_si256(a, b, imm8) _mm256_insertf128_si256(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_insertf128_si256 - #define _mm256_insertf128_si256(a, b, imm8) simde_mm256_insertf128_si256(a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm256_dp_ps(a, b, imm8) _mm256_dp_ps(a, b, imm8) -#else -# define simde_mm256_dp_ps(a, b, imm8) \ - simde_mm256_set_m128( \ - simde_mm_dp_ps(simde_mm256_extractf128_ps(a, 1), simde_mm256_extractf128_ps(b, 1), imm8), \ - simde_mm_dp_ps(simde_mm256_extractf128_ps(a, 0), simde_mm256_extractf128_ps(b, 0), imm8)) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_dp_ps - #define _mm256_dp_ps(a, b, imm8) simde_mm256_dp_ps(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm256_extract_epi32 (simde__m256i a, const int index) - SIMDE_REQUIRE_RANGE(index, 0, 7) { - simde__m256i_private a_ = simde__m256i_to_private(a); - return a_.i32[index]; -} -#if defined(SIMDE_X86_AVX_NATIVE) && \ - (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) - #define simde_mm256_extract_epi32(a, index) _mm256_extract_epi32(a, index) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_extract_epi32 - #define _mm256_extract_epi32(a, index) simde_mm256_extract_epi32(a, index) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int64_t -simde_mm256_extract_epi64 (simde__m256i a, const int index) - SIMDE_REQUIRE_RANGE(index, 0, 3) { - simde__m256i_private a_ = simde__m256i_to_private(a); - return a_.i64[index]; -} -#if defined(SIMDE_X86_AVX_NATIVE) && defined(SIMDE_ARCH_AMD64) - #if !defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0) - #define simde_mm256_extract_epi64(a, index) _mm256_extract_epi64(a, index) - #endif -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #undef _mm256_extract_epi64 - #define _mm256_extract_epi64(a, index) simde_mm256_extract_epi64(a, index) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_lddqu_si256 (simde__m256i const * mem_addr) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_loadu_si256(mem_addr); - #else - simde__m256i r; - simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_lddqu_si256 - #define _mm256_lddqu_si256(a) simde_mm256_lddqu_si256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_load_pd (const double mem_addr[HEDLEY_ARRAY_PARAM(4)]) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_load_pd(mem_addr); - #else - simde__m256d r; - simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), sizeof(r)); - 
return r; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_load_pd - #define _mm256_load_pd(a) simde_mm256_load_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_load_ps (const float mem_addr[HEDLEY_ARRAY_PARAM(8)]) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_load_ps(mem_addr); - #else - simde__m256 r; - simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_load_ps - #define _mm256_load_ps(a) simde_mm256_load_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_load_si256 (simde__m256i const * mem_addr) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_load_si256(mem_addr); - #else - simde__m256i r; - simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_load_si256 - #define _mm256_load_si256(a) simde_mm256_load_si256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_loadu_pd (const double a[HEDLEY_ARRAY_PARAM(4)]) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_loadu_pd(a); - #else - simde__m256d r; - simde_memcpy(&r, a, sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_loadu_pd - #define _mm256_loadu_pd(a) simde_mm256_loadu_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_loadu_ps (const float a[HEDLEY_ARRAY_PARAM(8)]) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_loadu_ps(a); - #else - simde__m256 r; - simde_memcpy(&r, a, sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_loadu_ps - #define _mm256_loadu_ps(a) simde_mm256_loadu_ps(a) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) \ - && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - #define simde_mm256_loadu_epi8(mem_addr) _mm256_loadu_epi8(mem_addr) -#else -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_loadu_epi8(void const * mem_addr) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr)); - #else - simde__m256i r; - simde_memcpy(&r, mem_addr, sizeof(r)); - return r; - #endif -} -#endif -#define simde_x_mm256_loadu_epi8(mem_addr) simde_mm256_loadu_epi8(mem_addr) -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) - #undef _mm256_loadu_epi8 - #define _mm256_loadu_epi8(a) simde_mm256_loadu_epi8(a) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) \ - && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - #define simde_mm256_loadu_epi16(mem_addr) _mm256_loadu_epi16(mem_addr) -#else -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_loadu_epi16(void const * mem_addr) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr)); - #else - simde__m256i r; - simde_memcpy(&r, mem_addr, sizeof(r)); - return r; - #endif -} -#endif -#define simde_x_mm256_loadu_epi16(mem_addr) simde_mm256_loadu_epi16(mem_addr) -#if 
defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) - #undef _mm256_loadu_epi16 - #define _mm256_loadu_epi16(a) simde_mm256_loadu_epi16(a) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ - && !defined(SIMDE_BUG_CLANG_REV_344862) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - #define simde_mm256_loadu_epi32(mem_addr) _mm256_loadu_epi32(mem_addr) -#else -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_loadu_epi32(void const * mem_addr) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr)); - #else - simde__m256i r; - simde_memcpy(&r, mem_addr, sizeof(r)); - return r; - #endif -} -#endif -#define simde_x_mm256_loadu_epi32(mem_addr) simde_mm256_loadu_epi32(mem_addr) -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) - #undef _mm256_loadu_epi32 - #define _mm256_loadu_epi32(a) simde_mm256_loadu_epi32(a) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ - && !defined(SIMDE_BUG_CLANG_REV_344862) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - #define simde_mm256_loadu_epi64(mem_addr) _mm256_loadu_epi64(mem_addr) -#else -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_loadu_epi64(void const * mem_addr) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_loadu_si256(SIMDE_ALIGN_CAST(__m256i const *, mem_addr)); - #else - simde__m256i r; - simde_memcpy(&r, mem_addr, sizeof(r)); - return r; - #endif -} -#endif -#define simde_x_mm256_loadu_epi64(mem_addr) simde_mm256_loadu_epi64(mem_addr) -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) - #undef _mm256_loadu_epi64 - #define _mm256_loadu_epi64(a) simde_mm256_loadu_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_loadu_si256 (void const * mem_addr) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_loadu_si256(SIMDE_ALIGN_CAST(const __m256i*, mem_addr)); - #else - simde__m256i r; - simde_memcpy(&r, mem_addr, sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_loadu_si256 - #define _mm256_loadu_si256(mem_addr) simde_mm256_loadu_si256(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_loadu2_m128 (const float hiaddr[HEDLEY_ARRAY_PARAM(4)], const float loaddr[HEDLEY_ARRAY_PARAM(4)]) { - #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS) - return _mm256_loadu2_m128(hiaddr, loaddr); - #else - return - simde_mm256_insertf128_ps(simde_mm256_castps128_ps256(simde_mm_loadu_ps(loaddr)), - simde_mm_loadu_ps(hiaddr), 1); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_loadu2_m128 - #define _mm256_loadu2_m128(hiaddr, loaddr) simde_mm256_loadu2_m128(hiaddr, loaddr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_loadu2_m128d (const double hiaddr[HEDLEY_ARRAY_PARAM(2)], const double loaddr[HEDLEY_ARRAY_PARAM(2)]) { - #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS) 
- return _mm256_loadu2_m128d(hiaddr, loaddr); - #else - return - simde_mm256_insertf128_pd(simde_mm256_castpd128_pd256(simde_mm_loadu_pd(loaddr)), - simde_mm_loadu_pd(hiaddr), 1); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_loadu2_m128d - #define _mm256_loadu2_m128d(hiaddr, loaddr) simde_mm256_loadu2_m128d(hiaddr, loaddr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_loadu2_m128i (const simde__m128i* hiaddr, const simde__m128i* loaddr) { - #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS) - return _mm256_loadu2_m128i(hiaddr, loaddr); - #else - return - simde_mm256_insertf128_si256(simde_mm256_castsi128_si256(simde_mm_loadu_si128(loaddr)), - simde_mm_loadu_si128(hiaddr), 1); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_loadu2_m128i - #define _mm256_loadu2_m128i(hiaddr, loaddr) simde_mm256_loadu2_m128i(hiaddr, loaddr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_maskload_pd (const simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask) { - #if defined(SIMDE_X86_AVX_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - return _mm_maskload_pd(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m128d, mask)); - #else - return _mm_maskload_pd(mem_addr, mask); - #endif - #else - simde__m128d_private r_; - simde__m128i_private - mask_ = simde__m128i_to_private(mask), - mask_shr_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - mask_shr_.neon_i64 = vshrq_n_s64(mask_.neon_i64, 63); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde_mm_and_pd(simde_mm_load_pd(mem_addr), - simde__m128d_from_wasm_v128(wasm_i64x2_shr(mask_.wasm_v128, 63))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(mask_.i64) / sizeof(mask_.i64[0])) ; i++) { - mask_shr_.i64[i] = mask_.i64[i] >> 63; - } - #endif - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = mask_shr_.i64[i] ? mem_addr[i] : SIMDE_FLOAT64_C(0.0); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_maskload_pd - #define _mm_maskload_pd(mem_addr, mask) simde_mm_maskload_pd(HEDLEY_REINTERPRET_CAST(double const*, mem_addr), mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_maskload_pd (const simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask) { - #if defined(SIMDE_X86_AVX_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - return _mm256_maskload_pd(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m256d, mask)); - #else - return _mm256_maskload_pd(mem_addr, mask); - #endif - #else - simde__m256d_private r_; - simde__m256i_private mask_ = simde__m256i_to_private(mask); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (mask_.i64[i] >> 63) ? 
mem_addr[i] : SIMDE_FLOAT64_C(0.0); - } - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskload_pd - #define _mm256_maskload_pd(mem_addr, mask) simde_mm256_maskload_pd(HEDLEY_REINTERPRET_CAST(double const*, mem_addr), mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_maskload_ps (const simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask) { - #if defined(SIMDE_X86_AVX_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - return _mm_maskload_ps(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m128, mask)); - #else - return _mm_maskload_ps(mem_addr, mask); - #endif - #else - simde__m128_private r_; - simde__m128i_private - mask_ = simde__m128i_to_private(mask), - mask_shr_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - mask_shr_.neon_i32 = vshrq_n_s32(mask_.neon_i32, 31); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde_mm_and_ps(simde_mm_load_ps(mem_addr), - simde__m128_from_wasm_v128(wasm_i32x4_shr(mask_.wasm_v128, 31))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(mask_.i32) / sizeof(mask_.i32[0])) ; i++) { - mask_shr_.i32[i] = mask_.i32[i] >> 31; - } - #endif - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = mask_shr_.i32[i] ? mem_addr[i] : SIMDE_FLOAT32_C(0.0); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_maskload_ps - #define _mm_maskload_ps(mem_addr, mask) simde_mm_maskload_ps(HEDLEY_REINTERPRET_CAST(float const*, mem_addr), mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_maskload_ps (const simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m256i mask) { - #if defined(SIMDE_X86_AVX_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - return _mm256_maskload_ps(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m256, mask)); - #else - return _mm256_maskload_ps(mem_addr, mask); - #endif - #else - simde__m256_private r_; - simde__m256i_private mask_ = simde__m256i_to_private(mask); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (mask_.i32[i] >> 31) ? 
mem_addr[i] : SIMDE_FLOAT32_C(0.0); - } - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskload_ps - #define _mm256_maskload_ps(mem_addr, mask) simde_mm256_maskload_ps(HEDLEY_REINTERPRET_CAST(float const*, mem_addr), mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_maskstore_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask, simde__m128d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - _mm_maskstore_pd(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m128d, mask), a); - #else - _mm_maskstore_pd(mem_addr, mask, a); - #endif - #else - simde__m128i_private mask_ = simde__m128i_to_private(mask); - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i64x2_extract_lane(mask_.wasm_v128, 0)) & 0x8000000000000000ull) != 0) - mem_addr[0] = wasm_f64x2_extract_lane(a_.wasm_v128, 0); - if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i64x2_extract_lane(mask_.wasm_v128, 1)) & 0x8000000000000000ull) != 0) - mem_addr[1] = wasm_f64x2_extract_lane(a_.wasm_v128, 1); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - if (mask_.u64[i] >> 63) - mem_addr[i] = a_.f64[i]; - } - #endif - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_maskstore_pd - #define _mm_maskstore_pd(mem_addr, mask, a) simde_mm_maskstore_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), mask, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_maskstore_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask, simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - _mm256_maskstore_pd(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m256d, mask), a); - #else - _mm256_maskstore_pd(mem_addr, mask, a); - #endif - #else - simde__m256i_private mask_ = simde__m256i_to_private(mask); - simde__m256d_private a_ = simde__m256d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - if (mask_.u64[i] & (UINT64_C(1) << 63)) - mem_addr[i] = a_.f64[i]; - } - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskstore_pd - #define _mm256_maskstore_pd(mem_addr, mask, a) simde_mm256_maskstore_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), mask, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_maskstore_ps (simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask, simde__m128 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - _mm_maskstore_ps(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m128, mask), a); - #else - _mm_maskstore_ps(mem_addr, mask, a); - #endif - #else - simde__m128i_private mask_ = simde__m128i_to_private(mask); - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i32x4_extract_lane(mask_.wasm_v128, 0)) & 0x80000000ull) != 0) - mem_addr[0] = wasm_f32x4_extract_lane(a_.wasm_v128, 0); - if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i32x4_extract_lane(mask_.wasm_v128, 1)) & 0x80000000ull) != 0) - mem_addr[1] = wasm_f32x4_extract_lane(a_.wasm_v128, 1); - if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i32x4_extract_lane(mask_.wasm_v128, 2)) & 0x80000000ull) != 0) - mem_addr[2] = 
wasm_f32x4_extract_lane(a_.wasm_v128, 2); - if ((HEDLEY_STATIC_CAST(unsigned long long, wasm_i32x4_extract_lane(mask_.wasm_v128, 3)) & 0x80000000ull) != 0) - mem_addr[3] = wasm_f32x4_extract_lane(a_.wasm_v128, 3); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - if (mask_.u32[i] & (UINT32_C(1) << 31)) - mem_addr[i] = a_.f32[i]; - } - #endif - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_maskstore_ps - #define _mm_maskstore_ps(mem_addr, mask, a) simde_mm_maskstore_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), mask, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_maskstore_ps (simde_float32 mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m256i mask, simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - _mm256_maskstore_ps(mem_addr, HEDLEY_REINTERPRET_CAST(simde__m256, mask), a); - #else - _mm256_maskstore_ps(mem_addr, mask, a); - #endif - #else - simde__m256i_private mask_ = simde__m256i_to_private(mask); - simde__m256_private a_ = simde__m256_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - if (mask_.u32[i] & (UINT32_C(1) << 31)) - mem_addr[i] = a_.f32[i]; - } - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskstore_ps - #define _mm256_maskstore_ps(mem_addr, mask, a) simde_mm256_maskstore_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), mask, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_min_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_min_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_min_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_mm_min_ps(a_.m128[1], b_.m128[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (a_.f32[i] < b_.f32[i]) ? a_.f32[i] : b_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_min_ps - #define _mm256_min_ps(a, b) simde_mm256_min_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_min_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_min_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_min_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = simde_mm_min_pd(a_.m128d[1], b_.m128d[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (a_.f64[i] < b_.f64[i]) ? 
a_.f64[i] : b_.f64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_min_pd - #define _mm256_min_pd(a, b) simde_mm256_min_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_max_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_max_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_max_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_mm_max_ps(a_.m128[1], b_.m128[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (a_.f32[i] > b_.f32[i]) ? a_.f32[i] : b_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_max_ps - #define _mm256_max_ps(a, b) simde_mm256_max_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_max_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_max_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_max_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = simde_mm_max_pd(a_.m128d[1], b_.m128d[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (a_.f64[i] > b_.f64[i]) ? a_.f64[i] : b_.f64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_max_pd - #define _mm256_max_pd(a, b) simde_mm256_max_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_movedup_pd (simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_movedup_pd(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, a_.f64, 0, 0, 2, 2); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) { - r_.f64[i] = r_.f64[i + 1] = a_.f64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_movedup_pd - #define _mm256_movedup_pd(a) simde_mm256_movedup_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_movehdup_ps (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_movehdup_ps(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, a_.f32, 1, 1, 3, 3, 5, 5, 7, 7); - #else - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) { - r_.f32[i - 1] = r_.f32[i] = a_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_movehdup_ps - #define _mm256_movehdup_ps(a) simde_mm256_movehdup_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_moveldup_ps (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_moveldup_ps(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, a_.f32, 0, 0, 2, 2, 4, 4, 6, 6); - #else - SIMDE_VECTORIZE 
- for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) { - r_.f32[i] = r_.f32[i + 1] = a_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_moveldup_ps - #define _mm256_moveldup_ps(a) simde_mm256_moveldup_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_movemask_ps (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_movemask_ps(a); - #else - simde__m256_private a_ = simde__m256_to_private(a); - int r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r |= (a_.u32[i] >> 31) << i; - } - - return r; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_movemask_ps - #define _mm256_movemask_ps(a) simde_mm256_movemask_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_movemask_pd (simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_movemask_pd(a); - #else - simde__m256d_private a_ = simde__m256d_to_private(a); - int r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - r |= (a_.u64[i] >> 63) << i; - } - - return r; - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_movemask_pd - #define _mm256_movemask_pd(a) simde_mm256_movemask_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_mul_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_mul_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_mul_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_mm_mul_ps(a_.m128[1], b_.m128[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 * b_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i] * b_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_mul_ps - #define _mm256_mul_ps(a, b) simde_mm256_mul_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_mul_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_mul_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_mul_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = simde_mm_mul_pd(a_.m128d[1], b_.m128d[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 * b_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[i] * b_.f64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_mul_pd - #define _mm256_mul_pd(a, b) simde_mm256_mul_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_or_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_or_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_or_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_mm_or_ps(a_.m128[1], b_.m128[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f | b_.i32f; 
- #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] | b_.u32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_or_ps - #define _mm256_or_ps(a, b) simde_mm256_or_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_or_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_or_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_or_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = simde_mm_or_pd(a_.m128d[1], b_.m128d[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f | b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] | b_.u64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_or_pd - #define _mm256_or_pd(a, b) simde_mm256_or_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_permute_ps (simde__m256 a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.m128_private[i >> 2].f32[(imm8 >> ((i << 1) & 7)) & 3]; - } - - return simde__m256_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm256_permute_ps(a, imm8) _mm256_permute_ps(a, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_permute_ps - #define _mm256_permute_ps(a, imm8) simde_mm256_permute_ps(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_permute_pd (simde__m256d a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[((imm8 >> i) & 1) + (i & 2)]; - } - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm256_permute_pd(a, imm8) _mm256_permute_pd(a, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_permute_pd - #define _mm256_permute_pd(a, imm8) simde_mm256_permute_pd(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_permute_ps (simde__m128 a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[(imm8 >> ((i << 1) & 7)) & 3]; - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm_permute_ps(a, imm8) _mm_permute_ps(a, imm8) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) -# define simde_mm_permute_ps(a, imm8) simde__m128_from_wasm_v128(wasm_i32x4_shuffle(simde__m128_to_wasm_v128(a), simde__m128_to_wasm_v128(a), ((imm8) & 3), (((imm8) >> 2) & 3 ), (((imm8) >> 4) & 3), (((imm8) >> 6) & 3))) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_permute_ps - #define _mm_permute_ps(a, imm8) simde_mm_permute_ps(a, imm8) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_permute_pd (simde__m128d a, const int imm8) - 
SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[((imm8 >> i) & 1) + (i & 2)]; - } - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm_permute_pd(a, imm8) _mm_permute_pd(a, imm8) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) -# define simde_mm_permute_pd(a, imm8) simde__m128d_from_wasm_v128(wasm_i64x2_shuffle(simde__m128d_to_wasm_v128(a), simde__m128d_to_wasm_v128(a), ((imm8) & 1), (((imm8) >> 1) & 1 ))) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_permute_pd - #define _mm_permute_pd(a, imm8) simde_mm_permute_pd(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_permutevar_ps (simde__m128 a, simde__m128i b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm_permutevar_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - simde__m128i_private b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_make( - (a_.f32[wasm_i32x4_extract_lane(b_.wasm_v128, 0) & 3]), - (a_.f32[wasm_i32x4_extract_lane(b_.wasm_v128, 1) & 3]), - (a_.f32[wasm_i32x4_extract_lane(b_.wasm_v128, 2) & 3]), - (a_.f32[wasm_i32x4_extract_lane(b_.wasm_v128, 3) & 3])); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[b_.i32[i] & 3]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_permutevar_ps - #define _mm_permutevar_ps(a, b) simde_mm_permutevar_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_permutevar_pd (simde__m128d a, simde__m128i b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm_permutevar_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - simde__m128i_private b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_make( - (a_.f64[(wasm_i64x2_extract_lane(b_.wasm_v128, 0) >> 1) & 1]), - (a_.f64[(wasm_i64x2_extract_lane(b_.wasm_v128, 1) >> 1) & 1])); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[(b_.i64[i] & 2) >> 1]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_permutevar_pd - #define _mm_permutevar_pd(a, b) simde_mm_permutevar_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_permutevar_ps (simde__m256 a, simde__m256i b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_permutevar_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - simde__m256i_private b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[(b_.i32[i] & 3) + (i & 4)]; - } - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutevar_ps - #define _mm256_permutevar_ps(a, b) simde_mm256_permutevar_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_permutevar_pd (simde__m256d a, simde__m256i b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_permutevar_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - simde__m256i_private b_ = 
simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[((b_.i64[i] & 2) >> 1) + (i & 2)]; - } - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutevar_pd - #define _mm256_permutevar_pd(a, b) simde_mm256_permutevar_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_permute2f128_ps (simde__m256 a, simde__m256 b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - r_.m128_private[0] = (imm8 & 0x08) ? simde__m128_to_private(simde_mm_setzero_ps()) : ((imm8 & 0x02) ? b_.m128_private[(imm8 ) & 1] : a_.m128_private[(imm8 ) & 1]); - r_.m128_private[1] = (imm8 & 0x80) ? simde__m128_to_private(simde_mm_setzero_ps()) : ((imm8 & 0x20) ? b_.m128_private[(imm8 >> 4) & 1] : a_.m128_private[(imm8 >> 4) & 1]); - - return simde__m256_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm256_permute2f128_ps(a, b, imm8) _mm256_permute2f128_ps(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_permute2f128_ps - #define _mm256_permute2f128_ps(a, b, imm8) simde_mm256_permute2f128_ps(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_permute2f128_pd (simde__m256d a, simde__m256d b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - r_.m128d_private[0] = (imm8 & 0x08) ? simde__m128d_to_private(simde_mm_setzero_pd()) : ((imm8 & 0x02) ? b_.m128d_private[(imm8 ) & 1] : a_.m128d_private[(imm8 ) & 1]); - r_.m128d_private[1] = (imm8 & 0x80) ? simde__m128d_to_private(simde_mm_setzero_pd()) : ((imm8 & 0x20) ? b_.m128d_private[(imm8 >> 4) & 1] : a_.m128d_private[(imm8 >> 4) & 1]); - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm256_permute2f128_pd(a, b, imm8) _mm256_permute2f128_pd(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_permute2f128_pd - #define _mm256_permute2f128_pd(a, b, imm8) simde_mm256_permute2f128_pd(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_permute2f128_si256 (simde__m256i a, simde__m256i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - r_.m128i_private[0] = (imm8 & 0x08) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x02) ? b_.m128i_private[(imm8 ) & 1] : a_.m128i_private[(imm8 ) & 1]); - r_.m128i_private[1] = (imm8 & 0x80) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x20) ? 
b_.m128i_private[(imm8 >> 4) & 1] : a_.m128i_private[(imm8 >> 4) & 1]); - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) -# define simde_mm256_permute2f128_si128(a, b, imm8) _mm256_permute2f128_si128(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_permute2f128_si256 - #define _mm256_permute2f128_si256(a, b, imm8) simde_mm256_permute2f128_si256(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_rcp_ps (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_rcp_ps(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_rcp_ps(a_.m128[0]); - r_.m128[1] = simde_mm_rcp_ps(a_.m128[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = SIMDE_FLOAT32_C(1.0) / a_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_rcp_ps - #define _mm256_rcp_ps(a) simde_mm256_rcp_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_rsqrt_ps (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_rsqrt_ps(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if defined(simde_math_sqrtf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = 1.0f / simde_math_sqrtf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_rsqrt_ps - #define _mm256_rsqrt_ps(a) simde_mm256_rsqrt_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_setr_epi8 ( - int8_t e31, int8_t e30, int8_t e29, int8_t e28, int8_t e27, int8_t e26, int8_t e25, int8_t e24, - int8_t e23, int8_t e22, int8_t e21, int8_t e20, int8_t e19, int8_t e18, int8_t e17, int8_t e16, - int8_t e15, int8_t e14, int8_t e13, int8_t e12, int8_t e11, int8_t e10, int8_t e9, int8_t e8, - int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_setr_epi8( - e31, e30, e29, e28, e27, e26, e25, e24, - e23, e22, e21, e20, e19, e18, e17, e16, - e15, e14, e13, e12, e11, e10, e9, e8, - e7, e6, e5, e4, e3, e2, e1, e0); - #else - return simde_mm256_set_epi8( - e0, e1, e2, e3, e4, e5, e6, e7, - e8, e9, e10, e11, e12, e13, e14, e15, - e16, e17, e18, e19, e20, e21, e22, e23, - e24, e25, e26, e27, e28, e29, e30, e31); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setr_epi8 - #define _mm256_setr_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \ - simde_mm256_setr_epi8(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_setr_epi16 ( - int16_t e15, int16_t e14, int16_t e13, int16_t e12, int16_t e11, int16_t e10, int16_t e9, int16_t e8, - int16_t e7, int16_t e6, int16_t e5, int16_t e4, int16_t e3, int16_t e2, int16_t e1, int16_t e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_setr_epi16( - e15, e14, e13, e12, e11, e10, e9, e8, - e7, e6, e5, e4, e3, e2, e1, e0); - #else - return simde_mm256_set_epi16( - e0, e1, e2, e3, e4, e5, e6, e7, - 
e8, e9, e10, e11, e12, e13, e14, e15); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setr_epi16 - #define _mm256_setr_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \ - simde_mm256_setr_epi16(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_setr_epi32 ( - int32_t e7, int32_t e6, int32_t e5, int32_t e4, int32_t e3, int32_t e2, int32_t e1, int32_t e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0); - #else - return simde_mm256_set_epi32(e0, e1, e2, e3, e4, e5, e6, e7); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setr_epi32 - #define _mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0) \ - simde_mm256_setr_epi32(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_setr_epi64x (int64_t e3, int64_t e2, int64_t e1, int64_t e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_setr_epi64x(e3, e2, e1, e0); - #else - return simde_mm256_set_epi64x(e0, e1, e2, e3); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setr_epi64x - #define _mm256_setr_epi64x(e3, e2, e1, e0) \ - simde_mm256_setr_epi64x(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_setr_ps ( - simde_float32 e7, simde_float32 e6, simde_float32 e5, simde_float32 e4, - simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_setr_ps(e7, e6, e5, e4, e3, e2, e1, e0); - #else - return simde_mm256_set_ps(e0, e1, e2, e3, e4, e5, e6, e7); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setr_ps - #define _mm256_setr_ps(e7, e6, e5, e4, e3, e2, e1, e0) \ - simde_mm256_setr_ps(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_setr_pd (simde_float64 e3, simde_float64 e2, simde_float64 e1, simde_float64 e0) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_setr_pd(e3, e2, e1, e0); - #else - return simde_mm256_set_pd(e0, e1, e2, e3); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setr_pd - #define _mm256_setr_pd(e3, e2, e1, e0) \ - simde_mm256_setr_pd(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_setr_m128 (simde__m128 lo, simde__m128 hi) { - #if defined(SIMDE_X86_AVX_NATIVE) && \ - !defined(SIMDE_BUG_GCC_REV_247851) && \ - SIMDE_DETECT_CLANG_VERSION_CHECK(3,6,0) - return _mm256_setr_m128(lo, hi); - #else - return simde_mm256_set_m128(hi, lo); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setr_m128 - #define _mm256_setr_m128(lo, hi) \ - simde_mm256_setr_m128(lo, hi) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_setr_m128d (simde__m128d lo, simde__m128d hi) { - #if defined(SIMDE_X86_AVX_NATIVE) && \ - !defined(SIMDE_BUG_GCC_REV_247851) && \ - SIMDE_DETECT_CLANG_VERSION_CHECK(3,6,0) - return _mm256_setr_m128d(lo, hi); - #else - return simde_mm256_set_m128d(hi, lo); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setr_m128d - #define _mm256_setr_m128d(lo, hi) \ - simde_mm256_setr_m128d(lo, hi) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_setr_m128i (simde__m128i lo, simde__m128i hi) { - #if defined(SIMDE_X86_AVX_NATIVE) && \ - !defined(SIMDE_BUG_GCC_REV_247851) && \ - SIMDE_DETECT_CLANG_VERSION_CHECK(3,6,0) - return 
_mm256_setr_m128i(lo, hi); - #else - return simde_mm256_set_m128i(hi, lo); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_setr_m128i - #define _mm256_setr_m128i(lo, hi) \ - simde_mm256_setr_m128i(lo, hi) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_shuffle_ps (simde__m256 a, simde__m256 b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - r_.f32[0] = a_.m128_private[0].f32[(imm8 >> 0) & 3]; - r_.f32[1] = a_.m128_private[0].f32[(imm8 >> 2) & 3]; - r_.f32[2] = b_.m128_private[0].f32[(imm8 >> 4) & 3]; - r_.f32[3] = b_.m128_private[0].f32[(imm8 >> 6) & 3]; - r_.f32[4] = a_.m128_private[1].f32[(imm8 >> 0) & 3]; - r_.f32[5] = a_.m128_private[1].f32[(imm8 >> 2) & 3]; - r_.f32[6] = b_.m128_private[1].f32[(imm8 >> 4) & 3]; - r_.f32[7] = b_.m128_private[1].f32[(imm8 >> 6) & 3]; - - return simde__m256_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) - #define simde_mm256_shuffle_ps(a, b, imm8) _mm256_shuffle_ps(a, b, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) - #define simde_mm256_shuffle_ps(a, b, imm8) \ - simde_mm256_set_m128( \ - simde_mm_shuffle_ps(simde_mm256_extractf128_ps(a, 1), simde_mm256_extractf128_ps(b, 1), (imm8)), \ - simde_mm_shuffle_ps(simde_mm256_extractf128_ps(a, 0), simde_mm256_extractf128_ps(b, 0), (imm8))) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm256_shuffle_ps(a, b, imm8) \ - SIMDE_SHUFFLE_VECTOR_(32, 32, a, b, \ - (((imm8) >> 0) & 3) + 0, \ - (((imm8) >> 2) & 3) + 0, \ - (((imm8) >> 4) & 3) + 8, \ - (((imm8) >> 6) & 3) + 8, \ - (((imm8) >> 0) & 3) + 4, \ - (((imm8) >> 2) & 3) + 4, \ - (((imm8) >> 4) & 3) + 12, \ - (((imm8) >> 6) & 3) + 12) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_shuffle_ps - #define _mm256_shuffle_ps(a, b, imm8) simde_mm256_shuffle_ps(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_shuffle_pd (simde__m256d a, simde__m256d b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - r_.f64[0] = a_.f64[((imm8 ) & 1) ]; - r_.f64[1] = b_.f64[((imm8 >> 1) & 1) ]; - r_.f64[2] = a_.f64[((imm8 >> 2) & 1) | 2]; - r_.f64[3] = b_.f64[((imm8 >> 3) & 1) | 2]; - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX_NATIVE) - #define simde_mm256_shuffle_pd(a, b, imm8) _mm256_shuffle_pd(a, b, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) - #define simde_mm256_shuffle_pd(a, b, imm8) \ - simde_mm256_set_m128d( \ - simde_mm_shuffle_pd(simde_mm256_extractf128_pd(a, 1), simde_mm256_extractf128_pd(b, 1), (imm8 >> 2) & 3), \ - simde_mm_shuffle_pd(simde_mm256_extractf128_pd(a, 0), simde_mm256_extractf128_pd(b, 0), (imm8 >> 0) & 3)) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm256_shuffle_pd(a, b, imm8) \ - SIMDE_SHUFFLE_VECTOR_(64, 32, a, b, \ - (((imm8) >> 0) & 1) + 0, \ - (((imm8) >> 1) & 1) + 4, \ - (((imm8) >> 2) & 1) + 2, \ - (((imm8) >> 3) & 1) + 6) -#endif -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_shuffle_pd - #define _mm256_shuffle_pd(a, b, imm8) simde_mm256_shuffle_pd(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_sqrt_ps (simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_sqrt_ps(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = 
simde_mm_sqrt_ps(a_.m128[0]); - r_.m128[1] = simde_mm_sqrt_ps(a_.m128[1]); - #elif defined(simde_math_sqrtf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_sqrtf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_sqrt_ps - #define _mm256_sqrt_ps(a) simde_mm256_sqrt_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_sqrt_pd (simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_sqrt_pd(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_sqrt_pd(a_.m128d[0]); - r_.m128d[1] = simde_mm_sqrt_pd(a_.m128d[1]); - #elif defined(simde_math_sqrt) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_sqrt(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_sqrt_pd - #define _mm256_sqrt_pd(a) simde_mm256_sqrt_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_store_ps (simde_float32 mem_addr[8], simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - _mm256_store_ps(mem_addr, a); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_store_ps - #define _mm256_store_ps(mem_addr, a) simde_mm256_store_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_store_pd (simde_float64 mem_addr[4], simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - _mm256_store_pd(mem_addr, a); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_store_pd - #define _mm256_store_pd(mem_addr, a) simde_mm256_store_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_store_si256 (simde__m256i* mem_addr, simde__m256i a) { - #if defined(SIMDE_X86_AVX_NATIVE) - _mm256_store_si256(mem_addr, a); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_store_si256 - #define _mm256_store_si256(mem_addr, a) simde_mm256_store_si256(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_storeu_ps (simde_float32 mem_addr[8], simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - _mm256_storeu_ps(mem_addr, a); - #else - simde_memcpy(mem_addr, &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_storeu_ps - #define _mm256_storeu_ps(mem_addr, a) simde_mm256_storeu_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_storeu_pd (simde_float64 mem_addr[4], simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - _mm256_storeu_pd(mem_addr, a); - #else - simde_memcpy(mem_addr, &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_storeu_pd - #define _mm256_storeu_pd(mem_addr, a) simde_mm256_storeu_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_storeu_si256 (void* mem_addr, simde__m256i a) { - #if 
defined(SIMDE_X86_AVX_NATIVE) - _mm256_storeu_si256(SIMDE_ALIGN_CAST(__m256i*, mem_addr), a); - #else - simde_memcpy(mem_addr, &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_storeu_si256 - #define _mm256_storeu_si256(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_storeu2_m128 (simde_float32 hi_addr[4], simde_float32 lo_addr[4], simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS) - _mm256_storeu2_m128(hi_addr, lo_addr, a); - #else - simde_mm_storeu_ps(lo_addr, simde_mm256_castps256_ps128(a)); - simde_mm_storeu_ps(hi_addr, simde_mm256_extractf128_ps(a, 1)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_storeu2_m128 - #define _mm256_storeu2_m128(hi_addr, lo_addr, a) simde_mm256_storeu2_m128(hi_addr, lo_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_storeu2_m128d (simde_float64 hi_addr[2], simde_float64 lo_addr[2], simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS) - _mm256_storeu2_m128d(hi_addr, lo_addr, a); - #else - simde_mm_storeu_pd(lo_addr, simde_mm256_castpd256_pd128(a)); - simde_mm_storeu_pd(hi_addr, simde_mm256_extractf128_pd(a, 1)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_storeu2_m128d - #define _mm256_storeu2_m128d(hi_addr, lo_addr, a) simde_mm256_storeu2_m128d(hi_addr, lo_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_storeu2_m128i (simde__m128i* hi_addr, simde__m128i* lo_addr, simde__m256i a) { - #if defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_91341) && !defined(SIMDE_BUG_MCST_LCC_MISSING_AVX_LOAD_STORE_M128_FUNCS) - _mm256_storeu2_m128i(hi_addr, lo_addr, a); - #else - simde_mm_storeu_si128(lo_addr, simde_mm256_castsi256_si128(a)); - simde_mm_storeu_si128(hi_addr, simde_mm256_extractf128_si256(a, 1)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_storeu2_m128i - #define _mm256_storeu2_m128i(hi_addr, lo_addr, a) simde_mm256_storeu2_m128i(hi_addr, lo_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_stream_ps (simde_float32 mem_addr[8], simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - _mm256_stream_ps(mem_addr, a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) - __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256), &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_stream_ps - #define _mm256_stream_ps(mem_addr, a) simde_mm256_stream_ps(HEDLEY_REINTERPRET_CAST(float*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_stream_pd (simde_float64 mem_addr[4], simde__m256d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - _mm256_stream_pd(mem_addr, a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) - __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256d), &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_stream_pd - #define _mm256_stream_pd(mem_addr, a) simde_mm256_stream_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_stream_si256 (simde__m256i* mem_addr, simde__m256i a) { - #if defined(SIMDE_X86_AVX_NATIVE) - _mm256_stream_si256(mem_addr, a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) - __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_stream_si256 - #define _mm256_stream_si256(mem_addr, a) simde_mm256_stream_si256(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_sub_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_sub_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_sub_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_mm_sub_ps(a_.m128[1], b_.m128[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 - b_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i] - b_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_sub_ps - #define _mm256_sub_ps(a, b) simde_mm256_sub_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_hsub_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_hsub_ps(a, b); - #else - return simde_mm256_sub_ps(simde_x_mm256_deinterleaveeven_ps(a, b), simde_x_mm256_deinterleaveodd_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_hsub_ps - #define _mm256_hsub_ps(a, b) simde_mm256_hsub_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_sub_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_sub_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_sub_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = simde_mm_sub_pd(a_.m128d[1], b_.m128d[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 - b_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[i] - b_.f64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_sub_pd - #define _mm256_sub_pd(a, b) simde_mm256_sub_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_hsub_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_hsub_pd(a, b); - #else - return simde_mm256_sub_pd(simde_x_mm256_deinterleaveeven_pd(a, b), simde_x_mm256_deinterleaveodd_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_hsub_pd - #define _mm256_hsub_pd(a, b) simde_mm256_hsub_pd(a, b) -#endif - -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_undefined_ps (void) { - simde__m256_private r_; - -#if \ - defined(SIMDE_X86_AVX_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(5,0,0)) && \ - (!defined(__has_builtin) || 
HEDLEY_HAS_BUILTIN(__builtin_ia32_undef256)) - r_.n = _mm256_undefined_ps(); -#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - r_ = simde__m256_to_private(simde_mm256_setzero_ps()); -#endif - - return simde__m256_from_private(r_); -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_undefined_ps - #define _mm256_undefined_ps() simde_mm256_undefined_ps() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_undefined_pd (void) { - simde__m256d_private r_; - -#if \ - defined(SIMDE_X86_AVX_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(5,0,0)) && \ - (!defined(__has_builtin) || HEDLEY_HAS_BUILTIN(__builtin_ia32_undef256)) - r_.n = _mm256_undefined_pd(); -#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - r_ = simde__m256d_to_private(simde_mm256_setzero_pd()); -#endif - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_undefined_pd - #define _mm256_undefined_pd() simde_mm256_undefined_pd() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_undefined_si256 (void) { - simde__m256i_private r_; -#if \ - defined(SIMDE_X86_AVX_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(5,0,0)) && \ - (!defined(__has_builtin) || HEDLEY_HAS_BUILTIN(__builtin_ia32_undef256)) - r_.n = _mm256_undefined_si256(); -#elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - r_ = simde__m256i_to_private(simde_mm256_setzero_si256()); -#endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_undefined_si256 - #define _mm256_undefined_si256() simde_mm256_undefined_si256() -#endif - -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - HEDLEY_DIAGNOSTIC_POP -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_xor_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_xor_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128[0] = simde_mm_xor_ps(a_.m128[0], b_.m128[0]); - r_.m128[1] = simde_mm_xor_ps(a_.m128[1], b_.m128[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] ^ b_.u32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_xor_ps - #define _mm256_xor_ps(a, b) simde_mm256_xor_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_xor_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_xor_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128d[0] = simde_mm_xor_pd(a_.m128d[0], b_.m128d[0]); - r_.m128d[1] = simde_mm_xor_pd(a_.m128d[1], b_.m128d[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] ^ b_.u64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_xor_pd - #define _mm256_xor_pd(a, b) simde_mm256_xor_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_x_mm256_xorsign_ps(simde__m256 dest, 
simde__m256 src) { - return simde_mm256_xor_ps(simde_mm256_and_ps(simde_mm256_set1_ps(-0.0f), src), dest); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_x_mm256_xorsign_pd(simde__m256d dest, simde__m256d src) { - return simde_mm256_xor_pd(simde_mm256_and_pd(simde_mm256_set1_pd(-0.0), src), dest); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_x_mm256_negate_ps(simde__m256 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return simde_mm256_xor_ps(a,_mm256_set1_ps(SIMDE_FLOAT32_C(-0.0))); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if defined(SIMDE_VECTOR_NEGATE) - r_.f32 = -a_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = -a_.f32[i]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_x_mm256_negate_pd(simde__m256d a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return simde_mm256_xor_pd(a, _mm256_set1_pd(SIMDE_FLOAT64_C(-0.0))); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if defined(SIMDE_VECTOR_NEGATE) - r_.f64 = -a_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = -a_.f64[i]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_unpackhi_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_unpackhi_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, b_.f32, 2, 10, 3, 11, 6, 14, 7, 15); - #else - r_.f32[0] = a_.f32[2]; - r_.f32[1] = b_.f32[2]; - r_.f32[2] = a_.f32[3]; - r_.f32[3] = b_.f32[3]; - r_.f32[4] = a_.f32[6]; - r_.f32[5] = b_.f32[6]; - r_.f32[6] = a_.f32[7]; - r_.f32[7] = b_.f32[7]; - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpackhi_ps - #define _mm256_unpackhi_ps(a, b) simde_mm256_unpackhi_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_unpackhi_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_unpackhi_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, b_.f64, 1, 5, 3, 7); - #else - r_.f64[0] = a_.f64[1]; - r_.f64[1] = b_.f64[1]; - r_.f64[2] = a_.f64[3]; - r_.f64[3] = b_.f64[3]; - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpackhi_pd - #define _mm256_unpackhi_pd(a, b) simde_mm256_unpackhi_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_unpacklo_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_unpacklo_ps(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.f32, b_.f32, 0, 8, 1, 9, 4, 12, 5, 13); - #else - r_.f32[0] = a_.f32[0]; - r_.f32[1] = b_.f32[0]; - r_.f32[2] = a_.f32[1]; - r_.f32[3] = b_.f32[1]; - r_.f32[4] = a_.f32[4]; - r_.f32[5] = b_.f32[4]; - r_.f32[6] = a_.f32[5]; - r_.f32[7] = b_.f32[5]; - #endif - - return simde__m256_from_private(r_); - #endif -} -#if 
defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpacklo_ps - #define _mm256_unpacklo_ps(a, b) simde_mm256_unpacklo_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_unpacklo_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_unpacklo_pd(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.f64, b_.f64, 0, 4, 2, 6); - #else - r_.f64[0] = a_.f64[0]; - r_.f64[1] = b_.f64[0]; - r_.f64[2] = a_.f64[2]; - r_.f64[3] = b_.f64[2]; - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpacklo_pd - #define _mm256_unpacklo_pd(a, b) simde_mm256_unpacklo_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_zextps128_ps256 (simde__m128 a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_insertf128_ps(_mm256_setzero_ps(), a, 0); - #else - simde__m256_private r_; - - r_.m128_private[0] = simde__m128_to_private(a); - r_.m128_private[1] = simde__m128_to_private(simde_mm_setzero_ps()); - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_zextps128_ps256 - #define _mm256_zextps128_ps256(a) simde_mm256_zextps128_ps256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_zextpd128_pd256 (simde__m128d a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_insertf128_pd(_mm256_setzero_pd(), a, 0); - #else - simde__m256d_private r_; - - r_.m128d_private[0] = simde__m128d_to_private(a); - r_.m128d_private[1] = simde__m128d_to_private(simde_mm_setzero_pd()); - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_zextpd128_pd256 - #define _mm256_zextpd128_pd256(a) simde_mm256_zextpd128_pd256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_zextsi128_si256 (simde__m128i a) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_insertf128_si256(_mm256_setzero_si256(), a, 0); - #else - simde__m256i_private r_; - - r_.m128i_private[0] = simde__m128i_to_private(a); - r_.m128i_private[1] = simde__m128i_to_private(simde_mm_setzero_si128()); - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_zextsi128_si256 - #define _mm256_zextsi128_si256(a) simde_mm256_zextsi128_si256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_testc_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm_testc_ps(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_u32x4_shr(wasm_v128_or(wasm_v128_not(b_.wasm_v128), a_.wasm_v128), 31); - m = wasm_v128_and(m, simde_mm_movehl_ps(m, m)); - m = wasm_v128_and(m, simde_mm_shuffle_epi32(m, SIMDE_MM_SHUFFLE(3, 2, 0, 1))); - return wasm_i32x4_extract_lane(m, 0); - #else - uint_fast32_t r = 0; - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r |= ~a_.u32[i] & b_.u32[i]; - } - - return HEDLEY_STATIC_CAST(int, ((~r >> 31) & 1)); - #endif - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_testc_ps - #define _mm_testc_ps(a, b) simde_mm_testc_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_testc_pd (simde__m128d a, simde__m128d b) { - 
#if defined(SIMDE_X86_AVX_NATIVE) - return _mm_testc_pd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_u64x2_shr(wasm_v128_or(wasm_v128_not(b_.wasm_v128), a_.wasm_v128), 63); - return HEDLEY_STATIC_CAST(int, wasm_i64x2_extract_lane(m, 0) & wasm_i64x2_extract_lane(m, 1)); - #else - uint_fast64_t r = 0; - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r |= ~a_.u64[i] & b_.u64[i]; - } - - return HEDLEY_STATIC_CAST(int, ((~r >> 63) & 1)); - #endif - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_testc_pd - #define _mm_testc_pd(a, b) simde_mm_testc_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_testc_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_testc_ps(a, b); - #else - uint_fast32_t r = 0; - simde__m256_private - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r |= ~a_.u32[i] & b_.u32[i]; - } - - return HEDLEY_STATIC_CAST(int, ((~r >> 31) & 1)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_testc_ps - #define _mm256_testc_ps(a, b) simde_mm256_testc_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_testc_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_testc_pd(a, b); - #else - uint_fast64_t r = 0; - simde__m256d_private - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r |= ~a_.u64[i] & b_.u64[i]; - } - - return HEDLEY_STATIC_CAST(int, ((~r >> 63) & 1)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_testc_pd - #define _mm256_testc_pd(a, b) simde_mm256_testc_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_testc_si256 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_testc_si256(a, b); - #else - int_fast32_t r = 0; - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { - r |= ~a_.i32f[i] & b_.i32f[i]; - } - - return HEDLEY_STATIC_CAST(int, !r); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_testc_si256 - #define _mm256_testc_si256(a, b) simde_mm256_testc_si256(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_testz_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm_testz_ps(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_u32x4_shr(wasm_v128_not(wasm_v128_and(a_.wasm_v128, b_.wasm_v128)), 31); - m = wasm_v128_and(m, simde_mm_movehl_ps(m, m)); - m = wasm_v128_and(m, simde_mm_shuffle_epi32(m, SIMDE_MM_SHUFFLE(3, 2, 0, 1))); - return wasm_i32x4_extract_lane(m, 0); - #else - uint_fast32_t r = 0; - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r |= a_.u32[i] & b_.u32[i]; - } - - return HEDLEY_STATIC_CAST(int, ((~r >> 31) & 1)); - #endif - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_testz_ps - #define 
_mm_testz_ps(a, b) simde_mm_testz_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_testz_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm_testz_pd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_u64x2_shr(wasm_v128_not(wasm_v128_and(a_.wasm_v128, b_.wasm_v128)), 63); - return HEDLEY_STATIC_CAST(int, wasm_i64x2_extract_lane(m, 0) & wasm_i64x2_extract_lane(m, 1)); - #else - uint_fast64_t r = 0; - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r |= a_.u64[i] & b_.u64[i]; - } - - return HEDLEY_STATIC_CAST(int, ((~r >> 63) & 1)); - #endif - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_testz_pd - #define _mm_testz_pd(a, b) simde_mm_testz_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_testz_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_testz_ps(a, b); - #else - uint_fast32_t r = 0; - simde__m256_private - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r |= a_.u32[i] & b_.u32[i]; - } - - return HEDLEY_STATIC_CAST(int, ((~r >> 31) & 1)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_testz_ps - #define _mm256_testz_ps(a, b) simde_mm256_testz_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_testz_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_testz_pd(a, b); - #else - uint_fast64_t r = 0; - simde__m256d_private - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r |= a_.u64[i] & b_.u64[i]; - } - - return HEDLEY_STATIC_CAST(int, ((~r >> 63) & 1)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_testz_pd - #define _mm256_testz_pd(a, b) simde_mm256_testz_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_testz_si256 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_testz_si256(a, b); - #else - int_fast32_t r = 0; - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r = simde_mm_testz_si128(a_.m128i[0], b_.m128i[0]) && simde_mm_testz_si128(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { - r |= a_.i32f[i] & b_.i32f[i]; - } - - r = !r; - #endif - - return HEDLEY_STATIC_CAST(int, r); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_testz_si256 - #define _mm256_testz_si256(a, b) simde_mm256_testz_si256(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_testnzc_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm_testnzc_ps(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_u32x4_shr(wasm_v128_and(a_.wasm_v128, b_.wasm_v128), 31); - v128_t m2 = wasm_u32x4_shr(wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128), 31); - m = wasm_v128_or(m, simde_mm_movehl_ps(m, m)); - m2 = wasm_v128_or(m2, simde_mm_movehl_ps(m2, m2)); - m = 
wasm_v128_or(m, simde_mm_shuffle_epi32(m, SIMDE_MM_SHUFFLE(3, 2, 0, 1))); - m2 = wasm_v128_or(m2, simde_mm_shuffle_epi32(m2, SIMDE_MM_SHUFFLE(3, 2, 0, 1))); - return wasm_i32x4_extract_lane(m, 0) & wasm_i32x4_extract_lane(m2, 0); - #else - uint32_t rz = 0, rc = 0; - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - rc |= ~a_.u32[i] & b_.u32[i]; - rz |= a_.u32[i] & b_.u32[i]; - } - - return - (rc >> ((sizeof(rc) * CHAR_BIT) - 1)) & - (rz >> ((sizeof(rz) * CHAR_BIT) - 1)); - #endif - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_testnzc_ps - #define _mm_testnzc_ps(a, b) simde_mm_testnzc_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_testnzc_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm_testnzc_pd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - #if defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_u64x2_shr(wasm_v128_and(a_.wasm_v128, b_.wasm_v128), 63); - v128_t m2 = wasm_u64x2_shr(wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128), 63); - return HEDLEY_STATIC_CAST(int, (wasm_i64x2_extract_lane(m, 0) | wasm_i64x2_extract_lane(m, 1)) - & (wasm_i64x2_extract_lane(m2, 0) | wasm_i64x2_extract_lane(m2, 1))); - #else - uint64_t rc = 0, rz = 0; - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - rc |= ~a_.u64[i] & b_.u64[i]; - rz |= a_.u64[i] & b_.u64[i]; - } - - return - (rc >> ((sizeof(rc) * CHAR_BIT) - 1)) & - (rz >> ((sizeof(rz) * CHAR_BIT) - 1)); - #endif - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm_testnzc_pd - #define _mm_testnzc_pd(a, b) simde_mm_testnzc_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_testnzc_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_testnzc_ps(a, b); - #else - uint32_t rc = 0, rz = 0; - simde__m256_private - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - rc |= ~a_.u32[i] & b_.u32[i]; - rz |= a_.u32[i] & b_.u32[i]; - } - - return - (rc >> ((sizeof(rc) * CHAR_BIT) - 1)) & - (rz >> ((sizeof(rz) * CHAR_BIT) - 1)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_testnzc_ps - #define _mm256_testnzc_ps(a, b) simde_mm256_testnzc_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_testnzc_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_testnzc_pd(a, b); - #else - uint64_t rc = 0, rz = 0; - simde__m256d_private - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - rc |= ~a_.u64[i] & b_.u64[i]; - rz |= a_.u64[i] & b_.u64[i]; - } - - return - (rc >> ((sizeof(rc) * CHAR_BIT) - 1)) & - (rz >> ((sizeof(rz) * CHAR_BIT) - 1)); - #endif -} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_testnzc_pd - #define _mm256_testnzc_pd(a, b) simde_mm256_testnzc_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_testnzc_si256 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX_NATIVE) - return _mm256_testnzc_si256(a, b); - #else - int32_t rc = 0, rz = 0; - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { - rc |= ~a_.i32f[i] & b_.i32f[i]; - rz |= a_.i32f[i] & b_.i32f[i]; - } - - return !!(rc & rz); - #endif 
-} -#if defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_testnzc_si256 - #define _mm256_testnzc_si256(a, b) simde_mm256_testnzc_si256(a, b) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX_H) */ diff --git a/ffi-deps/simde/simde/x86/avx2.h b/ffi-deps/simde/simde/x86/avx2.h deleted file mode 100644 index a8d3808..0000000 --- a/ffi-deps/simde/simde/x86/avx2.h +++ /dev/null @@ -1,5758 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2018-2020 Evan Nemerson - * 2019-2020 Michael R. Crusoe - * 2020 Himanshi Mathur - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX2_H) -#define SIMDE_X86_AVX2_H - -#include "avx.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_abs_epi8 (simde__m256i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_abs_epi8(a); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_abs_epi8(a_.m128i[0]); - r_.m128i[1] = simde_mm_abs_epi8(a_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] < INT32_C(0)) ? -a_.i8[i] : a_.i8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_abs_epi8 - #define _mm256_abs_epi8(a) simde_mm256_abs_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_abs_epi16 (simde__m256i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_abs_epi16(a); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_abs_epi16(a_.m128i[0]); - r_.m128i[1] = simde_mm_abs_epi16(a_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] < INT32_C(0)) ? 
-a_.i16[i] : a_.i16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_abs_epi16 - #define _mm256_abs_epi16(a) simde_mm256_abs_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_abs_epi32(simde__m256i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_abs_epi32(a); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_abs_epi32(a_.m128i[0]); - r_.m128i[1] = simde_mm_abs_epi32(a_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { - r_.i32[i] = (a_.i32[i] < INT32_C(0)) ? -a_.i32[i] : a_.i32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_abs_epi32 - #define _mm256_abs_epi32(a) simde_mm256_abs_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_add_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_add_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_add_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_add_epi8(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = a_.i8 + b_.i8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] + b_.i8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_add_epi8 - #define _mm256_add_epi8(a, b) simde_mm256_add_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_add_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_add_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_add_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_add_epi16(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 + b_.i16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] + b_.i16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_add_epi16 - #define _mm256_add_epi16(a, b) simde_mm256_add_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_hadd_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_hadd_epi16(a, b); - #else - return simde_mm256_add_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_hadd_epi16 - #define _mm256_hadd_epi16(a, b) simde_mm256_hadd_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_add_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_add_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_add_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = 
simde_mm_add_epi32(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 + b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] + b_.i32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_add_epi32 - #define _mm256_add_epi32(a, b) simde_mm256_add_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_hadd_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_hadd_epi32(a, b); - #else - return simde_mm256_add_epi32(simde_x_mm256_deinterleaveeven_epi32(a, b), simde_x_mm256_deinterleaveodd_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_hadd_epi32 - #define _mm256_hadd_epi32(a, b) simde_mm256_hadd_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_add_epi64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_add_epi64(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_add_epi64(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_add_epi64(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_CLANG_BAD_VI64_OPS) - r_.i64 = a_.i64 + b_.i64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] + b_.i64[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_add_epi64 - #define _mm256_add_epi64(a, b) simde_mm256_add_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_alignr_epi8 (simde__m256i a, simde__m256i b, int count) - SIMDE_REQUIRE_CONSTANT_RANGE(count, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - if (HEDLEY_UNLIKELY(count > 31)) - return simde_mm256_setzero_si256(); - - for (size_t h = 0 ; h < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; h++) { - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) { - const int srcpos = count + HEDLEY_STATIC_CAST(int, i); - if (srcpos > 31) { - r_.m128i_private[h].i8[i] = 0; - } else if (srcpos > 15) { - r_.m128i_private[h].i8[i] = a_.m128i_private[h].i8[(srcpos) & 15]; - } else { - r_.m128i_private[h].i8[i] = b_.m128i_private[h].i8[srcpos]; - } - } - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_BUG_PGI_30106) -# define simde_mm256_alignr_epi8(a, b, count) _mm256_alignr_epi8(a, b, count) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_alignr_epi8(a, b, count) \ - simde_mm256_set_m128i( \ - simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (count)), \ - simde_mm_alignr_epi8(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (count))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_alignr_epi8 - #define _mm256_alignr_epi8(a, b, count) simde_mm256_alignr_epi8(a, b, (count)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_and_si256 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_and_si256(a, b); - #else - 
simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_and_si128(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_and_si128(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f & b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] & b_.i64[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_and_si256 - #define _mm256_and_si256(a, b) simde_mm256_and_si256(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_andnot_si256 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_andnot_si256(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_andnot_si128(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_andnot_si128(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_andnot_si256 - #define _mm256_andnot_si256(a, b) simde_mm256_andnot_si256(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_adds_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_adds_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_adds_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_adds_epi8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = simde_math_adds_i8(a_.i8[i], b_.i8[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_adds_epi8 - #define _mm256_adds_epi8(a, b) simde_mm256_adds_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_adds_epi16(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_adds_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_adds_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_adds_epi16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = simde_math_adds_i16(a_.i16[i], b_.i16[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_adds_epi16 - #define _mm256_adds_epi16(a, b) simde_mm256_adds_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_hadds_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_hadds_epi16(a, b); - #else - return simde_mm256_adds_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_hadds_epi16 - #define 
_mm256_hadds_epi16(a, b) simde_mm256_hadds_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_adds_epu8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_adds_epu8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_adds_epu8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_adds_epu8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = simde_math_adds_u8(a_.u8[i], b_.u8[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_adds_epu8 - #define _mm256_adds_epu8(a, b) simde_mm256_adds_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_adds_epu16(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_adds_epu16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_adds_epu16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_adds_epu16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = simde_math_adds_u16(a_.u16[i], b_.u16[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_adds_epu16 - #define _mm256_adds_epu16(a, b) simde_mm256_adds_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_avg_epu8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_avg_epu8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_avg_epu8 - #define _mm256_avg_epu8(a, b) simde_mm256_avg_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_avg_epu16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_avg_epu16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_avg_epu16 - #define _mm256_avg_epu16(a, b) simde_mm256_avg_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_blend_epi32(simde__m128i a, simde__m128i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ((imm8 >> i) & 1) ? 
b_.i32[i] : a_.i32[i]; - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm_blend_epi32(a, b, imm8) _mm_blend_epi32(a, b, imm8) -#elif SIMDE_NATURAL_FLOAT_VECTOR_SIZE_LE(128) -# define simde_mm_blend_epi32(a, b, imm8) \ - simde_mm_castps_si128(simde_mm_blend_ps(simde_mm_castsi128_ps(a), simde_mm_castsi128_ps(b), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_blend_epi32 - #define _mm_blend_epi32(a, b, imm8) simde_mm_blend_epi32(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_blend_epi16(simde__m256i a, simde__m256i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = ((imm8 >> i%8) & 1) ? b_.i16[i] : a_.i16[i]; - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_BUG_CLANG_REV_234560) -# define simde_mm256_blend_epi16(a, b, imm8) _mm256_castpd_si256(_mm256_blend_epi16(a, b, imm8)) -#elif defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_blend_epi16(a, b, imm8) _mm256_blend_epi16(a, b, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_blend_epi16(a, b, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_blend_epi16(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8)), \ - simde_mm_blend_epi16(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_blend_epi16 - #define _mm256_blend_epi16(a, b, imm8) simde_mm256_blend_epi16(a, b, imm8) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_blend_epi32(simde__m256i a, simde__m256i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ((imm8 >> i) & 1) ? 
b_.i32[i] : a_.i32[i]; - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_blend_epi32(a, b, imm8) _mm256_blend_epi32(a, b, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_blend_epi32(a, b, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_blend_epi32(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8) >> 4), \ - simde_mm_blend_epi32(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (imm8) & 0x0F)) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_blend_epi32 - #define _mm256_blend_epi32(a, b, imm8) simde_mm256_blend_epi32(a, b, imm8) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_blendv_epi8(simde__m256i a, simde__m256i b, simde__m256i mask) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_blendv_epi8(a, b, mask); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b), - mask_ = simde__m256i_to_private(mask); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_blendv_epi8(a_.m128i[0], b_.m128i[0], mask_.m128i[0]); - r_.m128i[1] = simde_mm_blendv_epi8(a_.m128i[1], b_.m128i[1], mask_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - __typeof__(mask_.i8) tmp = mask_.i8 >> 7; - r_.i8 = (tmp & b_.i8) | (~tmp & a_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - int8_t tmp = mask_.i8[i] >> 7; - r_.i8[i] = (tmp & b_.i8[i]) | (~tmp & a_.i8[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_blendv_epi8(a, b, imm8) _mm256_blendv_epi8(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_blendv_epi8 - #define _mm256_blendv_epi8(a, b, mask) simde_mm256_blendv_epi8(a, b, mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_broadcastb_epi8 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm_broadcastb_epi8(a); - #else - simde__m128i_private r_; - simde__m128i_private a_= simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[0]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_broadcastb_epi8 - #define _mm_broadcastb_epi8(a) simde_mm_broadcastb_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_broadcastb_epi8 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_broadcastb_epi8(a); - #else - simde__m256i_private r_; - simde__m128i_private a_= simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[0]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcastb_epi8 - #define _mm256_broadcastb_epi8(a) simde_mm256_broadcastb_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_broadcastw_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm_broadcastw_epi16(a); - #else - simde__m128i_private r_; - simde__m128i_private a_= simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[0]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - 
#undef _mm_broadcastw_epi16 - #define _mm_broadcastw_epi16(a) simde_mm_broadcastw_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_broadcastw_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_broadcastw_epi16(a); - #else - simde__m256i_private r_; - simde__m128i_private a_= simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[0]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcastw_epi16 - #define _mm256_broadcastw_epi16(a) simde_mm256_broadcastw_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_broadcastd_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm_broadcastd_epi32(a); - #else - simde__m128i_private r_; - simde__m128i_private a_= simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[0]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_broadcastd_epi32 - #define _mm_broadcastd_epi32(a) simde_mm_broadcastd_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_broadcastd_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_broadcastd_epi32(a); - #else - simde__m256i_private r_; - simde__m128i_private a_= simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[0]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcastd_epi32 - #define _mm256_broadcastd_epi32(a) simde_mm256_broadcastd_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_broadcastq_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm_broadcastq_epi64(a); - #else - simde__m128i_private r_; - simde__m128i_private a_= simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[0]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_broadcastq_epi64 - #define _mm_broadcastq_epi64(a) simde_mm_broadcastq_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_broadcastq_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_broadcastq_epi64(a); - #else - simde__m256i_private r_; - simde__m128i_private a_= simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[0]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcastq_epi64 - #define _mm256_broadcastq_epi64(a) simde_mm256_broadcastq_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_broadcastss_ps (simde__m128 a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm_broadcastss_ps(a); - #elif defined(SIMDE_X86_SSE_NATIVE) - return simde_mm_shuffle_ps(a, a, 0); - #else - simde__m128_private r_; - simde__m128_private a_= simde__m128_to_private(a); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 0, 0); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { 
- r_.f32[i] = a_.f32[0]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_broadcastss_ps - #define _mm_broadcastss_ps(a) simde_mm_broadcastss_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_broadcastss_ps (simde__m128 a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_broadcastss_ps(a); - #else - simde__m256_private r_; - simde__m128_private a_= simde__m128_to_private(a); - - #if defined(SIMDE_X86_AVX_NATIVE) - __m128 tmp = _mm_permute_ps(a_.n, 0); - r_.n = _mm256_insertf128_ps(_mm256_castps128_ps256(tmp), tmp, 1); - #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) - r_.f32 = __builtin_shufflevector(a_.f32, a_.f32, 0, 0, 0, 0, 0, 0, 0, 0); - #elif SIMDE_NATURAL_FLOAT_VECTOR_SIZE_LE(128) - r_.m128[0] = r_.m128[1] = simde_mm_broadcastss_ps(simde__m128_from_private(a_)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[0]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcastss_ps - #define _mm256_broadcastss_ps(a) simde_mm256_broadcastss_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_broadcastsd_pd (simde__m128d a) { - return simde_mm_movedup_pd(a); -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_broadcastsd_pd - #define _mm_broadcastsd_pd(a) simde_mm_broadcastsd_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_broadcastsd_pd (simde__m128d a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_broadcastsd_pd(a); - #else - simde__m256d_private r_; - simde__m128d_private a_= simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[0]; - } - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcastsd_pd - #define _mm256_broadcastsd_pd(a) simde_mm256_broadcastsd_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_broadcastsi128_si256 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0)) - return _mm256_broadcastsi128_si256(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i_private[0] = a_; - r_.m128i_private[1] = a_; - #else - r_.i64[0] = a_.i64[0]; - r_.i64[1] = a_.i64[1]; - r_.i64[2] = a_.i64[0]; - r_.i64[3] = a_.i64[1]; - #endif - - return simde__m256i_from_private(r_); - #endif -} -#define simde_mm_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a) -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcastsi128_si256 - #define _mm256_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a) - #undef _mm_broadcastsi128_si256 - #define _mm_broadcastsi128_si256(a) simde_mm256_broadcastsi128_si256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_bslli_epi128 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - const int ssize = HEDLEY_STATIC_CAST(int, (sizeof(r_.i8) / sizeof(r_.i8[0]))); - - SIMDE_VECTORIZE - for (int i = 0 ; i < ssize ; i++) { - const int e = i - imm8; - if(i >= (ssize/2)) { - if(e >= (ssize/2) && e < ssize) - r_.i8[i] = a_.i8[e]; - else - r_.i8[i] = 0; - } - else{ - if(e 
>= 0 && e < (ssize/2)) - r_.i8[i] = a_.i8[e]; - else - r_.i8[i] = 0; - } - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0)) && \ - SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0) - #define simde_mm256_bslli_epi128(a, imm8) _mm256_bslli_epi128(a, imm8) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_bslli_epi128 - #define _mm256_bslli_epi128(a, imm8) simde_mm256_bslli_epi128(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_bsrli_epi128 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - const int ssize = HEDLEY_STATIC_CAST(int, (sizeof(r_.i8) / sizeof(r_.i8[0]))); - - SIMDE_VECTORIZE - for (int i = 0 ; i < ssize ; i++) { - const int e = i + imm8; - if(i < (ssize/2)) { - if(e >= 0 && e < (ssize/2)) - r_.i8[i] = a_.i8[e]; - else - r_.i8[i] = 0; - } - else{ - if(e >= (ssize/2) && e < ssize) - r_.i8[i] = a_.i8[e]; - else - r_.i8[i] = 0; - } - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,8,0)) && \ - SIMDE_DETECT_CLANG_VERSION_CHECK(3,7,0) - #define simde_mm256_bsrli_epi128(a, imm8) _mm256_bsrli_epi128(a, imm8) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_bsrli_epi128 - #define _mm256_bsrli_epi128(a, imm8) simde_mm256_bsrli_epi128(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cmpeq_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cmpeq_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_cmpeq_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_cmpeq_epi8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpeq_epi8 - #define _mm256_cmpeq_epi8(a, b) simde_mm256_cmpeq_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cmpeq_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cmpeq_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_cmpeq_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_cmpeq_epi16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpeq_epi16 - #define _mm256_cmpeq_epi16(a, b) simde_mm256_cmpeq_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cmpeq_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cmpeq_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_cmpeq_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_cmpeq_epi32(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpeq_epi32 - #define _mm256_cmpeq_epi32(a, b) simde_mm256_cmpeq_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cmpeq_epi64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cmpeq_epi64(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_cmpeq_epi64(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_cmpeq_epi64(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] == b_.i64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpeq_epi64 - #define _mm256_cmpeq_epi64(a, b) simde_mm256_cmpeq_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cmpgt_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cmpgt_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_cmpgt_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_cmpgt_epi8(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), a_.i8 > b_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpgt_epi8 - #define _mm256_cmpgt_epi8(a, b) simde_mm256_cmpgt_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cmpgt_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cmpgt_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_cmpgt_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_cmpgt_epi16(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 > b_.i16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpgt_epi16 - #define _mm256_cmpgt_epi16(a, b) simde_mm256_cmpgt_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cmpgt_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cmpgt_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_cmpgt_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_cmpgt_epi32(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 > b_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpgt_epi32 - #define _mm256_cmpgt_epi32(a, b) simde_mm256_cmpgt_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cmpgt_epi64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cmpgt_epi64(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_cmpgt_epi64(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_cmpgt_epi64(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 > b_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] > b_.i64[i]) ? 
~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpgt_epi64 - #define _mm256_cmpgt_epi64(a, b) simde_mm256_cmpgt_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepi8_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepi8_epi16(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i16, a_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepi8_epi16 - #define _mm256_cvtepi8_epi16(a) simde_mm256_cvtepi8_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepi8_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepi8_epi32(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepi8_epi32 - #define _mm256_cvtepi8_epi32(a) simde_mm256_cvtepi8_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepi8_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepi8_epi64(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i8[i]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepi8_epi64 - #define _mm256_cvtepi8_epi64(a) simde_mm256_cvtepi8_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepi16_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepi16_epi32(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i32, a_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepi16_epi32 - #define _mm256_cvtepi16_epi32(a) simde_mm256_cvtepi16_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepi16_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepi16_epi64(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepi16_epi64 - #define _mm256_cvtepi16_epi64(a) simde_mm256_cvtepi16_epi64(a) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepi32_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepi32_epi64(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i64, a_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepi32_epi64 - #define _mm256_cvtepi32_epi64(a) simde_mm256_cvtepi32_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepu8_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepu8_epi16(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i16, a_.u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.u8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepu8_epi16 - #define _mm256_cvtepu8_epi16(a) simde_mm256_cvtepu8_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepu8_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepu8_epi32(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.u8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepu8_epi32 - #define _mm256_cvtepu8_epi32(a) simde_mm256_cvtepu8_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepu8_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepu8_epi64(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.u8[i]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepu8_epi64 - #define _mm256_cvtepu8_epi64(a) simde_mm256_cvtepu8_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepu16_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepu16_epi32(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i32, a_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.u16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepu16_epi32 - #define _mm256_cvtepu16_epi32(a) simde_mm256_cvtepu16_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepu16_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepu16_epi64(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if 
defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.u16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepu16_epi64 - #define _mm256_cvtepu16_epi64(a) simde_mm256_cvtepu16_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cvtepu32_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_cvtepu32_epi64(a); - #else - simde__m256i_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i64, a_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.u32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtepu32_epi64 - #define _mm256_cvtepu32_epi64(a) simde_mm256_cvtepu32_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_extract_epi8 (simde__m256i a, const int index) - SIMDE_REQUIRE_RANGE(index, 0, 31){ - simde__m256i_private a_ = simde__m256i_to_private(a); - return a_.i8[index]; -} -#if defined(SIMDE_X86_AVX2_NATIVE) && \ - (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) - #define simde_mm256_extract_epi8(a, index) _mm256_extract_epi8(a, index) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_extract_epi8 - #define _mm256_extract_epi8(a, index) simde_mm256_extract_epi8(a, index) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm256_extract_epi16 (simde__m256i a, const int index) - SIMDE_REQUIRE_RANGE(index, 0, 15) { - simde__m256i_private a_ = simde__m256i_to_private(a); - return a_.i16[index]; -} -#if defined(SIMDE_X86_AVX2_NATIVE) && \ - (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,10,0)) - #define simde_mm256_extract_epi16(a, index) _mm256_extract_epi16(a, index) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_extract_epi16 - #define _mm256_extract_epi16(a, index) simde_mm256_extract_epi16(a, index) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm256_extracti128_si256 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m256i_private a_ = simde__m256i_to_private(a); - return a_.m128i[imm8]; -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_extracti128_si256(a, imm8) _mm256_extracti128_si256(a, imm8) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_extracti128_si256 - #define _mm256_extracti128_si256(a, imm8) simde_mm256_extracti128_si256(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_i32gather_epi32(const int32_t* base_addr, simde__m128i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex), - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int32_t dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.i32[i] = dst; - } - - return 
simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_i32gather_epi32(base_addr, vindex, scale) _mm_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_i32gather_epi32 - #define _mm_i32gather_epi32(base_addr, vindex, scale) simde_mm_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_i32gather_epi32(simde__m128i src, const int32_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex), - src_ = simde__m128i_to_private(src), - mask_ = simde__m128i_to_private(mask), - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - if ((mask_.i32[i] >> 31) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int32_t dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.i32[i] = dst; - } - else { - r_.i32[i] = src_.i32[i]; - } - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_i32gather_epi32 - #define _mm_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_i32gather_epi32(const int32_t* base_addr, simde__m256i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex), - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int32_t dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.i32[i] = dst; - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_i32gather_epi32(base_addr, vindex, scale) _mm256_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_i32gather_epi32 - #define _mm256_i32gather_epi32(base_addr, vindex, scale) simde_mm256_i32gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_i32gather_epi32(simde__m256i src, const int32_t* base_addr, simde__m256i vindex, simde__m256i mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 
&& !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex), - src_ = simde__m256i_to_private(src), - mask_ = simde__m256i_to_private(mask), - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - if ((mask_.i32[i] >> 31) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int32_t dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.i32[i] = dst; - } - else { - r_.i32[i] = src_.i32[i]; - } - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_i32gather_epi32 - #define _mm256_mask_i32gather_epi32(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_i64gather_epi32(const int32_t* base_addr, simde__m128i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex), - r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int32_t dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.i32[i] = dst; - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_i64gather_epi32(base_addr, vindex, scale) _mm_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_i64gather_epi32 - #define _mm_i64gather_epi32(base_addr, vindex, scale) simde_mm_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_i64gather_epi32(simde__m128i src, const int32_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex), - src_ = simde__m128i_to_private(src), - mask_ = simde__m128i_to_private(mask), - r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - if ((mask_.i32[i] >> 31) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int32_t dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.i32[i] = dst; - } - else { 
- r_.i32[i] = src_.i32[i]; - } - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_i64gather_epi32 - #define _mm_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm256_i64gather_epi32(const int32_t* base_addr, simde__m256i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex); - simde__m128i_private - r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int32_t dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.i32[i] = dst; - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_i64gather_epi32(base_addr, vindex, scale) _mm256_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_i64gather_epi32 - #define _mm256_i64gather_epi32(base_addr, vindex, scale) simde_mm256_i64gather_epi32(SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm256_mask_i64gather_epi32(simde__m128i src, const int32_t* base_addr, simde__m256i vindex, simde__m128i mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex); - simde__m128i_private - src_ = simde__m128i_to_private(src), - mask_ = simde__m128i_to_private(mask), - r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - if ((mask_.i32[i] >> 31) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int32_t dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.i32[i] = dst; - } - else { - r_.i32[i] = src_.i32[i]; - } - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int const*, int32_t const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_i64gather_epi32 - #define _mm256_mask_i64gather_epi32(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_epi32(src, SIMDE_CHECKED_REINTERPRET_CAST(int32_t const*, int const*, base_addr), 
vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_i32gather_epi64(const int64_t* base_addr, simde__m128i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex), - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int64_t dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.i64[i] = dst; - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm_i32gather_epi64(base_addr, vindex, scale) _mm_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale) - #else - #define simde_mm_i32gather_epi64(base_addr, vindex, scale) _mm_i32gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale) - #endif -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_i32gather_epi64 - #define _mm_i32gather_epi64(base_addr, vindex, scale) simde_mm_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_i32gather_epi64(simde__m128i src, const int64_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex), - src_ = simde__m128i_to_private(src), - mask_ = simde__m128i_to_private(mask), - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - if ((mask_.i64[i] >> 63) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int64_t dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.i64[i] = dst; - } - else { - r_.i64[i] = src_.i64[i]; - } - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale) - #else - #define simde_mm_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale) - #endif -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_i32gather_epi64 - #define _mm_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_i32gather_epi64(const int64_t* base_addr, simde__m128i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - 
simde__m256i_private - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int64_t dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.i64[i] = dst; - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm256_i32gather_epi64(base_addr, vindex, scale) _mm256_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale) - #else - #define simde_mm256_i32gather_epi64(base_addr, vindex, scale) _mm256_i32gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale) - #endif -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_i32gather_epi64 - #define _mm256_i32gather_epi64(base_addr, vindex, scale) simde_mm256_i32gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_i32gather_epi64(simde__m256i src, const int64_t* base_addr, simde__m128i vindex, simde__m256i mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - src_ = simde__m256i_to_private(src), - mask_ = simde__m256i_to_private(mask), - r_; - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - if ((mask_.i64[i] >> 63) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int64_t dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.i64[i] = dst; - } - else { - r_.i64[i] = src_.i64[i]; - } - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm256_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale) - #else - #define simde_mm256_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale) - #endif -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_i32gather_epi64 - #define _mm256_mask_i32gather_epi64(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_i64gather_epi64(const int64_t* base_addr, simde__m128i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex), - r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , 
vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int64_t dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.i64[i] = dst; - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm_i64gather_epi64(base_addr, vindex, scale) _mm_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale) - #else - #define simde_mm_i64gather_epi64(base_addr, vindex, scale) _mm_i64gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale) - #endif -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_i64gather_epi64 - #define _mm_i64gather_epi64(base_addr, vindex, scale) simde_mm_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_i64gather_epi64(simde__m128i src, const int64_t* base_addr, simde__m128i vindex, simde__m128i mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex), - src_ = simde__m128i_to_private(src), - mask_ = simde__m128i_to_private(mask), - r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - if ((mask_.i64[i] >> 63) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int64_t dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.i64[i] = dst; - } - else { - r_.i64[i] = src_.i64[i]; - } - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale) - #else - #define simde_mm_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale) - #endif -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_i64gather_epi64 - #define _mm_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_i64gather_epi64(const int64_t* base_addr, simde__m256i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex), - r_ = simde__m256i_to_private(simde_mm256_setzero_si256()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int64_t dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.i64[i] = dst; - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define 
simde_mm256_i64gather_epi64(base_addr, vindex, scale) _mm256_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale) - #else - #define simde_mm256_i64gather_epi64(base_addr, vindex, scale) _mm256_i64gather_epi64(HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, scale) - #endif -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_i64gather_epi64 - #define _mm256_i64gather_epi64(base_addr, vindex, scale) simde_mm256_i64gather_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_i64gather_epi64(simde__m256i src, const int64_t* base_addr, simde__m256i vindex, simde__m256i mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex), - src_ = simde__m256i_to_private(src), - mask_ = simde__m256i_to_private(mask), - r_ = simde__m256i_to_private(simde_mm256_setzero_si256()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - if ((mask_.i64[i] >> 63) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int64_t dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.i64[i] = dst; - } - else { - r_.i64[i] = src_.i64[i]; - } - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #if SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm256_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale) - #else - #define simde_mm256_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(long long const*, base_addr), vindex, mask, scale) - #endif -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_i64gather_epi64 - #define _mm256_mask_i64gather_epi64(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_epi64(src, HEDLEY_REINTERPRET_CAST(int64_t const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_i32gather_ps(const simde_float32* base_addr, simde__m128i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - simde__m128_private - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float32 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f32[i] = dst; - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_i32gather_ps(base_addr, vindex, scale) _mm_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_i32gather_ps - #define _mm_i32gather_ps(base_addr, vindex, scale) 
simde_mm_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mask_i32gather_ps(simde__m128 src, const simde_float32* base_addr, simde__m128i vindex, simde__m128 mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - simde__m128_private - src_ = simde__m128_to_private(src), - mask_ = simde__m128_to_private(mask), - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - if ((mask_.i32[i] >> 31) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float32 dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.f32[i] = dst; - } - else { - r_.f32[i] = src_.f32[i]; - } - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_mask_i32gather_ps(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_i32gather_ps - #define _mm_mask_i32gather_ps(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_i32gather_ps(const simde_float32* base_addr, simde__m256i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex); - simde__m256_private - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float32 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f32[i] = dst; - } - - return simde__m256_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_i32gather_ps(base_addr, vindex, scale) _mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, (base_addr)), (vindex), (scale)) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_i32gather_ps - #define _mm256_i32gather_ps(base_addr, vindex, scale) simde_mm256_i32gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, (base_addr)), (vindex), (scale)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_mask_i32gather_ps(simde__m256 src, const simde_float32* base_addr, simde__m256i vindex, simde__m256 mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex); - simde__m256_private - src_ = simde__m256_to_private(src), - mask_ = simde__m256_to_private(mask), - r_; - const uint8_t* addr = 
HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - if ((mask_.i32[i] >> 31) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float32 dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.f32[i] = dst; - } - else { - r_.f32[i] = src_.f32[i]; - } - } - - return simde__m256_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_i32gather_ps - #define _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_i64gather_ps(const simde_float32* base_addr, simde__m128i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - simde__m128_private - r_ = simde__m128_to_private(simde_mm_setzero_ps()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float32 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f32[i] = dst; - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_i64gather_ps(base_addr, vindex, scale) _mm_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_i64gather_ps - #define _mm_i64gather_ps(base_addr, vindex, scale) simde_mm_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mask_i64gather_ps(simde__m128 src, const simde_float32* base_addr, simde__m128i vindex, simde__m128 mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - simde__m128_private - src_ = simde__m128_to_private(src), - mask_ = simde__m128_to_private(mask), - r_ = simde__m128_to_private(simde_mm_setzero_ps()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - if ((mask_.i32[i] >> 31) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float32 dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.f32[i] = dst; - } - else { - r_.f32[i] = src_.f32[i]; - } - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_mask_i64gather_ps(src, base_addr, vindex, mask, scale) 
_mm_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, float32_t const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_i64gather_ps - #define _mm_mask_i64gather_ps(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm256_i64gather_ps(const simde_float32* base_addr, simde__m256i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex); - simde__m128_private - r_ = simde__m128_to_private(simde_mm_setzero_ps()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float32 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f32[i] = dst; - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_i64gather_ps(base_addr, vindex, scale) _mm256_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_i64gather_ps - #define _mm256_i64gather_ps(base_addr, vindex, scale) simde_mm256_i64gather_ps(SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm256_mask_i64gather_ps(simde__m128 src, const simde_float32* base_addr, simde__m256i vindex, simde__m128 mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex); - simde__m128_private - src_ = simde__m128_to_private(src), - mask_ = simde__m128_to_private(mask), - r_ = simde__m128_to_private(simde_mm_setzero_ps()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - if ((mask_.i32[i] >> 31) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float32 dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.f32[i] = dst; - } - else { - r_.f32[i] = src_.f32[i]; - } - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_mask_i64gather_ps(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(float const*, simde_float32 const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_i64gather_ps - #define _mm256_mask_i64gather_ps(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_ps(src, SIMDE_CHECKED_REINTERPRET_CAST(simde_float32 const*, float const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_i32gather_pd(const simde_float64* base_addr, simde__m128i vindex, const int32_t scale) - 
SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - simde__m128d_private - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float64 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f64[i] = dst; - } - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_i32gather_pd(base_addr, vindex, scale) _mm_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_i32gather_pd - #define _mm_i32gather_pd(base_addr, vindex, scale) simde_mm_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mask_i32gather_pd(simde__m128d src, const simde_float64* base_addr, simde__m128i vindex, simde__m128d mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - simde__m128d_private - src_ = simde__m128d_to_private(src), - mask_ = simde__m128d_to_private(mask), - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - if ((mask_.i64[i] >> 63) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float64 dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.f64[i] = dst; - } - else { - r_.f64[i] = src_.f64[i]; - } - } - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_mask_i32gather_pd(src, base_addr, vindex, mask, scale) _mm_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_i32gather_pd - #define _mm_mask_i32gather_pd(src, base_addr, vindex, mask, scale) simde_mm_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_i32gather_pd(const simde_float64* base_addr, simde__m128i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - simde__m256d_private - r_; - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float64 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f64[i] = dst; - } - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_i32gather_pd(base_addr, vindex, scale) 
_mm256_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_i32gather_pd - #define _mm256_i32gather_pd(base_addr, vindex, scale) simde_mm256_i32gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_mask_i32gather_pd(simde__m256d src, const simde_float64* base_addr, simde__m128i vindex, simde__m256d mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256d_private - src_ = simde__m256d_to_private(src), - mask_ = simde__m256d_to_private(mask), - r_; - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - if ((mask_.i64[i] >> 63) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float64 dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.f64[i] = dst; - } - else { - r_.f64[i] = src_.f64[i]; - } - } - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_mask_i32gather_pd(src, base_addr, vindex, mask, scale) _mm256_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_i32gather_pd - #define _mm256_mask_i32gather_pd(src, base_addr, vindex, mask, scale) simde_mm256_mask_i32gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_i64gather_pd(const simde_float64* base_addr, simde__m128i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = simde__m128i_to_private(vindex); - simde__m128d_private - r_ = simde__m128d_to_private(simde_mm_setzero_pd()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float64 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f64[i] = dst; - } - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_i64gather_pd(base_addr, vindex, scale) _mm_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_i64gather_pd - #define _mm_i64gather_pd(base_addr, vindex, scale) simde_mm_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mask_i64gather_pd(simde__m128d src, const simde_float64* base_addr, simde__m128i vindex, simde__m128d mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m128i_private - vindex_ = 
simde__m128i_to_private(vindex); - simde__m128d_private - src_ = simde__m128d_to_private(src), - mask_ = simde__m128d_to_private(mask), - r_ = simde__m128d_to_private(simde_mm_setzero_pd()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - if ((mask_.i64[i] >> 63) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float64 dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.f64[i] = dst; - } - else { - r_.f64[i] = src_.f64[i]; - } - } - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_mask_i64gather_pd(src, base_addr, vindex, mask, scale) _mm_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_i64gather_pd - #define _mm_mask_i64gather_pd(src, base_addr, vindex, mask, scale) simde_mm_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_i64gather_pd(const simde_float64* base_addr, simde__m256i vindex, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex); - simde__m256d_private - r_ = simde__m256d_to_private(simde_mm256_setzero_pd()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float64 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f64[i] = dst; - } - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_i64gather_pd(base_addr, vindex, scale) _mm256_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_i64gather_pd - #define _mm256_i64gather_pd(base_addr, vindex, scale) simde_mm256_i64gather_pd(HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_mask_i64gather_pd(simde__m256d src, const simde_float64* base_addr, simde__m256i vindex, simde__m256d mask, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m256i_private - vindex_ = simde__m256i_to_private(vindex); - simde__m256d_private - src_ = simde__m256d_to_private(src), - mask_ = simde__m256d_to_private(mask), - r_ = simde__m256d_to_private(simde_mm256_setzero_pd()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - if ((mask_.i64[i] >> 63) & 1) { - const uint8_t* src1 = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float64 dst; - simde_memcpy(&dst, src1, sizeof(dst)); - r_.f64[i] = dst; - } - else { - r_.f64[i] = src_.f64[i]; - } - } - - return 
simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale) _mm256_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_i64gather_pd - #define _mm256_mask_i64gather_pd(src, base_addr, vindex, mask, scale) simde_mm256_mask_i64gather_pd(src, HEDLEY_REINTERPRET_CAST(simde_float64 const*, base_addr), vindex, mask, scale) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_inserti128_si256(simde__m256i a, simde__m128i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m256i_private a_ = simde__m256i_to_private(a); - simde__m128i_private b_ = simde__m128i_to_private(b); - - a_.m128i_private[ imm8 & 1 ] = b_; - - return simde__m256i_from_private(a_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_inserti128_si256(a, b, imm8) _mm256_inserti128_si256(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_inserti128_si256 - #define _mm256_inserti128_si256(a, b, imm8) simde_mm256_inserti128_si256(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_madd_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_madd_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_madd_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_madd_epi16(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) - SIMDE_ALIGN_TO_32 int32_t product SIMDE_VECTOR(64); - SIMDE_ALIGN_TO_32 int32_t a32x16 SIMDE_VECTOR(64); - SIMDE_ALIGN_TO_32 int32_t b32x16 SIMDE_VECTOR(64); - SIMDE_ALIGN_TO_32 int32_t even SIMDE_VECTOR(32); - SIMDE_ALIGN_TO_32 int32_t odd SIMDE_VECTOR(32); - - SIMDE_CONVERT_VECTOR_(a32x16, a_.i16); - SIMDE_CONVERT_VECTOR_(b32x16, b_.i16); - product = a32x16 * b32x16; - - even = __builtin_shufflevector(product, product, 0, 2, 4, 6, 8, 10, 12, 14); - odd = __builtin_shufflevector(product, product, 1, 3, 5, 7, 9, 11, 13, 15); - - r_.i32 = even + odd; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) { - r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_madd_epi16 - #define _mm256_madd_epi16(a, b) simde_mm256_madd_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maddubs_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_maddubs_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_maddubs_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_maddubs_epi16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - const int idx = HEDLEY_STATIC_CAST(int, i) << 1; - int32_t ts = - (HEDLEY_STATIC_CAST(int16_t, a_.u8[ idx ]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[ idx ])) + - (HEDLEY_STATIC_CAST(int16_t, a_.u8[idx + 1]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[idx + 1])); 
- r_.i16[i] = (ts > INT16_MIN) ? ((ts < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ts) : INT16_MAX) : INT16_MIN; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_maddubs_epi16 - #define _mm256_maddubs_epi16(a, b) simde_mm256_maddubs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskload_epi32 (const int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm_maskload_epi32(mem_addr, mask); - #else - simde__m128i_private - r_, - mask_ = simde__m128i_to_private(mask), - mask_shr_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - mask_shr_.neon_i32 = vshrq_n_s32(mask_.neon_i32, 31); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - mask_shr_.i32[i] = mask_.i32[i] >> 31; - } - #endif - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = mask_shr_.i32[i] ? mem_addr[i] : INT32_C(0); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_maskload_epi32 - #define _mm_maskload_epi32(mem_addr, mask) simde_mm_maskload_epi32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr), mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskload_epi32 (const int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_maskload_epi32(mem_addr, mask); - #else - simde__m256i_private - mask_ = simde__m256i_to_private(mask), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (mask_.i32[i] >> 31) ? mem_addr[i] : INT32_C(0); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskload_epi32 - #define _mm256_maskload_epi32(mem_addr, mask) simde_mm256_maskload_epi32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr), mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskload_epi64 (const int64_t mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm_maskload_epi64(HEDLEY_REINTERPRET_CAST(const long long *, mem_addr), mask); - #else - simde__m128i_private - r_, - mask_ = simde__m128i_to_private(mask), - mask_shr_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - mask_shr_.neon_i64 = vshrq_n_s64(mask_.neon_i64, 63); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(mask_.i64) / sizeof(mask_.i64[0])) ; i++) { - mask_shr_.i64[i] = mask_.i64[i] >> 63; - } - #endif - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = mask_shr_.i64[i] ? 
mem_addr[i] : INT64_C(0); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_maskload_epi64 - #define _mm_maskload_epi64(mem_addr, mask) simde_mm_maskload_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr), mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskload_epi64 (const int64_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_maskload_epi64(HEDLEY_REINTERPRET_CAST(const long long *, mem_addr), mask); - #else - simde__m256i_private - mask_ = simde__m256i_to_private(mask), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (mask_.i64[i] >> 63) ? mem_addr[i] : INT64_C(0); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskload_epi64 - #define _mm256_maskload_epi64(mem_addr, mask) simde_mm256_maskload_epi64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr), mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_maskstore_epi32 (int32_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m128i mask, simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - _mm_maskstore_epi32(mem_addr, mask, a); - #else - simde__m128i_private mask_ = simde__m128i_to_private(mask); - simde__m128i_private a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - if (mask_.u32[i] & (UINT32_C(1) << 31)) - mem_addr[i] = a_.i32[i]; - } - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_maskstore_epi32 - #define _mm_maskstore_epi32(mem_addr, mask, a) simde_mm_maskstore_epi32(HEDLEY_REINTERPRET_CAST(int32_t *, mem_addr), mask, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_maskstore_epi32 (int32_t mem_addr[HEDLEY_ARRAY_PARAM(8)], simde__m256i mask, simde__m256i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - _mm256_maskstore_epi32(mem_addr, mask, a); - #else - simde__m256i_private mask_ = simde__m256i_to_private(mask); - simde__m256i_private a_ = simde__m256i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - if (mask_.u32[i] & (UINT32_C(1) << 31)) - mem_addr[i] = a_.i32[i]; - } - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskstore_epi32 - #define _mm256_maskstore_epi32(mem_addr, mask, a) simde_mm256_maskstore_epi32(HEDLEY_REINTERPRET_CAST(int32_t *, mem_addr), mask, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_maskstore_epi64 (int64_t mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128i mask, simde__m128i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - _mm_maskstore_epi64(HEDLEY_REINTERPRET_CAST(long long *, mem_addr), mask, a); - #else - simde__m128i_private mask_ = simde__m128i_to_private(mask); - simde__m128i_private a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - if (mask_.u64[i] >> 63) - mem_addr[i] = a_.i64[i]; - } - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_maskstore_epi64 - #define _mm_maskstore_epi64(mem_addr, mask, a) simde_mm_maskstore_epi64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), mask, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_maskstore_epi64 (int64_t mem_addr[HEDLEY_ARRAY_PARAM(4)], simde__m256i mask, simde__m256i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - 
_mm256_maskstore_epi64(HEDLEY_REINTERPRET_CAST(long long *, mem_addr), mask, a); - #else - simde__m256i_private mask_ = simde__m256i_to_private(mask); - simde__m256i_private a_ = simde__m256i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - if (mask_.u64[i] & (UINT64_C(1) << 63)) - mem_addr[i] = a_.i64[i]; - } - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskstore_epi64 - #define _mm256_maskstore_epi64(mem_addr, mask, a) simde_mm256_maskstore_epi64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), mask, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_max_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) && !defined(__PGI) - return _mm256_max_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_max_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_max_epi8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] > b_.i8[i] ? a_.i8[i] : b_.i8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_max_epi8 - #define _mm256_max_epi8(a, b) simde_mm256_max_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_max_epu8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_max_epu8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_max_epu8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_max_epu8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_max_epu8 - #define _mm256_max_epu8(a, b) simde_mm256_max_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_max_epu16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_max_epu16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_max_epu16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_max_epu16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] > b_.u16[i]) ? 
a_.u16[i] : b_.u16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_max_epu16 - #define _mm256_max_epu16(a, b) simde_mm256_max_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_max_epu32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_max_epu32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_max_epu32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_max_epu32(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] > b_.u32[i]) ? a_.u32[i] : b_.u32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_max_epu32 - #define _mm256_max_epu32(a, b) simde_mm256_max_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_max_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_max_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_max_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_max_epi16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_max_epi16 - #define _mm256_max_epi16(a, b) simde_mm256_max_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_max_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_max_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_max_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_max_epi32(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] > b_.i32[i] ? a_.i32[i] : b_.i32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_max_epi32 - #define _mm256_max_epi32(a, b) simde_mm256_max_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_min_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) && !defined(__PGI) - return _mm256_min_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_min_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_min_epi8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] < b_.i8[i] ? 
a_.i8[i] : b_.i8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_min_epi8 - #define _mm256_min_epi8(a, b) simde_mm256_min_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_min_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_min_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_min_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_min_epi16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_min_epi16 - #define _mm256_min_epi16(a, b) simde_mm256_min_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_min_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_min_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_min_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_min_epi32(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] < b_.i32[i] ? a_.i32[i] : b_.i32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_min_epi32 - #define _mm256_min_epi32(a, b) simde_mm256_min_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_min_epu8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_min_epu8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_min_epu8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_min_epu8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_min_epu8 - #define _mm256_min_epu8(a, b) simde_mm256_min_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_min_epu16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_min_epu16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_min_epu16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_min_epu16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] < b_.u16[i]) ? 
a_.u16[i] : b_.u16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_min_epu16 - #define _mm256_min_epu16(a, b) simde_mm256_min_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_min_epu32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_min_epu32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_min_epu32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_min_epu32(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] < b_.u32[i]) ? a_.u32[i] : b_.u32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_min_epu32 - #define _mm256_min_epu32(a, b) simde_mm256_min_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm256_movemask_epi8 (simde__m256i a) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_movemask_epi8(a); - #else - simde__m256i_private a_ = simde__m256i_to_private(a); - uint32_t r = 0; - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(a_.m128i) / sizeof(a_.m128i[0])) ; i++) { - r |= HEDLEY_STATIC_CAST(uint32_t,simde_mm_movemask_epi8(a_.m128i[i])) << (16 * i); - } - #else - r = 0; - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { - r |= HEDLEY_STATIC_CAST(uint32_t, (a_.u8[31 - i] >> 7)) << (31 - i); - } - #endif - - return HEDLEY_STATIC_CAST(int32_t, r); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_movemask_epi8 - #define _mm256_movemask_epi8(a) simde_mm256_movemask_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mpsadbw_epu8 (simde__m256i a, simde__m256i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - const int a_offset1 = imm8 & 4; - const int b_offset1 = (imm8 & 3) << 2; - const int a_offset2 = (imm8 >> 3) & 4; - const int b_offset2 = ((imm8 >> 3) & 3) << 2; - - #if defined(simde_math_abs) - const int halfway_point = HEDLEY_STATIC_CAST(int, (sizeof(r_.u16) / sizeof(r_.u16[0])) ) / 2; - for (int i = 0 ; i < halfway_point ; i++) { - r_.u16[i] = - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 0] - b_.u8[b_offset1 + 0]))) + - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 1] - b_.u8[b_offset1 + 1]))) + - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 2] - b_.u8[b_offset1 + 2]))) + - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset1 + i + 3] - b_.u8[b_offset1 + 3]))); - r_.u16[halfway_point + i] = - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 0] - b_.u8[2 * halfway_point + b_offset2 + 0]))) + - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 1] - b_.u8[2 * halfway_point + b_offset2 + 1]))) + - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 2] - b_.u8[2 * halfway_point + b_offset2 + 2]))) + - 
HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[2 * halfway_point + a_offset2 + i + 3] - b_.u8[2 * halfway_point + b_offset2 + 3]))); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) && SIMDE_DETECT_CLANG_VERSION_CHECK(3,9,0) - #define simde_mm256_mpsadbw_epu8(a, b, imm8) _mm256_mpsadbw_epu8(a, b, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - #define simde_mm256_mpsadbw_epu8(a, b, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_mpsadbw_epu8(simde_mm256_extracti128_si256(a, 1), simde_mm256_extracti128_si256(b, 1), (imm8 >> 3)), \ - simde_mm_mpsadbw_epu8(simde_mm256_extracti128_si256(a, 0), simde_mm256_extracti128_si256(b, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mpsadbw_epu8 - #define _mm256_mpsadbw_epu8(a, b, imm8) simde_mm256_mpsadbw_epu8(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mul_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_mul_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_mul_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_mul_epi32(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = - HEDLEY_STATIC_CAST(int64_t, a_.i32[i * 2]) * - HEDLEY_STATIC_CAST(int64_t, b_.i32[i * 2]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) -# define _mm256_mul_epi32(a, b) simde_mm256_mul_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mul_epu32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_mul_epu32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_mul_epu32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_mul_epu32(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) -# define _mm256_mul_epu32(a, b) simde_mm256_mul_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mulhi_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_mulhi_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (HEDLEY_STATIC_CAST(uint32_t, HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) >> 16)); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) -# define _mm256_mulhi_epi16(a, b) simde_mm256_mulhi_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mulhi_epu16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_mulhi_epu16(a, b); - #else - simde__m256i_private - r_, - a_ = 
simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) >> 16); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) -# define _mm256_mulhi_epu16(a, b) simde_mm256_mulhi_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mulhrs_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_mulhrs_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (((HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) + 0x4000) >> 15)); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) -# define _mm256_mulhrs_epi16(a, b) simde_mm256_mulhrs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mullo_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_mullo_epi16(a, b); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] * b_.i16[i]); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mullo_epi16 - #define _mm256_mullo_epi16(a, b) simde_mm256_mullo_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mullo_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_mullo_epi32(a, b); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.i32[i] * b_.i32[i]); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_mullo_epi32 - #define _mm256_mullo_epi32(a, b) simde_mm256_mullo_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_mullo_epu32 (simde__m256i a, simde__m256i b) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = a_.u32 * b_.u32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] * b_.u32[i]; - } - #endif - - return simde__m256i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_or_si256 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_or_si256(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_or_si128(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_or_si128(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f | b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) 
; i++) { - r_.i32f[i] = a_.i32f[i] | b_.i32f[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_or_si256 - #define _mm256_or_si256(a, b) simde_mm256_or_si256(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_packs_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_packs_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_packs_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_packs_epi16(a_.m128i[1], b_.m128i[1]); - #else - const size_t halfway_point = (sizeof(r_.i8) / sizeof(r_.i8[0]))/2; - const size_t quarter_point = (sizeof(r_.i8) / sizeof(r_.i8[0]))/4; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < quarter_point ; i++) { - r_.i8[i] = (a_.i16[i] > INT8_MAX) ? INT8_MAX : ((a_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[i])); - r_.i8[i + quarter_point] = (b_.i16[i] > INT8_MAX) ? INT8_MAX : ((b_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[i])); - r_.i8[halfway_point + i] = (a_.i16[quarter_point + i] > INT8_MAX) ? INT8_MAX : ((a_.i16[quarter_point + i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[quarter_point + i])); - r_.i8[halfway_point + i + quarter_point] = (b_.i16[quarter_point + i] > INT8_MAX) ? INT8_MAX : ((b_.i16[quarter_point + i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[quarter_point + i])); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_packs_epi16 - #define _mm256_packs_epi16(a, b) simde_mm256_packs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_packs_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_packs_epi32(a, b); - #else - simde__m256i_private - r_, - v_[] = { - simde__m256i_to_private(a), - simde__m256i_to_private(b) - }; - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_packs_epi32(v_[0].m128i[0], v_[1].m128i[0]); - r_.m128i[1] = simde_mm_packs_epi32(v_[0].m128i[1], v_[1].m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - const int32_t v = v_[(i >> 2) & 1].i32[(i & 11) - ((i & 8) >> 1)]; - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (v > INT16_MAX) ? INT16_MAX : ((v < INT16_MIN) ? INT16_MIN : v)); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_packs_epi32 - #define _mm256_packs_epi32(a, b) simde_mm256_packs_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_packus_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_packus_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_packus_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_packus_epi16(a_.m128i[1], b_.m128i[1]); - #else - const size_t halfway_point = (sizeof(r_.i8) / sizeof(r_.i8[0])) / 2; - const size_t quarter_point = (sizeof(r_.i8) / sizeof(r_.i8[0])) / 4; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < quarter_point ; i++) { - r_.u8[i] = (a_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[i] < 0) ? 
UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[i])); - r_.u8[i + quarter_point] = (b_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[i])); - r_.u8[halfway_point + i] = (a_.i16[quarter_point + i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[quarter_point + i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[quarter_point + i])); - r_.u8[halfway_point + i + quarter_point] = (b_.i16[quarter_point + i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[quarter_point + i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[quarter_point + i])); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_packus_epi16 - #define _mm256_packus_epi16(a, b) simde_mm256_packus_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_packus_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_packus_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_packus_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_packus_epi32(a_.m128i[1], b_.m128i[1]); - #else - const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2; - const size_t quarter_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 4; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < quarter_point ; i++) { - r_.u16[i] = (a_.i32[i] > UINT16_MAX) ? UINT16_MAX : ((a_.i32[i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[i])); - r_.u16[i + quarter_point] = (b_.i32[i] > UINT16_MAX) ? UINT16_MAX : ((b_.i32[i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[i])); - r_.u16[halfway_point + i] = (a_.i32[quarter_point + i] > UINT16_MAX) ? UINT16_MAX : ((a_.i32[quarter_point + i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[quarter_point + i])); - r_.u16[halfway_point + i + quarter_point] = (b_.i32[quarter_point + i] > UINT16_MAX) ? UINT16_MAX : ((b_.i32[quarter_point + i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[quarter_point + i])); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_packus_epi32 - #define _mm256_packus_epi32(a, b) simde_mm256_packus_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_permute2x128_si256 (simde__m256i a, simde__m256i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - r_.m128i_private[0] = (imm8 & 0x08) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x02) ? b_.m128i_private[(imm8 ) & 1] : a_.m128i_private[(imm8 ) & 1]); - r_.m128i_private[1] = (imm8 & 0x80) ? simde__m128i_to_private(simde_mm_setzero_si128()) : ((imm8 & 0x20) ? 
b_.m128i_private[(imm8 >> 4) & 1] : a_.m128i_private[(imm8 >> 4) & 1]); - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_permute2x128_si256(a, b, imm8) _mm256_permute2x128_si256(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_permute2x128_si256 - #define _mm256_permute2x128_si256(a, b, imm8) simde_mm256_permute2x128_si256(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_permute4x64_epi64 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - r_.i64[0] = (imm8 & 0x02) ? a_.i64[((imm8 ) & 1)+2] : a_.i64[(imm8 ) & 1]; - r_.i64[1] = (imm8 & 0x08) ? a_.i64[((imm8 >> 2 ) & 1)+2] : a_.i64[(imm8 >> 2 ) & 1]; - r_.i64[2] = (imm8 & 0x20) ? a_.i64[((imm8 >> 4 ) & 1)+2] : a_.i64[(imm8 >> 4 ) & 1]; - r_.i64[3] = (imm8 & 0x80) ? a_.i64[((imm8 >> 6 ) & 1)+2] : a_.i64[(imm8 >> 6 ) & 1]; - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_permute4x64_epi64(a, imm8) _mm256_permute4x64_epi64(a, imm8) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_permute4x64_epi64 - #define _mm256_permute4x64_epi64(a, imm8) simde_mm256_permute4x64_epi64(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_permute4x64_pd (simde__m256d a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - r_.f64[0] = (imm8 & 0x02) ? a_.f64[((imm8 ) & 1)+2] : a_.f64[(imm8 ) & 1]; - r_.f64[1] = (imm8 & 0x08) ? a_.f64[((imm8 >> 2 ) & 1)+2] : a_.f64[(imm8 >> 2 ) & 1]; - r_.f64[2] = (imm8 & 0x20) ? a_.f64[((imm8 >> 4 ) & 1)+2] : a_.f64[(imm8 >> 4 ) & 1]; - r_.f64[3] = (imm8 & 0x80) ? 
a_.f64[((imm8 >> 6 ) & 1)+2] : a_.f64[(imm8 >> 6 ) & 1]; - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_permute4x64_pd(a, imm8) _mm256_permute4x64_pd(a, imm8) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_permute4x64_pd - #define _mm256_permute4x64_pd(a, imm8) simde_mm256_permute4x64_pd(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_permutevar8x32_epi32 (simde__m256i a, simde__m256i idx) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_permutevar8x32_epi32(a, idx); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - idx_ = simde__m256i_to_private(idx); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[idx_.i32[i] & 7]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutevar8x32_epi32 - #define _mm256_permutevar8x32_epi32(a, idx) simde_mm256_permutevar8x32_epi32(a, idx) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_permutevar8x32_ps (simde__m256 a, simde__m256i idx) { - #if defined(SIMDE_X86_AVX2_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - return _mm256_permutevar8x32_ps(a, HEDLEY_REINTERPRET_CAST(simde__m256, idx)); - #else - return _mm256_permutevar8x32_ps(a, idx); - #endif - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - simde__m256i_private - idx_ = simde__m256i_to_private(idx); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[idx_.i32[i] & 7]; - } - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutevar8x32_ps - #define _mm256_permutevar8x32_ps(a, idx) simde_mm256_permutevar8x32_ps(a, idx) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sad_epu8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sad_epu8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sad_epu8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_sad_epu8(a_.m128i[1], b_.m128i[1]); - #else - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - uint16_t tmp = 0; - SIMDE_VECTORIZE_REDUCTION(+:tmp) - for (size_t j = 0 ; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 4) ; j++) { - const size_t e = j + (i * 8); - tmp += (a_.u8[e] > b_.u8[e]) ? (a_.u8[e] - b_.u8[e]) : (b_.u8[e] - a_.u8[e]); - } - r_.i64[i] = tmp; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sad_epu8 - #define _mm256_sad_epu8(a, b) simde_mm256_sad_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_shuffle_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_shuffle_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_shuffle_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_shuffle_epi8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2) ; i++) { - r_.u8[ i ] = (b_.u8[ i ] & 0x80) ? 
0 : a_.u8[(b_.u8[ i ] & 0x0f) ]; - r_.u8[i + 16] = (b_.u8[i + 16] & 0x80) ? 0 : a_.u8[(b_.u8[i + 16] & 0x0f) + 16]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_shuffle_epi8 - #define _mm256_shuffle_epi8(a, b) simde_mm256_shuffle_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_shuffle_epi32 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - for (size_t i = 0 ; i < ((sizeof(r_.i32) / sizeof(r_.i32[0])) / 2) ; i++) { - r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3]; - } - for (size_t i = 0 ; i < ((sizeof(r_.i32) / sizeof(r_.i32[0])) / 2) ; i++) { - r_.i32[i + 4] = a_.i32[((imm8 >> (i * 2)) & 3) + 4]; - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_shuffle_epi32(a, imm8) _mm256_shuffle_epi32(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) && !defined(__PGI) -# define simde_mm256_shuffle_epi32(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_shuffle_epi32(simde_mm256_extracti128_si256(a, 0), (imm8))) -#elif defined(SIMDE_SHUFFLE_VECTOR_) -# define simde_mm256_shuffle_epi32(a, imm8) (__extension__ ({ \ - const simde__m256i_private simde_tmp_a_ = simde__m256i_to_private(a); \ - simde__m256i_from_private((simde__m256i_private) { .i32 = \ - SIMDE_SHUFFLE_VECTOR_(32, 32, \ - (simde_tmp_a_).i32, \ - (simde_tmp_a_).i32, \ - ((imm8) ) & 3, \ - ((imm8) >> 2) & 3, \ - ((imm8) >> 4) & 3, \ - ((imm8) >> 6) & 3, \ - (((imm8) ) & 3) + 4, \ - (((imm8) >> 2) & 3) + 4, \ - (((imm8) >> 4) & 3) + 4, \ - (((imm8) >> 6) & 3) + 4) }); })) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_shuffle_epi32 - #define _mm256_shuffle_epi32(a, imm8) simde_mm256_shuffle_epi32(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_shufflehi_epi16(a, imm8) _mm256_shufflehi_epi16(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_shufflehi_epi16(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 0), (imm8))) -#elif defined(SIMDE_SHUFFLE_VECTOR_) -# define simde_mm256_shufflehi_epi16(a, imm8) (__extension__ ({ \ - const simde__m256i_private simde_tmp_a_ = simde__m256i_to_private(a); \ - simde__m256i_from_private((simde__m256i_private) { .i16 = \ - SIMDE_SHUFFLE_VECTOR_(16, 32, \ - (simde_tmp_a_).i16, \ - (simde_tmp_a_).i16, \ - 0, 1, 2, 3, \ - (((imm8) ) & 3) + 4, \ - (((imm8) >> 2) & 3) + 4, \ - (((imm8) >> 4) & 3) + 4, \ - (((imm8) >> 6) & 3) + 4, \ - 8, 9, 10, 11, \ - ((((imm8) ) & 3) + 8 + 4), \ - ((((imm8) >> 2) & 3) + 8 + 4), \ - ((((imm8) >> 4) & 3) + 8 + 4), \ - ((((imm8) >> 6) & 3) + 8 + 4) \ - ) }); })) -#else -# define simde_mm256_shufflehi_epi16(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 1), imm8), \ - simde_mm_shufflehi_epi16(simde_mm256_extracti128_si256(a, 0), imm8)) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_shufflehi_epi16 - #define _mm256_shufflehi_epi16(a, imm8) simde_mm256_shufflehi_epi16(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_shufflelo_epi16(a, imm8) _mm256_shufflelo_epi16(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define 
simde_mm256_shufflelo_epi16(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), (imm8))) -#elif defined(SIMDE_SHUFFLE_VECTOR_) -# define simde_mm256_shufflelo_epi16(a, imm8) (__extension__ ({ \ - const simde__m256i_private simde_tmp_a_ = simde__m256i_to_private(a); \ - simde__m256i_from_private((simde__m256i_private) { .i16 = \ - SIMDE_SHUFFLE_VECTOR_(16, 32, \ - (simde_tmp_a_).i16, \ - (simde_tmp_a_).i16, \ - (((imm8) ) & 3), \ - (((imm8) >> 2) & 3), \ - (((imm8) >> 4) & 3), \ - (((imm8) >> 6) & 3), \ - 4, 5, 6, 7, \ - ((((imm8) ) & 3) + 8), \ - ((((imm8) >> 2) & 3) + 8), \ - ((((imm8) >> 4) & 3) + 8), \ - ((((imm8) >> 6) & 3) + 8), \ - 12, 13, 14, 15) }); })) -#else -# define simde_mm256_shufflelo_epi16(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 1), imm8), \ - simde_mm_shufflelo_epi16(simde_mm256_extracti128_si256(a, 0), imm8)) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_shufflelo_epi16 - #define _mm256_shufflelo_epi16(a, imm8) simde_mm256_shufflelo_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sign_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sign_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (b_.i8[i] < INT32_C(0)) ? -a_.i8[i] : a_.i8[i]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sign_epi8 - #define _mm256_sign_epi8(a, b) simde_mm256_sign_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sign_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sign_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (b_.i16[i] < INT32_C(0)) ? -a_.i16[i] : a_.i16[i]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sign_epi16 - #define _mm256_sign_epi16(a, b) simde_mm256_sign_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sign_epi32(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sign_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { - r_.i32[i] = (b_.i32[i] < INT32_C(0)) ? 
-a_.i32[i] : a_.i32[i]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sign_epi32 - #define _mm256_sign_epi32(a, b) simde_mm256_sign_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sll_epi16 (simde__m256i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sll_epi16(a, count); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sll_epi16(a_.m128i[0], count); - r_.m128i[1] = simde_mm_sll_epi16(a_.m128i[1], count); - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); - if (shift > 15) - return simde_mm256_setzero_si256(); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << (shift)); - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sll_epi16 - #define _mm256_sll_epi16(a, count) simde_mm256_sll_epi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sll_epi32 (simde__m256i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sll_epi32(a, count); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sll_epi32(a_.m128i[0], count); - r_.m128i[1] = simde_mm_sll_epi32(a_.m128i[1], count); - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); - if (shift > 31) - return simde_mm256_setzero_si256(); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 << HEDLEY_STATIC_CAST(int32_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.i32[i] << (shift)); - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sll_epi32 - #define _mm256_sll_epi32(a, count) simde_mm256_sll_epi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sll_epi64 (simde__m256i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sll_epi64(a, count); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sll_epi64(a_.m128i[0], count); - r_.m128i[1] = simde_mm_sll_epi64(a_.m128i[1], count); - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); - if (shift > 63) - return simde_mm256_setzero_si256(); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i64 = a_.i64 << HEDLEY_STATIC_CAST(int64_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = HEDLEY_STATIC_CAST(int64_t, a_.i64[i] << (shift)); - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sll_epi64 - #define _mm256_sll_epi64(a, count) simde_mm256_sll_epi64(a, count) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_slli_epi16 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - /* Note: There is no consistency in how compilers handle values outside of - the expected range, hence the discrepancy between what we allow and what - Intel specifies. Some compilers will return 0, others seem to just mask - off everything outside of the range. */ - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned short, imm8)); - for (size_t i = 0 ; i < (sizeof(a_.altivec_i16) / sizeof(a_.altivec_i16[0])) ; i++) { - r_.altivec_i16[i] = vec_sl(a_.altivec_i16[i], sv); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, imm8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << (imm8 & 0xff)); - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_slli_epi16(a, imm8) _mm256_slli_epi16(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_slli_epi16(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_slli_epi16(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_slli_epi16 - #define _mm256_slli_epi16(a, imm8) simde_mm256_slli_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_slli_epi32 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8)); - for (size_t i = 0 ; i < (sizeof(a_.altivec_i32) / sizeof(a_.altivec_i32[0])) ; i++) { - r_.altivec_i32[i] = vec_sl(a_.altivec_i32[i], sv); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 << HEDLEY_STATIC_CAST(int32_t, imm8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] << (imm8 & 0xff); - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_slli_epi32(a, imm8) _mm256_slli_epi32(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_slli_epi32(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_slli_epi32(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_slli_epi32 - #define _mm256_slli_epi32(a, imm8) simde_mm256_slli_epi32(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_slli_epi64 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - -#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i64 = a_.i64 << HEDLEY_STATIC_CAST(int64_t, imm8); -#else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] << (imm8 & 0xff); - } -#endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_slli_epi64(a, imm8) _mm256_slli_epi64(a, imm8) -#elif 
SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_slli_epi64(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_slli_epi64(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_slli_epi64 - #define _mm256_slli_epi64(a, imm8) simde_mm256_slli_epi64(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_slli_si256 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - for (size_t h = 0 ; h < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; h++) { - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) { - const int e = HEDLEY_STATIC_CAST(int, i) - imm8; - r_.m128i_private[h].i8[i] = (e >= 0) ? a_.m128i_private[h].i8[e] : 0; - } - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_slli_si256(a, imm8) _mm256_slli_si256(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) && !defined(__PGI) -# define simde_mm256_slli_si256(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_slli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_slli_si128(simde_mm256_extracti128_si256(a, 0), (imm8))) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm256_slli_si256(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_bslli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_bslli_si128(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_slli_si256 - #define _mm256_slli_si256(a, imm8) simde_mm256_slli_si256(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sllv_epi32 (simde__m128i a, simde__m128i b) { - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vshlq_u32(a_.neon_u32, vreinterpretq_s32_u32(b_.neon_u32)); - r_.neon_u32 = vandq_u32(r_.neon_u32, vcltq_u32(b_.neon_u32, vdupq_n_u32(32))); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < UINT32_C(32))) & (a_.u32 << b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (b_.u32[i] < 32) ? (a_.u32[i] << b_.u32[i]) : 0; - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_sllv_epi32(a, b) _mm_sllv_epi32(a, b) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_sllv_epi32 - #define _mm_sllv_epi32(a, b) simde_mm_sllv_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sllv_epi32 (simde__m256i a, simde__m256i b) { - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b), - r_; - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sllv_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_sllv_epi32(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < 32)) & (a_.u32 << b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (b_.u32[i] < 32) ? 
(a_.u32[i] << b_.u32[i]) : 0; - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_sllv_epi32(a, b) _mm256_sllv_epi32(a, b) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sllv_epi32 - #define _mm256_sllv_epi32(a, b) simde_mm256_sllv_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sllv_epi64 (simde__m128i a, simde__m128i b) { - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vshlq_u64(a_.neon_u64, vreinterpretq_s64_u64(b_.neon_u64)); - r_.neon_u64 = vandq_u64(r_.neon_u64, vcltq_u64(b_.neon_u64, vdupq_n_u64(64))); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 << b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] << b_.u64[i]) : 0; - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_sllv_epi64(a, b) _mm_sllv_epi64(a, b) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_sllv_epi64 - #define _mm_sllv_epi64(a, b) simde_mm_sllv_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sllv_epi64 (simde__m256i a, simde__m256i b) { - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b), - r_; - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sllv_epi64(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_sllv_epi64(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 << b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (b_.u64[i] < 64) ? 
(a_.u64[i] << b_.u64[i]) : 0; - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_sllv_epi64(a, b) _mm256_sllv_epi64(a, b) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sllv_epi64 - #define _mm256_sllv_epi64(a, b) simde_mm256_sllv_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sra_epi16 (simde__m256i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sra_epi16(a, count); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sra_epi16(a_.m128i[0], count); - r_.m128i[1] = simde_mm_sra_epi16(a_.m128i[1], count); - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); - - if (shift > 15) shift = 15; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 >> HEDLEY_STATIC_CAST(int16_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] >> shift; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sra_epi16 - #define _mm256_sra_epi16(a, count) simde_mm256_sra_epi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sra_epi32 (simde__m256i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sra_epi32(a, count); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sra_epi32(a_.m128i[0], count); - r_.m128i[1] = simde_mm_sra_epi32(a_.m128i[1], count); - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); - - if (shift > 31) shift = 31; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 >> HEDLEY_STATIC_CAST(int16_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] >> shift; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sra_epi32 - #define _mm256_sra_epi32(a, count) simde_mm256_sra_epi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srai_epi16 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - unsigned int shift = HEDLEY_STATIC_CAST(unsigned int, imm8); - - if (shift > 15) shift = 15; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 >> HEDLEY_STATIC_CAST(int16_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] >> shift; - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_srai_epi16(a, imm8) _mm256_srai_epi16(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_srai_epi16(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_srai_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_srai_epi16(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srai_epi16 - #define _mm256_srai_epi16(a, imm8) simde_mm256_srai_epi16(a, imm8) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srai_epi32 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - unsigned int shift = HEDLEY_STATIC_CAST(unsigned int, imm8); - - if (shift > 31) shift = 31; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 >> HEDLEY_STATIC_CAST(int16_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] >> shift; - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_srai_epi32(a, imm8) _mm256_srai_epi32(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_srai_epi32(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_srai_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_srai_epi32(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srai_epi32 - #define _mm256_srai_epi32(a, imm8) simde_mm256_srai_epi32(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srav_epi32 (simde__m128i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm_srav_epi32(a, count); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - count_ = simde__m128i_to_private(count); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x4_t cnt = vreinterpretq_s32_u32(vminq_u32(count_.neon_u32, vdupq_n_u32(31))); - r_.neon_i32 = vshlq_s32(a_.neon_i32, vnegq_s32(cnt)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - uint32_t shift = HEDLEY_STATIC_CAST(uint32_t, count_.i32[i]); - r_.i32[i] = a_.i32[i] >> HEDLEY_STATIC_CAST(int, shift > 31 ? 31 : shift); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_srav_epi32 - #define _mm_srav_epi32(a, count) simde_mm_srav_epi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srav_epi32 (simde__m256i a, simde__m256i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_srav_epi32(a, count); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - count_ = simde__m256i_to_private(count); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_srav_epi32(a_.m128i[0], count_.m128i[0]); - r_.m128i[1] = simde_mm_srav_epi32(a_.m128i[1], count_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - uint32_t shift = HEDLEY_STATIC_CAST(uint32_t, count_.i32[i]); - if (shift > 31) shift = 31; - r_.i32[i] = a_.i32[i] >> shift; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srav_epi32 - #define _mm256_srav_epi32(a, count) simde_mm256_srav_epi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srl_epi16 (simde__m256i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_srl_epi16(a, count); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_srl_epi16(a_.m128i[0], count); - r_.m128i[1] = simde_mm_srl_epi16(a_.m128i[1], count); - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t , (count_.i64[0] > 16 ? 
16 : count_.i64[0])); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.u16[i] = a_.u16[i] >> (shift); - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srl_epi16 - #define _mm256_srl_epi16(a, count) simde_mm256_srl_epi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srl_epi32 (simde__m256i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_srl_epi32(a, count); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_srl_epi32(a_.m128i[0], count); - r_.m128i[1] = simde_mm_srl_epi32(a_.m128i[1], count); - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t , (count_.i64[0] > 32 ? 32 : count_.i64[0])); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.u32[i] = a_.u32[i] >> (shift); - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srl_epi32 - #define _mm256_srl_epi32(a, count) simde_mm256_srl_epi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srl_epi64 (simde__m256i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_srl_epi64(a, count); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_srl_epi64(a_.m128i[0], count); - r_.m128i[1] = simde_mm_srl_epi64(a_.m128i[1], count); - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t , (count_.i64[0] > 64 ? 
64 : count_.i64[0])); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(64, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.u64[i] = a_.u64[i] >> (shift); - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srl_epi64 - #define _mm256_srl_epi64(a, count) simde_mm256_srl_epi64(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srli_epi16 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - if (imm8 > 15) - return simde_mm256_setzero_si256(); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned short, imm8)); - for (size_t i = 0 ; i < (sizeof(a_.altivec_u16) / sizeof(a_.altivec_u16[0])) ; i++) { - r_.altivec_u16[i] = vec_sr(a_.altivec_u16[i], sv); - } - #else - if (HEDLEY_STATIC_CAST(unsigned int, imm8) > 15) { - simde_memset(&r_, 0, sizeof(r_)); - } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, imm8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = a_.u16[i] >> imm8; - } - #endif - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_srli_epi16(a, imm8) _mm256_srli_epi16(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_srli_epi16(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_srli_epi16(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_srli_epi16(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srli_epi16 - #define _mm256_srli_epi16(a, imm8) simde_mm256_srli_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srli_epi32 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) sv = vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8)); - for (size_t i = 0 ; i < (sizeof(a_.altivec_u32) / sizeof(a_.altivec_u32[0])) ; i++) { - r_.altivec_u32[i] = vec_sr(a_.altivec_u32[i], sv); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, imm8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] >> imm8; - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_srli_epi32(a, imm8) _mm256_srli_epi32(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_srli_epi32(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_srli_epi32(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srli_epi32 - #define _mm256_srli_epi32(a, imm8) simde_mm256_srli_epi32(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srli_epi64 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - -#if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - 
r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(32, imm8); -#else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] >> imm8; - } -#endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_srli_epi64(a, imm8) _mm256_srli_epi64(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) -# define simde_mm256_srli_epi64(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_srli_epi64(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srli_epi64 - #define _mm256_srli_epi64(a, imm8) simde_mm256_srli_epi64(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srli_si256 (simde__m256i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - for (size_t h = 0 ; h < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; h++) { - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private[h].i8) / sizeof(r_.m128i_private[h].i8[0])) ; i++) { - const int e = imm8 + HEDLEY_STATIC_CAST(int, i); - r_.m128i_private[h].i8[i] = (e < 16) ? a_.m128i_private[h].i8[e] : 0; - } - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) -# define simde_mm256_srli_si256(a, imm8) _mm256_srli_si256(a, imm8) -#elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) && !defined(__PGI) -# define simde_mm256_srli_si256(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_srli_si128(simde_mm256_extracti128_si256(a, 0), (imm8))) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm256_srli_si256(a, imm8) \ - simde_mm256_set_m128i( \ - simde_mm_bsrli_si128(simde_mm256_extracti128_si256(a, 1), (imm8)), \ - simde_mm_bsrli_si128(simde_mm256_extracti128_si256(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srli_si256 - #define _mm256_srli_si256(a, imm8) simde_mm256_srli_si256(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srlv_epi32 (simde__m128i a, simde__m128i b) { - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - r_; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < 32)) & (a_.u32 >> b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (b_.u32[i] < 32) ? (a_.u32[i] >> b_.u32[i]) : 0; - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_srlv_epi32(a, b) _mm_srlv_epi32(a, b) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_srlv_epi32 - #define _mm_srlv_epi32(a, b) simde_mm_srlv_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srlv_epi32 (simde__m256i a, simde__m256i b) { - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b), - r_; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < 32)) & (a_.u32 >> b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (b_.u32[i] < 32) ? 
(a_.u32[i] >> b_.u32[i]) : 0; - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_srlv_epi32(a, b) _mm256_srlv_epi32(a, b) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srlv_epi32 - #define _mm256_srlv_epi32(a, b) simde_mm256_srlv_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srlv_epi64 (simde__m128i a, simde__m128i b) { - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - r_; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 >> b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] >> b_.u64[i]) : 0; - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm_srlv_epi64(a, b) _mm_srlv_epi64(a, b) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm_srlv_epi64 - #define _mm_srlv_epi64(a, b) simde_mm_srlv_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srlv_epi64 (simde__m256i a, simde__m256i b) { - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b), - r_; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 >> b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] >> b_.u64[i]) : 0; - } - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX2_NATIVE) - #define simde_mm256_srlv_epi64(a, b) _mm256_srlv_epi64(a, b) -#endif -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_srlv_epi64 - #define _mm256_srlv_epi64(a, b) simde_mm256_srlv_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_stream_load_si256 (const simde__m256i* mem_addr) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_stream_load_si256(HEDLEY_CONST_CAST(simde__m256i*, mem_addr)); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && defined(SIMDE_VECTOR_SUBSCRIPT) - return __builtin_nontemporal_load(mem_addr); - #else - simde__m256i r; - simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m256i), sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) -# define _mm256_stream_load_si256(mem_addr) simde_mm256_stream_load_si256(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sub_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sub_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sub_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_sub_epi8(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = a_.i8 - b_.i8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] - b_.i8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sub_epi8 - #define _mm256_sub_epi8(a, b) simde_mm256_sub_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sub_epi16 (simde__m256i a, simde__m256i b) { - #if 
defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sub_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sub_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_sub_epi16(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 - b_.i16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] - b_.i16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sub_epi16 - #define _mm256_sub_epi16(a, b) simde_mm256_sub_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_hsub_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_hsub_epi16(a, b); - #else - return simde_mm256_sub_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_hsub_epi16 - #define _mm256_hsub_epi16(a, b) simde_mm256_hsub_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sub_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sub_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sub_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_sub_epi32(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 - b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] - b_.i32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sub_epi32 - #define _mm256_sub_epi32(a, b) simde_mm256_sub_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_hsub_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_hsub_epi32(a, b); - #else - return simde_mm256_sub_epi32(simde_x_mm256_deinterleaveeven_epi32(a, b), simde_x_mm256_deinterleaveodd_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_hsub_epi32 - #define _mm256_hsub_epi32(a, b) simde_mm256_hsub_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_sub_epi64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_sub_epi64(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_sub_epi64(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_sub_epi64(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 - b_.i64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] - b_.i64[i]; - } - #endif - - return simde__m256i_from_private(r_); -#endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_sub_epi64 - #define _mm256_sub_epi64(a, b) simde_mm256_sub_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_sub_epu32 (simde__m256i a, simde__m256i b) { - simde__m256i_private - r_, - a_ 
= simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = a_.u32 - b_.u32; - #elif SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_x_mm_sub_epu32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_x_mm_sub_epu32(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] - b_.u32[i]; - } - #endif - - return simde__m256i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_subs_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_subs_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_subs_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_subs_epi8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = simde_math_subs_i8(a_.i8[i], b_.i8[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_subs_epi8 - #define _mm256_subs_epi8(a, b) simde_mm256_subs_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_subs_epi16(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_subs_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_subs_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_subs_epi16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = simde_math_subs_i16(a_.i16[i], b_.i16[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_subs_epi16 - #define _mm256_subs_epi16(a, b) simde_mm256_subs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_hsubs_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_hsubs_epi16(a, b); - #else - return simde_mm256_subs_epi16(simde_x_mm256_deinterleaveeven_epi16(a, b), simde_x_mm256_deinterleaveodd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_hsubs_epi16 - #define _mm256_hsubs_epi16(a, b) simde_mm256_hsubs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_subs_epu8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_subs_epu8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_subs_epu8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_subs_epu8(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = simde_math_subs_u8(a_.u8[i], b_.u8[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_subs_epu8 - #define _mm256_subs_epu8(a, b) simde_mm256_subs_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_subs_epu16(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - 
return _mm256_subs_epu16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_subs_epu16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_subs_epu16(a_.m128i[1], b_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = simde_math_subs_u16(a_.u16[i], b_.u16[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_subs_epu16 - #define _mm256_subs_epu16(a, b) simde_mm256_subs_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_x_mm256_test_all_ones (simde__m256i a) { - simde__m256i_private a_ = simde__m256i_to_private(a); - int r; - int_fast32_t r_ = ~HEDLEY_STATIC_CAST(int_fast32_t, 0); - - SIMDE_VECTORIZE_REDUCTION(&:r_) - for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { - r_ &= a_.i32f[i]; - } - - r = (r_ == ~HEDLEY_STATIC_CAST(int_fast32_t, 0)); - - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_unpacklo_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_unpacklo_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_unpacklo_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_unpacklo_epi8(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 32, a_.i8, b_.i8, - 0, 32, 1, 33, 2, 34, 3, 35, - 4, 36, 5, 37, 6, 38, 7, 39, - 16, 48, 17, 49, 18, 50, 19, 51, - 20, 52, 21, 53, 22, 54, 23, 55); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0]) / 2) ; i++) { - r_.i8[2 * i] = a_.i8[i + ~(~i | 7)]; - r_.i8[2 * i + 1] = b_.i8[i + ~(~i | 7)]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpacklo_epi8 - #define _mm256_unpacklo_epi8(a, b) simde_mm256_unpacklo_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_unpacklo_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_unpacklo_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_unpacklo_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_unpacklo_epi16(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 =SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, - 0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0]) / 2) ; i++) { - r_.i16[2 * i] = a_.i16[i + ~(~i | 3)]; - r_.i16[2 * i + 1] = b_.i16[i + ~(~i | 3)]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpacklo_epi16 - #define _mm256_unpacklo_epi16(a, b) simde_mm256_unpacklo_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_unpacklo_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_unpacklo_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if 
SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_unpacklo_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_unpacklo_epi32(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, - 0, 8, 1, 9, 4, 12, 5, 13); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0]) / 2) ; i++) { - r_.i32[2 * i] = a_.i32[i + ~(~i | 1)]; - r_.i32[2 * i + 1] = b_.i32[i + ~(~i | 1)]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpacklo_epi32 - #define _mm256_unpacklo_epi32(a, b) simde_mm256_unpacklo_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_unpacklo_epi64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_unpacklo_epi64(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_unpacklo_epi64(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_unpacklo_epi64(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.i64, b_.i64, 0, 4, 2, 6); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0]) / 2) ; i++) { - r_.i64[2 * i] = a_.i64[2 * i]; - r_.i64[2 * i + 1] = b_.i64[2 * i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpacklo_epi64 - #define _mm256_unpacklo_epi64(a, b) simde_mm256_unpacklo_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_unpackhi_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_unpackhi_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_unpackhi_epi8(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_unpackhi_epi8(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 32, a_.i8, b_.i8, - 8, 40, 9, 41, 10, 42, 11, 43, - 12, 44, 13, 45, 14, 46, 15, 47, - 24, 56, 25, 57, 26, 58, 27, 59, - 28, 60, 29, 61, 30, 62, 31, 63); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0]) / 2) ; i++) { - r_.i8[2 * i] = a_.i8[i + 8 + ~(~i | 7)]; - r_.i8[2 * i + 1] = b_.i8[i + 8 + ~(~i | 7)]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpackhi_epi8 - #define _mm256_unpackhi_epi8(a, b) simde_mm256_unpackhi_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_unpackhi_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_unpackhi_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_unpackhi_epi16(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_unpackhi_epi16(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, - 4, 20, 5, 21, 6, 22, 7, 23, - 12, 28, 13, 29, 14, 30, 15, 31); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0]) / 2) ; i++) { - r_.i16[2 * i] = a_.i16[i 
+ 4 + ~(~i | 3)]; - r_.i16[2 * i + 1] = b_.i16[i + 4 + ~(~i | 3)]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpackhi_epi16 - #define _mm256_unpackhi_epi16(a, b) simde_mm256_unpackhi_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_unpackhi_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_unpackhi_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_unpackhi_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_unpackhi_epi32(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, - 2, 10, 3, 11, 6, 14, 7, 15); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0]) / 2) ; i++) { - r_.i32[2 * i] = a_.i32[i + 2 + ~(~i | 1)]; - r_.i32[2 * i + 1] = b_.i32[i + 2 + ~(~i | 1)]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpackhi_epi32 - #define _mm256_unpackhi_epi32(a, b) simde_mm256_unpackhi_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_unpackhi_epi64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_unpackhi_epi64(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_unpackhi_epi64(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_unpackhi_epi64(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 32, a_.i64, b_.i64, 1, 5, 3, 7); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0]) / 2) ; i++) { - r_.i64[2 * i] = a_.i64[2 * i + 1]; - r_.i64[2 * i + 1] = b_.i64[2 * i + 1]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_unpackhi_epi64 - #define _mm256_unpackhi_epi64(a, b) simde_mm256_unpackhi_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_xor_si256 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_xor_si256(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_INT_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_xor_si128(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_xor_si128(a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] ^ b_.i64[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX2_ENABLE_NATIVE_ALIASES) - #undef _mm256_xor_si256 - #define _mm256_xor_si256(a, b) simde_mm256_xor_si256(a, b) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX2_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512.h b/ffi-deps/simde/simde/x86/avx512.h deleted file mode 100644 index 103b466..0000000 --- a/ffi-deps/simde/simde/x86/avx512.h +++ /dev/null @@ -1,149 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any 
person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - */ - -#if !defined(SIMDE_X86_AVX512_H) -#define SIMDE_X86_AVX512_H - -#include "avx512/types.h" - -#include "avx512/2intersect.h" -#include "avx512/4dpwssd.h" -#include "avx512/4dpwssds.h" -#include "avx512/abs.h" -#include "avx512/add.h" -#include "avx512/adds.h" -#include "avx512/and.h" -#include "avx512/andnot.h" -#include "avx512/avg.h" -#include "avx512/bitshuffle.h" -#include "avx512/blend.h" -#include "avx512/broadcast.h" -#include "avx512/cast.h" -#include "avx512/cmp.h" -#include "avx512/cmpeq.h" -#include "avx512/cmpge.h" -#include "avx512/cmpgt.h" -#include "avx512/cmple.h" -#include "avx512/cmplt.h" -#include "avx512/cmpneq.h" -#include "avx512/compress.h" -#include "avx512/conflict.h" -#include "avx512/copysign.h" -#include "avx512/cvt.h" -#include "avx512/cvtt.h" -#include "avx512/cvts.h" -#include "avx512/cvtus.h" -#include "avx512/dbsad.h" -#include "avx512/div.h" -#include "avx512/dpbf16.h" -#include "avx512/dpbusd.h" -#include "avx512/dpbusds.h" -#include "avx512/dpwssd.h" -#include "avx512/dpwssds.h" -#include "avx512/expand.h" -#include "avx512/extract.h" -#include "avx512/fixupimm.h" -#include "avx512/fixupimm_round.h" -#include "avx512/flushsubnormal.h" -#include "avx512/fmadd.h" -#include "avx512/fmsub.h" -#include "avx512/fnmadd.h" -#include "avx512/fnmsub.h" -#include "avx512/fpclass.h" -#include "avx512/gather.h" -#include "avx512/insert.h" -#include "avx512/kand.h" -#include "avx512/kshift.h" -#include "avx512/knot.h" -#include "avx512/kxor.h" -#include "avx512/load.h" -#include "avx512/loadu.h" -#include "avx512/lzcnt.h" -#include "avx512/madd.h" -#include "avx512/maddubs.h" -#include "avx512/max.h" -#include "avx512/min.h" -#include "avx512/mov.h" -#include "avx512/mov_mask.h" -#include "avx512/movm.h" -#include "avx512/mul.h" -#include "avx512/mulhi.h" -#include "avx512/mulhrs.h" -#include "avx512/mullo.h" -#include "avx512/multishift.h" -#include "avx512/negate.h" -#include "avx512/or.h" -#include "avx512/packs.h" -#include "avx512/packus.h" -#include "avx512/permutex.h" -#include "avx512/permutexvar.h" -#include "avx512/permutex2var.h" -#include "avx512/popcnt.h" -#include "avx512/range.h" -#include "avx512/range_round.h" -#include "avx512/rcp.h" -#include "avx512/reduce.h" -#include "avx512/rol.h" -#include "avx512/rolv.h" -#include "avx512/ror.h" -#include "avx512/rorv.h" -#include "avx512/round.h" -#include "avx512/roundscale.h" -#include "avx512/roundscale_round.h" -#include "avx512/sad.h" 
-#include "avx512/scalef.h" -#include "avx512/set.h" -#include "avx512/set1.h" -#include "avx512/set4.h" -#include "avx512/setr.h" -#include "avx512/setr4.h" -#include "avx512/setzero.h" -#include "avx512/setone.h" -#include "avx512/shldv.h" -#include "avx512/shuffle.h" -#include "avx512/sll.h" -#include "avx512/slli.h" -#include "avx512/sllv.h" -#include "avx512/sqrt.h" -#include "avx512/sra.h" -#include "avx512/srai.h" -#include "avx512/srav.h" -#include "avx512/srl.h" -#include "avx512/srli.h" -#include "avx512/srlv.h" -#include "avx512/store.h" -#include "avx512/storeu.h" -#include "avx512/sub.h" -#include "avx512/subs.h" -#include "avx512/ternarylogic.h" -#include "avx512/test.h" -#include "avx512/testn.h" -#include "avx512/unpacklo.h" -#include "avx512/unpackhi.h" -#include "avx512/xor.h" -#include "avx512/xorsign.h" - -#endif diff --git a/ffi-deps/simde/simde/x86/avx512/2intersect.h b/ffi-deps/simde/simde/x86/avx512/2intersect.h deleted file mode 100644 index 81b0ee1..0000000 --- a/ffi-deps/simde/simde/x86/avx512/2intersect.h +++ /dev/null @@ -1,249 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Ashleigh Newman-Jones - */ - -#if !defined(SIMDE_X86_AVX512_2INTERSECT_H) -#define SIMDE_X86_AVX512_2INTERSECT_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_2intersect_epi32(simde__m128i a, simde__m128i b, simde__mmask8 *k1, simde__mmask8 *k2) { - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - simde__mmask8 - k1_ = 0, - k2_ = 0; - - for (size_t i = 0 ; i < sizeof(a_.i32) / sizeof(a_.i32[0]) ; i++) { - #if defined(SIMDE_ENABLE_OPENMP) - #pragma omp simd reduction(|:k1_) reduction(|:k2_) - #else - SIMDE_VECTORIZE - #endif - for (size_t j = 0 ; j < sizeof(b_.i32) / sizeof(b_.i32[0]) ; j++) { - const int32_t m = a_.i32[i] == b_.i32[j]; - k1_ |= m << i; - k2_ |= m << j; - } - } - - *k1 = k1_; - *k2 = k2_; -} -#if defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_2intersect_epi32(a, b, k1, k2) _mm_2intersect_epi32(a, b, k1, k2) -#endif -#if defined(SIMDE_X86_AVX512VP2INTERSECT_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_2intersect_epi32 - #define _mm_2intersect_epi32(a, b, k1, k2) simde_mm_2intersect_epi32(a, b, k1, k2) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_2intersect_epi64(simde__m128i a, simde__m128i b, simde__mmask8 *k1, simde__mmask8 *k2) { - #if defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm_2intersect_epi64(a, b, k1, k2); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - simde__mmask8 - k1_ = 0, - k2_ = 0; - - for (size_t i = 0 ; i < sizeof(a_.i64) / sizeof(a_.i64[0]) ; i++) { - #if defined(SIMDE_ENABLE_OPENMP) - #pragma omp simd reduction(|:k1_) reduction(|:k2_) - #else - SIMDE_VECTORIZE - #endif - for (size_t j = 0 ; j < sizeof(b_.i64) / sizeof(b_.i64[0]) ; j++) { - const int32_t m = a_.i64[i] == b_.i64[j]; - k1_ |= m << i; - k2_ |= m << j; - } - } - - *k1 = k1_; - *k2 = k2_; - #endif -} -#if defined(SIMDE_X86_AVX512VP2INTERSECT_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_2intersect_epi64 - #define _mm_2intersect_epi64(a, b, k1, k2) simde_mm_2intersect_epi64(a, b, k1, k2) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_2intersect_epi32(simde__m256i a, simde__m256i b, simde__mmask8 *k1, simde__mmask8 *k2) { - #if defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_2intersect_epi32(a, b, k1, k2); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - simde__mmask8 - k1_ = 0, - k2_ = 0; - - for (size_t i = 0 ; i < sizeof(a_.i32) / sizeof(a_.i32[0]) ; i++) { - #if defined(SIMDE_ENABLE_OPENMP) - #pragma omp simd reduction(|:k1_) reduction(|:k2_) - #else - SIMDE_VECTORIZE - #endif - for (size_t j = 0 ; j < sizeof(b_.i32) / sizeof(b_.i32[0]) ; j++) { - const int32_t m = a_.i32[i] == b_.i32[j]; - k1_ |= m << i; - k2_ |= m << j; - } - } - - *k1 = k1_; - *k2 = k2_; - #endif -} -#if defined(SIMDE_X86_AVX512VP2INTERSECT_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_2intersect_epi32 - #define _mm256_2intersect_epi32(a, b, k1, k2) simde_mm256_2intersect_epi32(a, b, k1, k2) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_2intersect_epi64(simde__m256i a, simde__m256i b, simde__mmask8 *k1, simde__mmask8 *k2) { - 
#if defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_2intersect_epi64(a, b, k1, k2); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - simde__mmask8 - k1_ = 0, - k2_ = 0; - - for (size_t i = 0 ; i < sizeof(a_.i64) / sizeof(a_.i64[0]) ; i++) { - #if defined(SIMDE_ENABLE_OPENMP) - #pragma omp simd reduction(|:k1_) reduction(|:k2_) - #else - SIMDE_VECTORIZE - #endif - for (size_t j = 0 ; j < sizeof(b_.i64) / sizeof(b_.i64[0]) ; j++) { - const int32_t m = a_.i64[i] == b_.i64[j]; - k1_ |= m << i; - k2_ |= m << j; - } - } - - *k1 = k1_; - *k2 = k2_; - #endif -} -#if defined(SIMDE_X86_AVX512VP2INTERSECT_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_2intersect_epi64 - #define _mm256_2intersect_epi64(a, b, k1, k2) simde_mm256_2intersect_epi64(a, b, k1, k2) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_2intersect_epi32(simde__m512i a, simde__m512i b, simde__mmask16 *k1, simde__mmask16 *k2) { - #if defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) - _mm512_2intersect_epi32(a, b, k1, k2); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - simde__mmask16 - k1_ = 0, - k2_ = 0; - - for (size_t i = 0 ; i < sizeof(a_.i32) / sizeof(a_.i32[0]) ; i++) { - #if defined(SIMDE_ENABLE_OPENMP) - #pragma omp simd reduction(|:k1_) reduction(|:k2_) - #else - SIMDE_VECTORIZE - #endif - for (size_t j = 0 ; j < sizeof(b_.i32) / sizeof(b_.i32[0]) ; j++) { - const int32_t m = a_.i32[i] == b_.i32[j]; - k1_ |= m << i; - k2_ |= m << j; - } - } - - *k1 = k1_; - *k2 = k2_; - #endif -} -#if defined(SIMDE_X86_AVX512VP2INTERSECT_ENABLE_NATIVE_ALIASES) - #undef _mm512_2intersect_epi32 - #define _mm512_2intersect_epi32(a, b, k1, k2) simde_mm512_2intersect_epi32(a, b, k1, k2) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_2intersect_epi64(simde__m512i a, simde__m512i b, simde__mmask8 *k1, simde__mmask8 *k2) { - #if defined(SIMDE_X86_AVX512VP2INTERSECT_NATIVE) - _mm512_2intersect_epi64(a, b, k1, k2); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - simde__mmask8 - k1_ = 0, - k2_ = 0; - - for (size_t i = 0 ; i < sizeof(a_.i64) / sizeof(a_.i64[0]) ; i++) { - #if defined(SIMDE_ENABLE_OPENMP) - #pragma omp simd reduction(|:k1_) reduction(|:k2_) - #else - SIMDE_VECTORIZE - #endif - for (size_t j = 0 ; j < sizeof(b_.i64) / sizeof(b_.i64[0]) ; j++) { - const int32_t m = a_.i64[i] == b_.i64[j]; - k1_ |= m << i; - k2_ |= m << j; - } - } - - *k1 = k1_; - *k2 = k2_; - #endif -} -#if defined(SIMDE_X86_AVX512VP2INTERSECT_ENABLE_NATIVE_ALIASES) - #undef _mm512_2intersect_epi64 - #define _mm512_2intersect_epi64(a, b, k1, k2) simde_mm512_2intersect_epi64(a, b, k1, k2) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_2INTERSECT_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/4dpwssd.h b/ffi-deps/simde/simde/x86/avx512/4dpwssd.h deleted file mode 100644 index 2139099..0000000 --- a/ffi-deps/simde/simde/x86/avx512/4dpwssd.h +++ /dev/null @@ -1,67 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_4DPWSSD_H) -#define SIMDE_X86_AVX512_4DPWSSD_H - -#include "types.h" -#include "dpwssd.h" -#include "set1.h" -#include "mov.h" -#include "add.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_4dpwssd_epi32 (simde__m512i src, simde__m512i a0, simde__m512i a1, simde__m512i a2, 
simde__m512i a3, simde__m128i* b) { - #if defined(SIMDE_X86_AVX5124VNNIW_NATIVE) - return _mm512_4dpwssd_epi32(src, a0, a1, a2, a3, b); - #else - simde__m128i_private bv = simde__m128i_to_private(simde_mm_loadu_epi32(b)); - simde__m512i r; - - r = simde_mm512_dpwssd_epi32(src, a0, simde_mm512_set1_epi32(bv.i32[0])); - r = simde_mm512_add_epi32(simde_mm512_dpwssd_epi32(src, a1, simde_mm512_set1_epi32(bv.i32[1])), r); - r = simde_mm512_add_epi32(simde_mm512_dpwssd_epi32(src, a2, simde_mm512_set1_epi32(bv.i32[2])), r); - r = simde_mm512_add_epi32(simde_mm512_dpwssd_epi32(src, a3, simde_mm512_set1_epi32(bv.i32[3])), r); - - return r; - #endif -} -#if defined(SIMDE_X86_AVX5124VNNIW_ENABLE_NATIVE_ALIASES) - #undef simde_mm512_4dpwssd_epi32 - #define _mm512_4dpwssd_epi32(src, a0, a1, a2, a3, b) simde_mm512_4dpwssd_epi32(src, a0, a1, a2, a3, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_4dpwssd_epi32 (simde__m512i src, simde__mmask16 k, simde__m512i a0, simde__m512i a1, simde__m512i a2, simde__m512i a3, simde__m128i* b) { - #if defined(SIMDE_X86_AVX5124VNNIW_NATIVE) - return _mm512_mask_4dpwssd_epi32(src, k, a0, a1, a2, a3, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_4dpwssd_epi32(src, a0, a1, a2, a3, b)); - #endif -} -#if defined(SIMDE_X86_AVX5124VNNIW_ENABLE_NATIVE_ALIASES) - #undef simde_mm512_mask_4dpwssd_epi32 - #define _mm512_mask_4dpwssd_epi32(src, k, a0, a1, a2, a3, b) simde_mm512_mask_4dpwssd_epi32(src, k, a0, a1, a2, a3, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_4dpwssd_epi32 (simde__mmask16 k, simde__m512i src, simde__m512i a0, simde__m512i a1, simde__m512i a2, simde__m512i a3, simde__m128i* b) { - #if defined(SIMDE_X86_AVX5124VNNIW_NATIVE) - return _mm512_mask_4dpwssd_epi32(k, src, a0, a1, a2, a3, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_4dpwssd_epi32(src, a0, a1, a2, a3, b)); - #endif -} -#if defined(SIMDE_X86_AVX5124VNNIW_ENABLE_NATIVE_ALIASES) - #undef simde_mm512_maskz_4dpwssd_epi32 - #define _mm512_maskz_4dpwssd_epi32(k, src, a0, a1, a2, a3, b) simde_mm512_maskz_4dpwssd_epi32(k, src, a0, a1, a2, a3, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_4DPWSSD_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/4dpwssds.h b/ffi-deps/simde/simde/x86/avx512/4dpwssds.h deleted file mode 100644 index ef8cf97..0000000 --- a/ffi-deps/simde/simde/x86/avx512/4dpwssds.h +++ /dev/null @@ -1,67 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_4DPWSSDS_H) -#define SIMDE_X86_AVX512_4DPWSSDS_H - -#include "types.h" -#include "dpwssds.h" -#include "set1.h" -#include "mov.h" -#include "adds.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_4dpwssds_epi32 (simde__m512i src, simde__m512i a0, simde__m512i a1, simde__m512i a2, simde__m512i a3, simde__m128i* b) { - #if defined(SIMDE_X86_AVX5124VNNIW_NATIVE) - return _mm512_4dpwssds_epi32(src, a0, a1, a2, a3, b); - #else - simde__m128i_private bv = simde__m128i_to_private(simde_mm_loadu_epi32(b)); - simde__m512i r; - - r = simde_mm512_dpwssds_epi32(src, a0, simde_mm512_set1_epi32(bv.i32[0])); - r = simde_x_mm512_adds_epi32(simde_mm512_dpwssds_epi32(src, a1, simde_mm512_set1_epi32(bv.i32[1])), r); - r = simde_x_mm512_adds_epi32(simde_mm512_dpwssds_epi32(src, a2, simde_mm512_set1_epi32(bv.i32[2])), r); - r = simde_x_mm512_adds_epi32(simde_mm512_dpwssds_epi32(src, a3, simde_mm512_set1_epi32(bv.i32[3])), r); - - return r; - #endif -} 
-#if defined(SIMDE_X86_AVX5124VNNIW_ENABLE_NATIVE_ALIASES) - #undef simde_mm512_4dpwssds_epi32 - #define _mm512_4dpwssds_epi32(src, a0, a1, a2, a3, b) simde_mm512_4dpwssds_epi32(src, a0, a1, a2, a3, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_4dpwssds_epi32 (simde__m512i src, simde__mmask16 k, simde__m512i a0, simde__m512i a1, simde__m512i a2, simde__m512i a3, simde__m128i* b) { - #if defined(SIMDE_X86_AVX5124VNNIW_NATIVE) - return _mm512_mask_4dpwssds_epi32(src, k, a0, a1, a2, a3, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_4dpwssds_epi32(src, a0, a1, a2, a3, b)); - #endif -} -#if defined(SIMDE_X86_AVX5124VNNIW_ENABLE_NATIVE_ALIASES) - #undef simde_mm512_mask_4dpwssds_epi32 - #define _mm512_mask_4dpwssds_epi32(src, k, a0, a1, a2, a3, b) simde_mm512_mask_4dpwssds_epi32(src, k, a0, a1, a2, a3, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_4dpwssds_epi32 (simde__mmask16 k, simde__m512i src, simde__m512i a0, simde__m512i a1, simde__m512i a2, simde__m512i a3, simde__m128i* b) { - #if defined(SIMDE_X86_AVX5124VNNIW_NATIVE) - return _mm512_mask_4dpwssds_epi32(k, src, a0, a1, a2, a3, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_4dpwssds_epi32(src, a0, a1, a2, a3, b)); - #endif -} -#if defined(SIMDE_X86_AVX5124VNNIW_ENABLE_NATIVE_ALIASES) - #undef simde_mm512_maskz_4dpwssds_epi32 - #define _mm512_maskz_4dpwssds_epi32(k, src, a0, a1, a2, a3, b) simde_mm512_maskz_4dpwssds_epi32(k, src, a0, a1, a2, a3, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_4DPWSSDS_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/abs.h b/ffi-deps/simde/simde/x86/avx512/abs.h deleted file mode 100644 index 5c0871b..0000000 --- a/ffi-deps/simde/simde/x86/avx512/abs.h +++ /dev/null @@ -1,580 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_ABS_H) -#define SIMDE_X86_AVX512_ABS_H - -#include "types.h" -#include "mov.h" -#include "../avx2.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_abs_epi8(simde__m128i src, simde__mmask16 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_abs_epi8(src, k, a); - #else - return simde_mm_mask_mov_epi8(src, k, simde_mm_abs_epi8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_abs_epi8 - #define _mm_mask_abs_epi8(src, k, a) simde_mm_mask_abs_epi8(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_abs_epi8(simde__mmask16 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_maskz_abs_epi8(k, a); - #else - return simde_mm_maskz_mov_epi8(k, simde_mm_abs_epi8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_abs_epi8 - #define _mm_maskz_abs_epi8(k, a) simde_mm_maskz_abs_epi8(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_abs_epi16(simde__m128i src, simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_abs_epi16(src, k, a); - #else - return simde_mm_mask_mov_epi16(src, k, simde_mm_abs_epi16(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_abs_epi16 - #define _mm_mask_abs_epi16(src, k, a) simde_mm_mask_abs_epi16(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_abs_epi16(simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_maskz_abs_epi16(k, a); - #else - return simde_mm_maskz_mov_epi16(k, simde_mm_abs_epi16(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_abs_epi16 - #define _mm_maskz_abs_epi16(k, a) simde_mm_maskz_abs_epi16(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_abs_epi32(simde__m128i src, simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_abs_epi32(src, k, a); - #else - return simde_mm_mask_mov_epi32(src, k, simde_mm_abs_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_abs_epi32 - #define _mm_mask_abs_epi32(src, k, a) simde_mm_mask_abs_epi32(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_abs_epi32(simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_abs_epi32(k, a); - #else - return simde_mm_maskz_mov_epi32(k, simde_mm_abs_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_abs_epi32 - #define _mm_maskz_abs_epi32(k, a) simde_mm_maskz_abs_epi32(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_abs_epi64(simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_abs_epi64(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - const __m128i m = _mm_srai_epi32(_mm_shuffle_epi32(a, 0xF5), 31); - return _mm_sub_epi64(_mm_xor_si128(a, m), m); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i64 = vabsq_s64(a_.neon_i64); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - 
const int64x2_t m = vshrq_n_s64(a_.neon_i64, 63); - r_.neon_i64 = vsubq_s64(veorq_s64(a_.neon_i64, m), m); - #elif (defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION)) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i64 = vec_abs(a_.altivec_i64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_abs(a_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - __typeof__(r_.i64) z = { 0, }; - __typeof__(r_.i64) m = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 < z); - r_.i64 = (-a_.i64 & m) | (a_.i64 & ~m); - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) { - r_.i64[i] = (a_.i64[i] < INT64_C(0)) ? -a_.i64[i] : a_.i64[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_abs_epi64 - #define _mm_abs_epi64(a) simde_mm_abs_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_abs_epi64(simde__m128i src, simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_abs_epi64(src, k, a); - #else - return simde_mm_mask_mov_epi64(src, k, simde_mm_abs_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_abs_epi64 - #define _mm_mask_abs_epi64(src, k, a) simde_mm_mask_abs_epi64(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_abs_epi64(simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_abs_epi64(k, a); - #else - return simde_mm_maskz_mov_epi64(k, simde_mm_abs_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_abs_epi64 - #define _mm_maskz_abs_epi64(k, a) simde_mm_maskz_abs_epi64(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_abs_epi64(simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_abs_epi64(a); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_abs_epi64(a_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) { - r_.i64[i] = (a_.i64[i] < INT64_C(0)) ? 
-a_.i64[i] : a_.i64[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_abs_epi64 - #define _mm256_abs_epi64(a) simde_mm256_abs_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_abs_epi64(simde__m256i src, simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_abs_epi64(src, k, a); - #else - return simde_mm256_mask_mov_epi64(src, k, simde_mm256_abs_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_abs_epi64 - #define _mm256_mask_abs_epi64(src, k, a) simde_mm256_mask_abs_epi64(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_abs_epi64(simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_abs_epi64(k, a); - #else - return simde_mm256_maskz_mov_epi64(k, simde_mm256_abs_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_abs_epi64 - #define _mm256_maskz_abs_epi64(k, a) simde_mm256_maskz_abs_epi64(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_abs_epi8 (simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_abs_epi8(a); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_abs_epi8(a_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] < INT32_C(0)) ? -a_.i8[i] : a_.i8[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_abs_epi8 - #define _mm512_abs_epi8(a) simde_mm512_abs_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_abs_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_abs_epi8(src, k, a); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_abs_epi8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_abs_epi8 - #define _mm512_mask_abs_epi8(src, k, a) simde_mm512_mask_abs_epi8(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_abs_epi8 (simde__mmask64 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_abs_epi8(k, a); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_abs_epi8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_abs_epi8 - #define _mm512_maskz_abs_epi8(k, a) simde_mm512_maskz_abs_epi8(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_abs_epi16 (simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_abs_epi16(a); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_abs_epi16(a_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] < INT32_C(0)) ? 
-a_.i16[i] : a_.i16[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_abs_epi16 - #define _mm512_abs_epi16(a) simde_mm512_abs_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_abs_epi16 (simde__m512i src, simde__mmask32 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_abs_epi16(src, k, a); - #else - return simde_mm512_mask_mov_epi16(src, k, simde_mm512_abs_epi16(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_abs_epi16 - #define _mm512_mask_abs_epi16(src, k, a) simde_mm512_mask_abs_epi16(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_abs_epi16 (simde__mmask32 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_abs_epi16(k, a); - #else - return simde_mm512_maskz_mov_epi16(k, simde_mm512_abs_epi16(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_abs_epi16 - #define _mm512_maskz_abs_epi16(k, a) simde_mm512_maskz_abs_epi16(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_abs_epi32(simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_abs_epi32(a); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_abs_epi32(a_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { - r_.i32[i] = (a_.i32[i] < INT64_C(0)) ? -a_.i32[i] : a_.i32[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_abs_epi32 - #define _mm512_abs_epi32(a) simde_mm512_abs_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_abs_epi32(simde__m512i src, simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_abs_epi32(src, k, a); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_abs_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_abs_epi32 - #define _mm512_mask_abs_epi32(src, k, a) simde_mm512_mask_abs_epi32(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_abs_epi32(simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_abs_epi32(k, a); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_abs_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_abs_epi32 - #define _mm512_maskz_abs_epi32(k, a) simde_mm512_maskz_abs_epi32(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_abs_epi64(simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_abs_epi64(a); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_abs_epi64(a_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.i64) / sizeof(r_.i64[0])); i++) { - r_.i64[i] = (a_.i64[i] < INT64_C(0)) ? 
-a_.i64[i] : a_.i64[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_abs_epi64 - #define _mm512_abs_epi64(a) simde_mm512_abs_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_abs_epi64(simde__m512i src, simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_abs_epi64(src, k, a); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_abs_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_abs_epi64 - #define _mm512_mask_abs_epi64(src, k, a) simde_mm512_mask_abs_epi64(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_abs_epi64(simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_abs_epi64(k, a); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_abs_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_abs_epi64 - #define _mm512_maskz_abs_epi64(k, a) simde_mm512_maskz_abs_epi64(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_abs_ps(simde__m512 v2) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) - return _mm512_abs_ps(v2); - #else - simde__m512_private - r_, - v2_ = simde__m512_to_private(v2); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) { - r_.m128_private[i].neon_f32 = vabsq_f32(v2_.m128_private[i].neon_f32); - } - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) { - r_.m128_private[i].altivec_f32 = vec_abs(v2_.m128_private[i].altivec_f32); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.f32) / sizeof(r_.f32[0])); i++) { - r_.f32[i] = (v2_.f32[i] < INT64_C(0)) ? 
-v2_.f32[i] : v2_.f32[i]; - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_abs_ps - #define _mm512_abs_ps(v2) simde_mm512_abs_ps(v2) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_abs_ps(simde__m512 src, simde__mmask16 k, simde__m512 v2) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) - return _mm512_mask_abs_ps(src, k, v2); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_abs_ps(v2)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_abs_ps - #define _mm512_mask_abs_ps(src, k, v2) simde_mm512_mask_abs_ps(src, k, v2) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_abs_pd(simde__m512d v2) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,3,0)) - return _mm512_abs_pd(v2); - #elif defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) - /* gcc bug: https://gcc.gnu.org/legacy-ml/gcc-patches/2018-01/msg01962.html */ - return _mm512_abs_pd(_mm512_castpd_ps(v2)); - #else - simde__m512d_private - r_, - v2_ = simde__m512d_to_private(v2); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) { - r_.m128d_private[i].neon_f64 = vabsq_f64(v2_.m128d_private[i].neon_f64); - } - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) { - r_.m128d_private[i].altivec_f64 = vec_abs(v2_.m128d_private[i].altivec_f64); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.f64) / sizeof(r_.f64[0])); i++) { - r_.f64[i] = (v2_.f64[i] < INT64_C(0)) ? 
-v2_.f64[i] : v2_.f64[i]; - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_abs_pd - #define _mm512_abs_pd(v2) simde_mm512_abs_pd(v2) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_abs_pd(simde__m512d src, simde__mmask8 k, simde__m512d v2) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,3,0)) - return _mm512_mask_abs_pd(src, k, v2); - #elif defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) - /* gcc bug: https://gcc.gnu.org/legacy-ml/gcc-patches/2018-01/msg01962.html */ - return _mm512_mask_abs_pd(src, k, _mm512_castpd_ps(v2)); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_abs_pd(v2)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_abs_pd - #define _mm512_mask_abs_pd(src, k, v2) simde_mm512_mask_abs_pd(src, k, v2) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_ABS_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/add.h b/ffi-deps/simde/simde/x86/avx512/add.h deleted file mode 100644 index 2c4c98e..0000000 --- a/ffi-deps/simde/simde/x86/avx512/add.h +++ /dev/null @@ -1,641 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_ADD_H) -#define SIMDE_X86_AVX512_ADD_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_add_epi8(simde__m128i src, simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_add_epi8(src, k, a, b); - #else - return simde_mm_mask_mov_epi8(src, k, simde_mm_add_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_add_epi8 - #define _mm_mask_add_epi8(src, k, a, b) simde_mm_mask_add_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_add_epi8(simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_maskz_add_epi8(k, a, b); - #else - return simde_mm_maskz_mov_epi8(k, simde_mm_add_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_add_epi8 - #define _mm_maskz_add_epi8(k, a, b) simde_mm_maskz_add_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_add_epi16(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_add_epi16(src, k, a, b); - #else - return simde_mm_mask_mov_epi16(src, k, simde_mm_add_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_add_epi16 - #define _mm_mask_add_epi16(src, k, a, b) simde_mm_mask_add_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_add_epi16(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_maskz_add_epi16(k, a, b); - #else - return simde_mm_maskz_mov_epi16(k, simde_mm_add_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_add_epi16 - #define _mm_maskz_add_epi16(k, a, b) simde_mm_maskz_add_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_add_epi32(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_add_epi32(src, k, a, b); - #else - return simde_mm_mask_mov_epi32(src, k, simde_mm_add_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_add_epi32 - #define _mm_mask_add_epi32(src, k, a, b) simde_mm_mask_add_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_add_epi32(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_add_epi32(k, a, b); - #else - return simde_mm_maskz_mov_epi32(k, simde_mm_add_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_add_epi32 - #define _mm_maskz_add_epi32(k, a, b) simde_mm_maskz_add_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_add_epi64(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_add_epi64(src, k, a, b); - #else - return simde_mm_mask_mov_epi64(src, k, simde_mm_add_epi64(a, b)); - #endif -} -#if 
defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_add_epi64 - #define _mm_mask_add_epi64(src, k, a, b) simde_mm_mask_add_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_add_epi64(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_add_epi64(k, a, b); - #else - return simde_mm_maskz_mov_epi64(k, simde_mm_add_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_add_epi64 - #define _mm_maskz_add_epi64(k, a, b) simde_mm_maskz_add_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mask_add_ss(simde__m128 src, simde__mmask8 k, simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,1,0)) - return _mm_mask_add_ss(src, k, a, b); - #elif 1 - simde__m128_private - src_ = simde__m128_to_private(src), - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b), - r_ = simde__m128_to_private(a); - - r_.f32[0] = (k & 1) ? (a_.f32[0] + b_.f32[0]) : src_.f32[0]; - - return simde__m128_from_private(r_); - #else - return simde_mm_move_ss(a, simde_mm_mask_mov_ps(src, k, simde_mm_add_ps(a, b))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_add_ss - #define _mm_mask_add_ss(src, k, a, b) simde_mm_mask_add_ss(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_maskz_add_ss(simde__mmask8 k, simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,1,0)) - return _mm_maskz_add_ss(k, a, b); - #elif 1 - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b), - r_ = simde__m128_to_private(a); - - r_.f32[0] = (k & 1) ? 
(a_.f32[0] + b_.f32[0]) : 0.0f; - - return simde__m128_from_private(r_); - #else - return simde_mm_move_ss(a, simde_mm_maskz_mov_ps(k, simde_mm_add_ps(a, b))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_add_ss - #define _mm_maskz_add_ss(k, a, b) simde_mm_maskz_add_ss(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_add_epi16(simde__m256i src, simde__mmask16 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_add_epi16(src, k, a, b); - #else - return simde_mm256_mask_mov_epi16(src, k, simde_mm256_add_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_add_epi16 - #define _mm256_mask_add_epi16(src, k, a, b) simde_mm256_mask_add_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_add_epi16(simde__mmask16 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_maskz_add_epi16(k, a, b); - #else - return simde_mm256_maskz_mov_epi16(k, simde_mm256_add_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_add_epi16 - #define _mm256_maskz_add_epi16(k, a, b) simde_mm256_maskz_add_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_add_epi32(simde__m256i src, simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_add_epi32(src, k, a, b); - #else - return simde_mm256_mask_mov_epi32(src, k, simde_mm256_add_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_add_epi32 - #define _mm256_mask_add_epi32(src, k, a, b) simde_mm256_mask_add_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_add_epi32(simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_add_epi32(k, a, b); - #else - return simde_mm256_maskz_mov_epi32(k, simde_mm256_add_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_add_epi32 - #define _mm256_maskz_add_epi32(k, a, b) simde_mm256_maskz_add_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_add_epi64(simde__m256i src, simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_add_epi64(src, k, a, b); - #else - return simde_mm256_mask_mov_epi64(src, k, simde_mm256_add_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_add_epi64 - #define _mm256_mask_add_epi64(src, k, a, b) simde_mm256_mask_add_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_add_epi64(simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_add_epi64(k, a, b); - #else - return simde_mm256_maskz_mov_epi64(k, simde_mm256_add_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_add_epi64 - #define _mm256_maskz_add_epi64(k, a, b) simde_mm256_maskz_add_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_add_epi8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_add_epi8(a, b); - #else - simde__m512i_private - r_, - a_ = 
simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = a_.i8 + b_.i8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_add_epi8(a_.m256i[i], b_.m256i[i]); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_add_epi8 - #define _mm512_add_epi8(a, b) simde_mm512_add_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_add_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_add_epi8(src, k, a, b); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_add_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_add_epi8 - #define _mm512_mask_add_epi8(src, k, a, b) simde_mm512_mask_add_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_add_epi8 (simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_add_epi8(k, a, b); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_add_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_add_epi8 - #define _mm512_maskz_add_epi8(k, a, b) simde_mm512_maskz_add_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_add_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_add_epi16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 + b_.i16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_add_epi16(a_.m256i[i], b_.m256i[i]); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_add_epi16 - #define _mm512_add_epi16(a, b) simde_mm512_add_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_add_epi16 (simde__m512i src, simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_add_epi16(src, k, a, b); - #else - return simde_mm512_mask_mov_epi16(src, k, simde_mm512_add_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_add_epi16 - #define _mm512_mask_add_epi16(src, k, a, b) simde_mm512_mask_add_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_add_epi16 (simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_add_epi16(k, a, b); - #else - return simde_mm512_maskz_mov_epi16(k, simde_mm512_add_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_add_epi16 - #define _mm512_maskz_add_epi16(k, a, b) simde_mm512_maskz_add_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_add_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_add_epi32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_ARM_SVE_NATIVE) - const size_t n = sizeof(a_.i32) / 
sizeof(a_.i32[0]); - size_t i = 0; - svbool_t pg = svwhilelt_b32(i, n); - do { - svint32_t - va = svld1_s32(pg, &(a_.i32[i])), - vb = svld1_s32(pg, &(b_.i32[i])); - svst1_s32(pg, &(r_.i32[i]), svadd_s32_x(pg, va, vb)); - i += svcntw(); - pg = svwhilelt_b32(i, n); - } while (svptest_any(svptrue_b32(), pg)); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_add_epi32(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 + b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_add_epi32(a_.m256i[i], b_.m256i[i]); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_add_epi32 - #define _mm512_add_epi32(a, b) simde_mm512_add_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_add_epi32(simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_add_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_add_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_add_epi32 - #define _mm512_mask_add_epi32(src, k, a, b) simde_mm512_mask_add_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_add_epi32(simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_add_epi32(k, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_add_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_add_epi32 - #define _mm512_maskz_add_epi32(k, a, b) simde_mm512_maskz_add_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_add_epi64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_add_epi64(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_add_epi64(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_CLANG_BAD_VI64_OPS) - r_.i64 = a_.i64 + b_.i64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_add_epi64(a_.m256i[i], b_.m256i[i]); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_add_epi64 - #define _mm512_add_epi64(a, b) simde_mm512_add_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_add_epi64(simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_add_epi64(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_add_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_add_epi64 - #define _mm512_mask_add_epi64(src, k, a, b) simde_mm512_mask_add_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_add_epi64(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - 
return _mm512_maskz_add_epi64(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_add_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_add_epi64 - #define _mm512_maskz_add_epi64(k, a, b) simde_mm512_maskz_add_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_add_ps (simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_add_ps(a, b); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 + b_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_add_ps(a_.m256[i], b_.m256[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_add_ps - #define _mm512_add_ps(a, b) simde_mm512_add_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_add_ps(simde__m512 src, simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_add_ps(src, k, a, b); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_add_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_add_ps - #define _mm512_mask_add_ps(src, k, a, b) simde_mm512_mask_add_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_add_ps(simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_add_ps(k, a, b); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_add_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_add_ps - #define _mm512_maskz_add_ps(k, a, b) simde_mm512_maskz_add_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_add_pd (simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_add_pd(a, b); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 + b_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_add_pd(a_.m256d[i], b_.m256d[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_add_pd - #define _mm512_add_pd(a, b) simde_mm512_add_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_add_pd(simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_add_pd(src, k, a, b); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_add_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_add_pd - #define _mm512_mask_add_pd(src, k, a, b) simde_mm512_mask_add_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_add_pd(simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_add_pd(k, a, b); - #else - return simde_mm512_maskz_mov_pd(k, simde_mm512_add_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_add_pd - #define _mm512_maskz_add_pd(k, a, b) 
simde_mm512_maskz_add_pd(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_ADD_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/adds.h b/ffi-deps/simde/simde/x86/avx512/adds.h deleted file mode 100644 index 64abffa..0000000 --- a/ffi-deps/simde/simde/x86/avx512/adds.h +++ /dev/null @@ -1,529 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - */ - -#if !defined(SIMDE_X86_AVX512_ADDS_H) -#define SIMDE_X86_AVX512_ADDS_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_adds_epi8(simde__m128i src, simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_adds_epi8(src, k, a, b); - #else - return simde_mm_mask_mov_epi8(src, k, simde_mm_adds_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_adds_epi8 - #define _mm_mask_adds_epi8(src, k, a, b) simde_mm_mask_adds_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_adds_epi8(simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_maskz_adds_epi8(k, a, b); - #else - return simde_mm_maskz_mov_epi8(k, simde_mm_adds_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_adds_epi8 - #define _mm_maskz_adds_epi8(k, a, b) simde_mm_maskz_adds_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_adds_epi16(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_adds_epi16(src, k, a, b); - #else - return simde_mm_mask_mov_epi16(src, k, simde_mm_adds_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_adds_epi16 - #define _mm_mask_adds_epi16(src, k, a, b) simde_mm_mask_adds_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_adds_epi16(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_maskz_adds_epi16(k, a, b); - #else 
- return simde_mm_maskz_mov_epi16(k, simde_mm_adds_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_adds_epi16 - #define _mm_maskz_adds_epi16(k, a, b) simde_mm_maskz_adds_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_adds_epi8(simde__m256i src, simde__mmask32 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_adds_epi8(src, k, a, b); - #else - return simde_mm256_mask_mov_epi8(src, k, simde_mm256_adds_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_adds_epi8 - #define _mm256_mask_adds_epi8(src, k, a, b) simde_mm256_mask_adds_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_adds_epi8(simde__mmask32 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_maskz_adds_epi8(k, a, b); - #else - return simde_mm256_maskz_mov_epi8(k, simde_mm256_adds_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_adds_epi8 - #define _mm256_maskz_adds_epi8(k, a, b) simde_mm256_maskz_adds_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_adds_epi16(simde__m256i src, simde__mmask16 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_adds_epi16(src, k, a, b); - #else - return simde_mm256_mask_mov_epi16(src, k, simde_mm256_adds_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_adds_epi16 - #define _mm256_mask_adds_epi16(src, k, a, b) simde_mm256_mask_adds_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_adds_epi16(simde__mmask16 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_maskz_adds_epi16(k, a, b); - #else - return simde_mm256_maskz_mov_epi16(k, simde_mm256_adds_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_adds_epi16 - #define _mm256_maskz_adds_epi16(k, a, b) simde_mm256_maskz_adds_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_adds_epi8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_adds_epi8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if !defined(HEDLEY_INTEL_VERSION) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_adds_epi8(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = simde_math_adds_i8(a_.i8[i], b_.i8[i]); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_adds_epi8 - #define _mm512_adds_epi8(a, b) simde_mm512_adds_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_adds_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_adds_epi8(src, k, a, b); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_adds_epi8(a, b)); - #endif -} -#if 
defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_adds_epi8 - #define _mm512_mask_adds_epi8(src, k, a, b) simde_mm512_mask_adds_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_adds_epi8 (simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_adds_epi8(k, a, b); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_adds_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_adds_epi8 - #define _mm512_maskz_adds_epi8(k, a, b) simde_mm512_maskz_adds_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_adds_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_adds_epi16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if !defined(HEDLEY_INTEL_VERSION) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_adds_epi16(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = simde_math_adds_i16(a_.i16[i], b_.i16[i]); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_adds_epi16 - #define _mm512_adds_epi16(a, b) simde_mm512_adds_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_adds_epi16 (simde__m512i src, simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_adds_epi16(src, k, a, b); - #else - return simde_mm512_mask_mov_epi16(src, k, simde_mm512_adds_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_adds_epi16 - #define _mm512_mask_adds_epi16(src, k, a, b) simde_mm512_mask_adds_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_adds_epi16 (simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_adds_epi16(k, a, b); - #else - return simde_mm512_maskz_mov_epi16(k, simde_mm512_adds_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_adds_epi16 - #define _mm512_maskz_adds_epi16(k, a, b) simde_mm512_maskz_adds_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_adds_epu8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_adds_epu8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if !defined(HEDLEY_INTEL_VERSION) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_adds_epu8(a_.m128i[i], b_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = simde_math_adds_u8(a_.u8[i], b_.u8[i]); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_adds_epu8 - #define _mm512_adds_epu8(a, b) simde_mm512_adds_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_adds_epu8 (simde__m512i src, simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return 
_mm512_mask_adds_epu8(src, k, a, b); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_adds_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_adds_epu8 - #define _mm512_mask_adds_epu8(src, k, a, b) simde_mm512_mask_adds_epu8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_adds_epu8 (simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_adds_epu8(k, a, b); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_adds_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_adds_epu8 - #define _mm512_maskz_adds_epu8(k, a, b) simde_mm512_maskz_adds_epu8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_adds_epu16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_adds_epu16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if !defined(HEDLEY_INTEL_VERSION) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_adds_epu16(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = simde_math_adds_u16(a_.u16[i], b_.u16[i]); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_adds_epu16 - #define _mm512_adds_epu16(a, b) simde_mm512_adds_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_adds_epu16 (simde__m512i src, simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_adds_epu16(src, k, a, b); - #else - return simde_mm512_mask_mov_epi16(src, k, simde_mm512_adds_epu16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_adds_epu16 - #define _mm512_mask_adds_epu16(src, k, a, b) simde_mm512_mask_adds_epu16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_adds_epu16 (simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_adds_epu16(k, a, b); - #else - return simde_mm512_maskz_mov_epi16(k, simde_mm512_adds_epu16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_adds_epu16 - #define _mm512_maskz_adds_epu16(k, a, b) simde_mm512_maskz_adds_epu16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_adds_epi32(simde__m128i a, simde__m128i b) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vqaddq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6) - r_.altivec_i32 = vec_adds(a_.altivec_i32, b_.altivec_i32); - #else - #if defined(SIMDE_X86_SSE2_NATIVE) - /* https://stackoverflow.com/a/56544654/501126 */ - const __m128i int_max = _mm_set1_epi32(INT32_MAX); - - /* normal result (possibly wraps around) */ - const __m128i sum = _mm_add_epi32(a_.n, b_.n); - - /* If result saturates, it has the same sign as both a and b */ - const __m128i sign_bit = _mm_srli_epi32(a_.n, 31); /* shift sign to lowest bit */ - - #if defined(SIMDE_X86_AVX512VL_NATIVE) - const __m128i overflow = _mm_ternarylogic_epi32(a_.n, b_.n, sum, 0x42); - #else 
- const __m128i sign_xor = _mm_xor_si128(a_.n, b_.n); - const __m128i overflow = _mm_andnot_si128(sign_xor, _mm_xor_si128(a_.n, sum)); - #endif - - #if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - r_.n = _mm_mask_add_epi32(sum, _mm_movepi32_mask(overflow), int_max, sign_bit); - #else - const __m128i saturated = _mm_add_epi32(int_max, sign_bit); - - #if defined(SIMDE_X86_SSE4_1_NATIVE) - r_.n = - _mm_castps_si128( - _mm_blendv_ps( - _mm_castsi128_ps(sum), - _mm_castsi128_ps(saturated), - _mm_castsi128_ps(overflow) - ) - ); - #else - const __m128i overflow_mask = _mm_srai_epi32(overflow, 31); - r_.n = - _mm_or_si128( - _mm_and_si128(overflow_mask, saturated), - _mm_andnot_si128(overflow_mask, sum) - ); - #endif - #endif - #elif defined(SIMDE_VECTOR_SCALAR) - uint32_t au SIMDE_VECTOR(16) = HEDLEY_REINTERPRET_CAST(__typeof__(au), a_.i32); - uint32_t bu SIMDE_VECTOR(16) = HEDLEY_REINTERPRET_CAST(__typeof__(bu), b_.i32); - uint32_t ru SIMDE_VECTOR(16) = au + bu; - - au = (au >> 31) + INT32_MAX; - - uint32_t m SIMDE_VECTOR(16) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (au ^ bu) | ~(bu ^ ru)) < 0); - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (au & ~m) | (ru & m)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = simde_math_adds_i32(a_.i32[i], b_.i32[i]); - } - #endif - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_adds_epi32(simde__m256i a, simde__m256i b) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_adds_epi32(a_.m128i[i], b_.m128i[i]); - } - #elif defined(SIMDE_VECTOR_SCALAR) - uint32_t au SIMDE_VECTOR(32) = HEDLEY_REINTERPRET_CAST(__typeof__(au), a_.i32); - uint32_t bu SIMDE_VECTOR(32) = HEDLEY_REINTERPRET_CAST(__typeof__(bu), b_.i32); - uint32_t ru SIMDE_VECTOR(32) = au + bu; - - au = (au >> 31) + INT32_MAX; - - uint32_t m SIMDE_VECTOR(32) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (au ^ bu) | ~(bu ^ ru)) < 0); - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (au & ~m) | (ru & m)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = simde_math_adds_i32(a_.i32[i], b_.i32[i]); - } - #endif - - return simde__m256i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_adds_epi32(simde__m512i a, simde__m512i b) { - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_adds_epi32(a_.m128i[i], b_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_adds_epi32(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SCALAR) - uint32_t au SIMDE_VECTOR(64) = HEDLEY_REINTERPRET_CAST(__typeof__(au), a_.i32); - uint32_t bu SIMDE_VECTOR(64) = HEDLEY_REINTERPRET_CAST(__typeof__(bu), b_.i32); - uint32_t ru SIMDE_VECTOR(64) = au + bu; - - au = (au >> 31) + INT32_MAX; - - uint32_t m SIMDE_VECTOR(64) = HEDLEY_REINTERPRET_CAST(__typeof__(m), 
HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (au ^ bu) | ~(bu ^ ru)) < 0); - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (au & ~m) | (ru & m)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = simde_math_adds_i32(a_.i32[i], b_.i32[i]); - } - #endif - - return simde__m512i_from_private(r_); -} - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_ADDS_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/and.h b/ffi-deps/simde/simde/x86/avx512/and.h deleted file mode 100644 index fd7118f..0000000 --- a/ffi-deps/simde/simde/x86/avx512/and.h +++ /dev/null @@ -1,305 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
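The adds.h fallback removed above emulates saturating signed 32-bit addition without branches: it forms the wrapping sum, derives the saturation value (INT32_MAX or INT32_MIN) from the sign of a, and chooses between the two purely from sign bits. The standalone C sketch below mirrors that per-lane logic; the name adds_i32 and the small test driver are illustrative additions, not code from simde or from this patch.

#include <stdint.h>
#include <stdio.h>

/* Branchless saturating signed 32-bit add, mirroring the per-lane logic of the
 * removed simde_x_mm_adds_epi32 fallback (illustrative sketch, not patch code). */
static int32_t adds_i32(int32_t a, int32_t b)
{
    uint32_t au = (uint32_t)a;
    uint32_t bu = (uint32_t)b;
    uint32_t ru = au + bu;                           /* wrapping sum */
    uint32_t sat = (au >> 31) + (uint32_t)INT32_MAX; /* INT32_MAX if a >= 0, else INT32_MIN */

    /* No overflow iff a and b differ in sign, or the sum keeps b's sign. */
    uint32_t keep = (uint32_t)-(int32_t)(((sat ^ bu) | ~(bu ^ ru)) >> 31);

    return (int32_t)((sat & ~keep) | (ru & keep));
}

int main(void)
{
    printf("%d\n", adds_i32(INT32_MAX, 1));  /* 2147483647: saturates high */
    printf("%d\n", adds_i32(INT32_MIN, -1)); /* -2147483648: saturates low */
    printf("%d\n", adds_i32(40, 2));         /* 42: no overflow, plain sum */
    return 0;
}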
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_AND_H) -#define SIMDE_X86_AVX512_AND_H - -#include "types.h" -#include "../avx2.h" - -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_and_pd (simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_and_pd(a, b); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b); - - #if defined(SIMDE_X86_AVX_NATIVE) - r_.m256d[0] = simde_mm256_and_pd(a_.m256d[0], b_.m256d[0]); - r_.m256d[1] = simde_mm256_and_pd(a_.m256d[1], b_.m256d[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f & b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] & b_.i32f[i]; - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_and_pd - #define _mm512_and_pd(a, b) simde_mm512_and_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_and_ps (simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_and_ps(a, b); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b); - - #if defined(SIMDE_X86_AVX_NATIVE) - r_.m256[0] = simde_mm256_and_ps(a_.m256[0], b_.m256[0]); - r_.m256[1] = simde_mm256_and_ps(a_.m256[1], b_.m256[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f & b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] & b_.i32f[i]; - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_and_ps - #define _mm512_and_ps(a, b) simde_mm512_and_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_and_ps(simde__m512 src, simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_mask_and_ps(src, k, a, b); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_and_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_and_ps - #define _mm512_mask_and_ps(src, k, a, b) simde_mm512_mask_and_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_and_ps(simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_maskz_and_ps(k, a, b); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_and_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_and_ps - #define _mm512_maskz_and_ps(k, a, b) simde_mm512_maskz_and_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_and_pd(simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_mask_and_pd(src, k, a, b); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_and_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_and_pd - #define _mm512_mask_and_pd(src, k, a, b) simde_mm512_mask_and_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_and_pd(simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if 
defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_maskz_and_pd(k, a, b); - #else - return simde_mm512_maskz_mov_pd(k, simde_mm512_and_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_and_pd - #define _mm512_maskz_and_pd(k, a, b) simde_mm512_maskz_and_pd(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_and_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_and_epi32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 & b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] & b_.i32[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_and_epi32 - #define _mm512_and_epi32(a, b) simde_mm512_and_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_and_epi32(simde__m512i src, simde__mmask16 k, simde__m512i v2, simde__m512i v3) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_and_epi32(src, k, v2, v3); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_and_epi32(v2, v3)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_and_epi32 - #define _mm512_mask_and_epi32(src, k, v2, v3) simde_mm512_mask_and_epi32(src, k, v2, v3) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_and_epi32(simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_and_epi32(k, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_and_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_and_epi32 - #define _mm512_maskz_and_epi32(k, a, b) simde_mm512_maskz_and_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_and_epi64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_and_epi64(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 & b_.i64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] & b_.i64[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_and_epi64 - #define _mm512_and_epi64(a, b) simde_mm512_and_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_and_epi64(simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_and_epi64(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_and_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_and_epi64 - #define _mm512_mask_and_epi64(src, k, a, b) simde_mm512_mask_and_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_and_epi64(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_and_epi64(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_and_epi64(a, b)); - #endif -} -#if 
defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_and_epi64 - #define _mm512_maskz_and_epi64(k, a, b) simde_mm512_maskz_and_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_and_si512 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_and_si512(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_X86_AVX2_NATIVE) - r_.m256i[0] = simde_mm256_and_si256(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_and_si256(a_.m256i[1], b_.m256i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f & b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] & b_.i32[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_and_si512 - #define _mm512_and_si512(a, b) simde_mm512_and_si512(a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_AND_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/andnot.h b/ffi-deps/simde/simde/x86/avx512/andnot.h deleted file mode 100644 index ddc3dcb..0000000 --- a/ffi-deps/simde/simde/x86/avx512/andnot.h +++ /dev/null @@ -1,193 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
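Almost every mask_/maskz_ entry point in the headers this patch deletes is built the same way: compute the full unmasked result, then merge it with src (mask) or with zero (maskz) through the corresponding mov helper. A minimal scalar sketch of that merge pattern, assuming a hypothetical 4-lane width and the made-up names mask_mov_epi32 / mask_add_epi32:

#include <stdint.h>
#include <stdio.h>

#define LANES 4 /* illustrative width; the real helpers cover 128/256/512-bit vectors */

/* Merge v into src lane-by-lane under mask k: the core of the mask_mov helpers. */
static void mask_mov_epi32(int32_t dst[LANES], const int32_t src[LANES],
                           uint8_t k, const int32_t v[LANES])
{
    for (int i = 0; i < LANES; i++)
        dst[i] = ((k >> i) & 1) ? v[i] : src[i];
}

/* Masked add, built the way the removed headers build it:
 * full unmasked operation first, then a masked merge. */
static void mask_add_epi32(int32_t dst[LANES], const int32_t src[LANES], uint8_t k,
                           const int32_t a[LANES], const int32_t b[LANES])
{
    int32_t sum[LANES];
    for (int i = 0; i < LANES; i++)
        sum[i] = a[i] + b[i];
    mask_mov_epi32(dst, src, k, sum);
}

int main(void)
{
    const int32_t src[LANES] = {9, 9, 9, 9};
    const int32_t a[LANES] = {1, 2, 3, 4}, b[LANES] = {10, 20, 30, 40};
    int32_t r[LANES];
    mask_add_epi32(r, src, 0x5, a, b);               /* mask selects lanes 0 and 2 */
    printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]); /* 11 9 33 9 */
    return 0;
}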
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - */ - -#if !defined(SIMDE_X86_AVX512_ANDNOT_H) -#define SIMDE_X86_AVX512_ANDNOT_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_andnot_ps(a, b) _mm512_andnot_ps(a, b) -#else - #define simde_mm512_andnot_ps(a, b) simde_mm512_castsi512_ps(simde_mm512_andnot_si512(simde_mm512_castps_si512(a), simde_mm512_castps_si512(b))) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_andnot_ps - #define _mm512_andnot_ps(a, b) simde_mm512_andnot_ps(a, b) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_mask_andnot_ps(src, k, a, b) _mm512_mask_andnot_ps((src), (k), (a), (b)) -#else - #define simde_mm512_mask_andnot_ps(src, k, a, b) simde_mm512_castsi512_ps(simde_mm512_mask_andnot_epi32(simde_mm512_castps_si512(src), k, simde_mm512_castps_si512(a), simde_mm512_castps_si512(b))) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_andnot_ps - #define _mm512_mask_andnot_ps(src, k, a, b) simde_mm512_mask_andnot_ps(src, k, a, b) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_maskz_andnot_ps(k, a, b) _mm512_maskz_andnot_ps((k), (a), (b)) -#else - #define simde_mm512_maskz_andnot_ps(k, a, b) simde_mm512_castsi512_ps(simde_mm512_maskz_andnot_epi32(k, simde_mm512_castps_si512(a), simde_mm512_castps_si512(b))) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_andnot_ps - #define _mm512_maskz_andnot_ps(k, a, b) simde_mm512_maskz_andnot_ps(k, a, b) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_andnot_pd(a, b) _mm512_andnot_pd(a, b) -#else - #define simde_mm512_andnot_pd(a, b) simde_mm512_castsi512_pd(simde_mm512_andnot_si512(simde_mm512_castpd_si512(a), simde_mm512_castpd_si512(b))) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_andnot_pd - #define _mm512_andnot_pd(a, b) simde_mm512_andnot_pd(a, b) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_mask_andnot_pd(src, k, a, b) _mm512_mask_andnot_pd((src), (k), (a), (b)) -#else - #define simde_mm512_mask_andnot_pd(src, k, a, b) simde_mm512_castsi512_pd(simde_mm512_mask_andnot_epi64(simde_mm512_castpd_si512(src), k, simde_mm512_castpd_si512(a), simde_mm512_castpd_si512(b))) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_andnot_pd - #define _mm512_mask_andnot_pd(src, k, a, b) simde_mm512_mask_andnot_pd(src, k, a, b) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_maskz_andnot_pd(k, a, b) _mm512_maskz_andnot_pd((k), (a), (b)) -#else - #define simde_mm512_maskz_andnot_pd(k, a, b) simde_mm512_castsi512_pd(simde_mm512_maskz_andnot_epi64(k, simde_mm512_castpd_si512(a), simde_mm512_castpd_si512(b))) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_andnot_pd - #define _mm512_maskz_andnot_pd(k, a, b) simde_mm512_maskz_andnot_pd(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_andnot_si512 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_andnot_si512(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_X86_AVX2_NATIVE) - r_.m256i[0] = simde_mm256_andnot_si256(a_.m256i[0], b_.m256i[0]); - 
r_.m256i[1] = simde_mm256_andnot_si256(a_.m256i[1], b_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#define simde_mm512_andnot_epi32(a, b) simde_mm512_andnot_si512(a, b) -#define simde_mm512_andnot_epi64(a, b) simde_mm512_andnot_si512(a, b) -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_andnot_si512 - #define _mm512_andnot_si512(a, b) simde_mm512_andnot_si512(a, b) - #undef _mm512_andnot_epi32 - #define _mm512_andnot_epi32(a, b) simde_mm512_andnot_si512(a, b) - #undef _mm512_andnot_epi64 - #define _mm512_andnot_epi64(a, b) simde_mm512_andnot_si512(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_andnot_epi32(simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_andnot_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_andnot_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_andnot_epi32 - #define _mm512_mask_andnot_epi32(src, k, a, b) simde_mm512_mask_andnot_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_andnot_epi32(simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_andnot_epi32(k, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_andnot_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_andnot_epi32 - #define _mm512_maskz_andnot_epi32(k, a, b) simde_mm512_maskz_andnot_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_andnot_epi64(simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_andnot_epi64(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_andnot_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_andnot_epi64 - #define _mm512_mask_andnot_epi64(src, k, a, b) simde_mm512_mask_andnot_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_andnot_epi64(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_andnot_epi64(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_andnot_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_andnot_epi64 - #define _mm512_maskz_andnot_epi64(k, a, b) simde_mm512_maskz_andnot_epi64(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_ANDNOT_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/avg.h b/ffi-deps/simde/simde/x86/avx512/avg.h deleted file mode 100644 index 2ec3441..0000000 --- a/ffi-deps/simde/simde/x86/avx512/avg.h +++ /dev/null @@ -1,258 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, 
subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_AVG_H) -#define SIMDE_X86_AVX512_AVG_H - -#include "types.h" -#include "mov.h" -#include "../avx2.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_avg_epu8(simde__m128i src, simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_avg_epu8(src, k, a, b); - #else - return simde_mm_mask_mov_epi8(src, k, simde_mm_avg_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_avg_epu8 - #define _mm_mask_avg_epu8(src, k, a, b) simde_mm_mask_avg_epu8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_avg_epu8(simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_maskz_avg_epu8(k, a, b); - #else - return simde_mm_maskz_mov_epi8(k, simde_mm_avg_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_avg_epu8 - #define _mm_maskz_avg_epu8(k, a, b) simde_mm_maskz_avg_epu8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_avg_epu16(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_avg_epu16(src, k, a, b); - #else - return simde_mm_mask_mov_epi16(src, k, simde_mm_avg_epu16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_avg_epu16 - #define _mm_mask_avg_epu16(src, k, a, b) simde_mm_mask_avg_epu16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_avg_epu16(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_maskz_avg_epu16(k, a, b); - #else - return simde_mm_maskz_mov_epi16(k, simde_mm_avg_epu16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_avg_epu16 - #define _mm_maskz_avg_epu16(k, a, b) simde_mm_maskz_avg_epu16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_avg_epu8(simde__m256i src, simde__mmask32 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_avg_epu8(src, k, a, b); - #else - return simde_mm256_mask_mov_epi8(src, k, simde_mm256_avg_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_avg_epu8 - #define _mm256_mask_avg_epu8(src, k, a, b) simde_mm256_mask_avg_epu8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i 
-simde_mm256_maskz_avg_epu8(simde__mmask32 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_maskz_avg_epu8(k, a, b); - #else - return simde_mm256_maskz_mov_epi8(k, simde_mm256_avg_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_avg_epu8 - #define _mm256_maskz_avg_epu8(k, a, b) simde_mm256_maskz_avg_epu8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_avg_epu16(simde__m256i src, simde__mmask16 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_avg_epu16(src, k, a, b); - #else - return simde_mm256_mask_mov_epi16(src, k, simde_mm256_avg_epu16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_avg_epu16 - #define _mm256_mask_avg_epu16(src, k, a, b) simde_mm256_mask_avg_epu16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_avg_epu16(simde__mmask16 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_maskz_avg_epu16(k, a, b); - #else - return simde_mm256_maskz_mov_epi16(k, simde_mm256_avg_epu16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_avg_epu16 - #define _mm256_maskz_avg_epu16(k, a, b) simde_mm256_maskz_avg_epu16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_avg_epu8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_avg_epu8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_avg_epu8 - #define _mm512_avg_epu8(a, b) simde_mm512_avg_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_avg_epu8 (simde__m512i src, simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_avg_epu8(src, k, a, b); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_avg_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_avg_epu8 - #define _mm512_mask_avg_epu8(src, k, a, b) simde_mm512_mask_avg_epu8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_avg_epu8 (simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_avg_epu8(k, a, b); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_avg_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_avg_epu8 - #define _mm512_maskz_avg_epu8(k, a, b) simde_mm512_maskz_avg_epu8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_avg_epu16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_avg_epu16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1; - } - - return 
simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_avg_epu16 - #define _mm512_avg_epu16(a, b) simde_mm512_avg_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_avg_epu16 (simde__m512i src, simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_avg_epu16(src, k, a, b); - #else - return simde_mm512_mask_mov_epi16(src, k, simde_mm512_avg_epu16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_avg_epu16 - #define _mm512_mask_avg_epu16(src, k, a, b) simde_mm512_mask_avg_epu16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_avg_epu16 (simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_avg_epu16(k, a, b); - #else - return simde_mm512_maskz_mov_epi16(k, simde_mm512_avg_epu16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_avg_epu16 - #define _mm512_maskz_avg_epu16(k, a, b) simde_mm512_maskz_avg_epu16(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_AVG_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/bitshuffle.h b/ffi-deps/simde/simde/x86/avx512/bitshuffle.h deleted file mode 100644 index 05f4b5c..0000000 --- a/ffi-deps/simde/simde/x86/avx512/bitshuffle.h +++ /dev/null @@ -1,202 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_BITSHUFFLE_H) -#define SIMDE_X86_AVX512_BITSHUFFLE_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm_bitshuffle_epi64_mask (simde__m128i b, simde__m128i c) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_bitshuffle_epi64_mask(b, c); - #else - simde__m128i_private - b_ = simde__m128i_to_private(b), - c_ = simde__m128i_to_private(c); - simde__mmask16 r = 0; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - __typeof__(b_.u64) rv = { 0, 0 }; - __typeof__(b_.u64) lshift = { 0, 8 }; - - for (int8_t i = 0 ; i < 8 ; i++) { - __typeof__(b_.u64) ct = (HEDLEY_REINTERPRET_CAST(__typeof__(ct), c_.u8) >> (i * 8)) & 63; - rv |= ((b_.u64 >> ct) & 1) << lshift; - lshift += 1; - } - - r = - HEDLEY_STATIC_CAST(simde__mmask16, rv[0]) | - HEDLEY_STATIC_CAST(simde__mmask16, rv[1]); - #else - for (size_t i = 0 ; i < (sizeof(c_.m64_private) / sizeof(c_.m64_private[0])) ; i++) { - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t j = 0 ; j < (sizeof(c_.m64_private[i].u8) / sizeof(c_.m64_private[i].u8[0])) ; j++) { - r |= (((b_.u64[i] >> (c_.m64_private[i].u8[j]) & 63) & 1) << ((i * 8) + j)); - } - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_bitshuffle_epi64_mask - #define _mm_bitshuffle_epi64_mask(b, c) simde_mm_bitshuffle_epi64_mask(b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm_mask_bitshuffle_epi64_mask (simde__mmask16 k, simde__m128i b, simde__m128i c) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_bitshuffle_epi64_mask(k, b, c); - #else - return (k & simde_mm_bitshuffle_epi64_mask(b, c)); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_bitshuffle_epi64_mask - #define 
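The avg.h fallback removed above computes the rounded unsigned average of each lane as (a + b + 1) >> 1, widening the sum so the carry is not lost. A scalar sketch (avg_u8 is an illustrative name, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Rounded unsigned average per 8-bit lane, as the removed avg.h fallback
 * computes it: (a + b + 1) >> 1 with the sum formed in a wider type. */
static uint8_t avg_u8(uint8_t a, uint8_t b)
{
    return (uint8_t)(((uint16_t)a + (uint16_t)b + 1) >> 1);
}

int main(void)
{
    printf("%d\n", avg_u8(255, 255)); /* 255 */
    printf("%d\n", avg_u8(10, 13));   /* 12: rounds the 11.5 midpoint up */
    return 0;
}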
_mm_mask_bitshuffle_epi64_mask(k, b, c) simde_mm_mask_bitshuffle_epi64_mask(k, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm256_bitshuffle_epi64_mask (simde__m256i b, simde__m256i c) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_bitshuffle_epi64_mask(b, c); - #else - simde__m256i_private - b_ = simde__m256i_to_private(b), - c_ = simde__m256i_to_private(c); - simde__mmask32 r = 0; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < sizeof(b_.m128i) / sizeof(b_.m128i[0]) ; i++) { - r |= (HEDLEY_STATIC_CAST(simde__mmask32, simde_mm_bitshuffle_epi64_mask(b_.m128i[i], c_.m128i[i])) << (i * 16)); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - __typeof__(b_.u64) rv = { 0, 0, 0, 0 }; - __typeof__(b_.u64) lshift = { 0, 8, 16, 24 }; - - for (int8_t i = 0 ; i < 8 ; i++) { - __typeof__(b_.u64) ct = (HEDLEY_REINTERPRET_CAST(__typeof__(ct), c_.u8) >> (i * 8)) & 63; - rv |= ((b_.u64 >> ct) & 1) << lshift; - lshift += 1; - } - - r = - HEDLEY_STATIC_CAST(simde__mmask32, rv[0]) | - HEDLEY_STATIC_CAST(simde__mmask32, rv[1]) | - HEDLEY_STATIC_CAST(simde__mmask32, rv[2]) | - HEDLEY_STATIC_CAST(simde__mmask32, rv[3]); - #else - for (size_t i = 0 ; i < (sizeof(c_.m128i_private) / sizeof(c_.m128i_private[0])) ; i++) { - for (size_t j = 0 ; j < (sizeof(c_.m128i_private[i].m64_private) / sizeof(c_.m128i_private[i].m64_private[0])) ; j++) { - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t k = 0 ; k < (sizeof(c_.m128i_private[i].m64_private[j].u8) / sizeof(c_.m128i_private[i].m64_private[j].u8[0])) ; k++) { - r |= (((b_.m128i_private[i].u64[j] >> (c_.m128i_private[i].m64_private[j].u8[k]) & 63) & 1) << ((i * 16) + (j * 8) + k)); - } - } - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_bitshuffle_epi64_mask - #define _mm256_bitshuffle_epi64_mask(b, c) simde_mm256_bitshuffle_epi64_mask(b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm256_mask_bitshuffle_epi64_mask (simde__mmask32 k, simde__m256i b, simde__m256i c) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_bitshuffle_epi64_mask(k, b, c); - #else - return (k & simde_mm256_bitshuffle_epi64_mask(b, c)); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_bitshuffle_epi64_mask - #define _mm256_mask_bitshuffle_epi64_mask(k, b, c) simde_mm256_mask_bitshuffle_epi64_mask(k, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_bitshuffle_epi64_mask (simde__m512i b, simde__m512i c) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) - return _mm512_bitshuffle_epi64_mask(b, c); - #else - simde__m512i_private - b_ = simde__m512i_to_private(b), - c_ = simde__m512i_to_private(c); - simde__mmask64 r = 0; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(b_.m128i) / sizeof(b_.m128i[0])) ; i++) { - r |= (HEDLEY_STATIC_CAST(simde__mmask64, simde_mm_bitshuffle_epi64_mask(b_.m128i[i], c_.m128i[i])) << (i * 16)); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(b_.m256i) / sizeof(b_.m256i[0])) ; i++) { - r |= (HEDLEY_STATIC_CAST(simde__mmask64, simde_mm256_bitshuffle_epi64_mask(b_.m256i[i], c_.m256i[i])) << (i * 32)); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - __typeof__(b_.u64) rv = { 0, 0, 0, 0, 0, 0, 0, 0 }; - 
__typeof__(b_.u64) lshift = { 0, 8, 16, 24, 32, 40, 48, 56 }; - - for (int8_t i = 0 ; i < 8 ; i++) { - __typeof__(b_.u64) ct = (HEDLEY_REINTERPRET_CAST(__typeof__(ct), c_.u8) >> (i * 8)) & 63; - rv |= ((b_.u64 >> ct) & 1) << lshift; - lshift += 1; - } - - r = - HEDLEY_STATIC_CAST(simde__mmask64, rv[0]) | - HEDLEY_STATIC_CAST(simde__mmask64, rv[1]) | - HEDLEY_STATIC_CAST(simde__mmask64, rv[2]) | - HEDLEY_STATIC_CAST(simde__mmask64, rv[3]) | - HEDLEY_STATIC_CAST(simde__mmask64, rv[4]) | - HEDLEY_STATIC_CAST(simde__mmask64, rv[5]) | - HEDLEY_STATIC_CAST(simde__mmask64, rv[6]) | - HEDLEY_STATIC_CAST(simde__mmask64, rv[7]); - #else - for (size_t i = 0 ; i < (sizeof(c_.m128i_private) / sizeof(c_.m128i_private[0])) ; i++) { - for (size_t j = 0 ; j < (sizeof(c_.m128i_private[i].m64_private) / sizeof(c_.m128i_private[i].m64_private[0])) ; j++) { - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t k = 0 ; k < (sizeof(c_.m128i_private[i].m64_private[j].u8) / sizeof(c_.m128i_private[i].m64_private[j].u8[0])) ; k++) { - r |= (((b_.m128i_private[i].u64[j] >> (c_.m128i_private[i].m64_private[j].u8[k]) & 63) & 1) << ((i * 16) + (j * 8) + k)); - } - } - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) - #undef _mm512_bitshuffle_epi64_mask - #define _mm512_bitshuffle_epi64_mask(b, c) simde_mm512_bitshuffle_epi64_mask(b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_mask_bitshuffle_epi64_mask (simde__mmask64 k, simde__m512i b, simde__m512i c) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) - return _mm512_mask_bitshuffle_epi64_mask(k, b, c); - #else - return (k & simde_mm512_bitshuffle_epi64_mask(b, c)); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_bitshuffle_epi64_mask - #define _mm512_mask_bitshuffle_epi64_mask(k, b, c) simde_mm512_mask_bitshuffle_epi64_mask(k, b, c) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_BITSHUFFLE_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/blend.h b/ffi-deps/simde/simde/x86/avx512/blend.h deleted file mode 100644 index e34dd20..0000000 --- a/ffi-deps/simde/simde/x86/avx512/blend.h +++ /dev/null @@ -1,293 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
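The bitshuffle.h fallback removed above gathers, for each selector byte, one bit of the corresponding 64-bit lane of b (indexed by the byte's low six bits) and packs those bits into a mask. A one-lane scalar sketch of that operation, using the hypothetical name bitshuffle_u64:

#include <stdint.h>
#include <stdio.h>

/* For each selector byte c[j], take bit (c[j] & 63) of b and place it at
 * bit j of the result mask; one 64-bit lane shown (illustrative sketch). */
static uint8_t bitshuffle_u64(uint64_t b, const uint8_t c[8])
{
    uint8_t r = 0;
    for (int j = 0; j < 8; j++)
        r |= (uint8_t)(((b >> (c[j] & 63)) & 1) << j);
    return r;
}

int main(void)
{
    const uint8_t idx[8] = {0, 1, 2, 3, 4, 5, 6, 7}; /* pick the low 8 bits of b */
    printf("0x%02x\n", bitshuffle_u64(0xA5, idx));   /* 0xa5 */
    return 0;
}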
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_BLEND_H) -#define SIMDE_X86_AVX512_BLEND_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_blend_epi8(simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_blend_epi8(k, a, b); - #else - return simde_mm_mask_mov_epi8(a, k, b); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_blend_epi8 - #define _mm_mask_blend_epi8(k, a, b) simde_mm_mask_blend_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_blend_epi16(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_blend_epi16(k, a, b); - #else - return simde_mm_mask_mov_epi16(a, k, b); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_blend_epi16 - #define _mm_mask_blend_epi16(k, a, b) simde_mm_mask_blend_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_blend_epi32(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_blend_epi32(k, a, b); - #else - return simde_mm_mask_mov_epi32(a, k, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_blend_epi32 - #define _mm_mask_blend_epi32(k, a, b) simde_mm_mask_blend_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_blend_epi64(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_blend_epi64(k, a, b); - #else - return simde_mm_mask_mov_epi64(a, k, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_blend_epi64 - #define _mm_mask_blend_epi64(k, a, b) simde_mm_mask_blend_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mask_blend_ps(simde__mmask8 k, simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_blend_ps(k, a, b); - #else - return simde_mm_mask_mov_ps(a, k, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_blend_ps - #define _mm_mask_blend_ps(k, a, b) simde_mm_mask_blend_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mask_blend_pd(simde__mmask8 k, simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_blend_pd(k, a, b); - #else - return simde_mm_mask_mov_pd(a, k, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_blend_pd - #define _mm_mask_blend_pd(k, a, b) simde_mm_mask_blend_pd(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_blend_epi8(simde__mmask32 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_blend_epi8(k, a, b); - #else - return simde_mm256_mask_mov_epi8(a, k, b); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_blend_epi8 - #define _mm256_mask_blend_epi8(k, a, b) simde_mm256_mask_blend_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_blend_epi16(simde__mmask16 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && 
defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_blend_epi16(k, a, b); - #else - return simde_mm256_mask_mov_epi16(a, k, b); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_blend_epi16 - #define _mm256_mask_blend_epi16(k, a, b) simde_mm256_mask_blend_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_blend_epi32(simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_blend_epi32(k, a, b); - #else - return simde_mm256_mask_mov_epi32(a, k, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_blend_epi32 - #define _mm256_mask_blend_epi32(k, a, b) simde_mm256_mask_blend_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_blend_epi64(simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_blend_epi64(k, a, b); - #else - return simde_mm256_mask_mov_epi64(a, k, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_blend_epi64 - #define _mm256_mask_blend_epi64(k, a, b) simde_mm256_mask_blend_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_mask_blend_ps(simde__mmask8 k, simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_blend_ps(k, a, b); - #else - return simde_mm256_mask_mov_ps(a, k, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_blend_ps - #define _mm256_mask_blend_ps(k, a, b) simde_mm256_mask_blend_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_mask_blend_pd(simde__mmask8 k, simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_blend_pd(k, a, b); - #else - return simde_mm256_mask_mov_pd(a, k, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_blend_pd - #define _mm256_mask_blend_pd(k, a, b) simde_mm256_mask_blend_pd(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_blend_epi8(simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_blend_epi8(k, a, b); - #else - return simde_mm512_mask_mov_epi8(a, k, b); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_blend_epi8 - #define _mm512_mask_blend_epi8(k, a, b) simde_mm512_mask_blend_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_blend_epi16(simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_blend_epi16(k, a, b); - #else - return simde_mm512_mask_mov_epi16(a, k, b); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_blend_epi16 - #define _mm512_mask_blend_epi16(k, a, b) simde_mm512_mask_blend_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_blend_epi32(simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_blend_epi32(k, a, b); - #else - return simde_mm512_mask_mov_epi32(a, k, b); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_blend_epi32 - #define _mm512_mask_blend_epi32(k, a, b) simde_mm512_mask_blend_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_blend_epi64(simde__mmask8 k, simde__m512i a, simde__m512i 
b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_blend_epi64(k, a, b); - #else - return simde_mm512_mask_mov_epi64(a, k, b); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_blend_epi64 - #define _mm512_mask_blend_epi64(k, a, b) simde_mm512_mask_blend_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_blend_ps(simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_blend_ps(k, a, b); - #else - return simde_mm512_mask_mov_ps(a, k, b); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_blend_ps - #define _mm512_mask_blend_ps(k, a, b) simde_mm512_mask_blend_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_blend_pd(simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_blend_pd(k, a, b); - #else - return simde_mm512_mask_mov_pd(a, k, b); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_blend_pd - #define _mm512_mask_blend_pd(k, a, b) simde_mm512_mask_blend_pd(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_BLEND_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/broadcast.h b/ffi-deps/simde/simde/x86/avx512/broadcast.h deleted file mode 100644 index 33b41ab..0000000 --- a/ffi-deps/simde/simde/x86/avx512/broadcast.h +++ /dev/null @@ -1,897 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - * 2020 Christopher Moore - */ - -#if !defined(SIMDE_X86_AVX512_BROADCAST_H) -#define SIMDE_X86_AVX512_BROADCAST_H - -#include "types.h" -#include "../avx2.h" - -#include "mov.h" -#include "cast.h" -#include "set1.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_broadcast_f32x2 (simde__m128 a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm256_broadcast_f32x2(a); - #else - simde__m256_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) - r_.f32 = __builtin_shufflevector(a_.f32, a_.f32, 0, 1, 0, 1, 0, 1, 0, 1); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) { - r_.f32[ i ] = a_.f32[0]; - r_.f32[i + 1] = a_.f32[1]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcast_f32x2 - #define _mm256_broadcast_f32x2(a) simde_mm256_broadcast_f32x2(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_mask_broadcast_f32x2(simde__m256 src, simde__mmask8 k, simde__m128 a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm256_mask_broadcast_f32x2(src, k, a); - #else - return simde_mm256_mask_mov_ps(src, k, simde_mm256_broadcast_f32x2(a)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_broadcast_f32x2 - #define _mm256_mask_broadcast_f32x2(src, k, a) simde_mm256_mask_broadcast_f32x2(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_maskz_broadcast_f32x2(simde__mmask8 k, simde__m128 a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm256_maskz_broadcast_f32x2(k, a); - #else - return simde_mm256_maskz_mov_ps(k, simde_mm256_broadcast_f32x2(a)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_broadcast_f32x2 - #define _mm256_maskz_broadcast_f32x2(k, a) simde_mm256_maskz_broadcast_f32x2(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_broadcast_f32x2 (simde__m128 a) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_broadcast_f32x2(a); - #else - simde__m512_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) - r_.f32 = __builtin_shufflevector(a_.f32, a_.f32, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i+=2) { - r_.f32[ i ] = a_.f32[0]; - r_.f32[i + 1] = a_.f32[1]; - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_broadcast_f32x2 - #define _mm512_broadcast_f32x2(a) simde_mm512_broadcast_f32x2(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_broadcast_f32x2(simde__m512 src, simde__mmask16 k, simde__m128 a) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_mask_broadcast_f32x2(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_broadcast_f32x2(a)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_broadcast_f32x2 - #define _mm512_mask_broadcast_f32x2(src, k, a) 
simde_mm512_mask_broadcast_f32x2(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_broadcast_f32x2(simde__mmask16 k, simde__m128 a) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_maskz_broadcast_f32x2(k, a); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_broadcast_f32x2(a)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_broadcast_f32x2 - #define _mm512_maskz_broadcast_f32x2(k, a) simde_mm512_maskz_broadcast_f32x2(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_broadcast_f32x8 (simde__m256 a) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_broadcast_f32x8(a); - #else - simde__m512_private r_; - simde__m256_private a_ = simde__m256_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) - r_.f32 = __builtin_shufflevector(a_.f32, a_.f32, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i+=8) { - r_.f32[ i ] = a_.f32[0]; - r_.f32[i + 1] = a_.f32[1]; - r_.f32[i + 2] = a_.f32[2]; - r_.f32[i + 3] = a_.f32[3]; - r_.f32[i + 4] = a_.f32[4]; - r_.f32[i + 5] = a_.f32[5]; - r_.f32[i + 6] = a_.f32[6]; - r_.f32[i + 7] = a_.f32[7]; - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_broadcast_f32x8 - #define _mm512_broadcast_f32x8(a) simde_mm512_broadcast_f32x8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_broadcast_f32x8(simde__m512 src, simde__mmask16 k, simde__m256 a) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_mask_broadcast_f32x8(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_broadcast_f32x8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_broadcast_f32x8 - #define _mm512_mask_broadcast_f32x8(src, k, a) simde_mm512_mask_broadcast_f32x8(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_broadcast_f32x8(simde__mmask16 k, simde__m256 a) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_maskz_broadcast_f32x8(k, a); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_broadcast_f32x8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_broadcast_f32x8 - #define _mm512_maskz_broadcast_f32x8(k, a) simde_mm512_maskz_broadcast_f32x8(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_broadcast_f64x2 (simde__m128d a) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_broadcast_f64x2(a); - #else - simde__m512d_private r_; - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(SIMDE_BUG_CLANG_BAD_VI64_OPS) - r_.f64 = __builtin_shufflevector(a_.f64, a_.f64, 0, 1, 0, 1, 0, 1, 0, 1); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) { - r_.f64[ i ] = a_.f64[0]; - r_.f64[i + 1] = a_.f64[1]; - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_broadcast_f64x2 - #define _mm512_broadcast_f64x2(a) simde_mm512_broadcast_f64x2(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_broadcast_f64x2(simde__m512d src, simde__mmask8 k, simde__m128d a) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return 
_mm512_mask_broadcast_f64x2(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_broadcast_f64x2(a)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_broadcast_f64x2 - #define _mm512_mask_broadcast_f64x2(src, k, a) simde_mm512_mask_broadcast_f64x2(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_broadcast_f64x2(simde__mmask8 k, simde__m128d a) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_maskz_broadcast_f64x2(k, a); - #else - return simde_mm512_maskz_mov_pd(k, simde_mm512_broadcast_f64x2(a)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_broadcast_f64x2 - #define _mm512_maskz_broadcast_f64x2(k, a) simde_mm512_maskz_broadcast_f64x2(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_broadcast_f32x4 (simde__m128 a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_broadcast_f32x4(a); - #else - simde__m256_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128_private[0] = a_; - r_.m128_private[1] = a_; - #elif defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) - r_.f32 = __builtin_shufflevector(a_.f32, a_.f32, 0, 1, 2, 3, 0, 1, 2, 3); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 4) { - r_.f32[ i ] = a_.f32[0]; - r_.f32[i + 1] = a_.f32[1]; - r_.f32[i + 2] = a_.f32[2]; - r_.f32[i + 3] = a_.f32[3]; - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcast_f32x4 - #define _mm256_broadcast_f32x4(a) simde_mm256_broadcast_f32x4(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_mask_broadcast_f32x4(simde__m256 src, simde__mmask8 k, simde__m128 a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_broadcast_f32x4(src, k, a); - #else - return simde_mm256_mask_mov_ps(src, k, simde_mm256_broadcast_f32x4(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_broadcast_f32x4 - #define _mm256_mask_broadcast_f32x4(src, k, a) simde_mm256_mask_broadcast_f32x4(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_maskz_broadcast_f32x4(simde__mmask8 k, simde__m128 a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_broadcast_f32x4(k, a); - #else - return simde_mm256_maskz_mov_ps(k, simde_mm256_broadcast_f32x4(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_broadcast_f32x4 - #define _mm256_maskz_broadcast_f32x4(k, a) simde_mm256_maskz_broadcast_f32x4(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_broadcast_f64x2 (simde__m128d a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm256_broadcast_f64x2(a); - #else - simde__m256d_private r_; - simde__m128d_private a_ = simde__m128d_to_private(a); - - /* I don't have a bug # for this, but when compiled with clang-10 without optimization on aarch64 - * the __builtin_shufflevector version doesn't work correctly. 
clang 9 and 11 aren't a problem */ - #if defined(SIMDE_VECTOR_SUBSCRIPT) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && \ - (!defined(__clang__) || (SIMDE_DETECT_CLANG_VERSION < 100000 || SIMDE_DETECT_CLANG_VERSION > 100000)) - r_.f64 = __builtin_shufflevector(a_.f64, a_.f64, 0, 1, 0, 1); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) { - r_.f64[ i ] = a_.f64[0]; - r_.f64[i + 1] = a_.f64[1]; - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_broadcast_f64x2 - #define _mm256_broadcast_f64x2(a) simde_mm256_broadcast_f64x2(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_mask_broadcast_f64x2(simde__m256d src, simde__mmask8 k, simde__m128d a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm256_mask_broadcast_f64x2(src, k, a); - #else - return simde_mm256_mask_mov_pd(src, k, simde_mm256_broadcast_f64x2(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_broadcast_f64x2 - #define _mm256_mask_broadcast_f64x2(src, k, a) simde_mm256_mask_broadcast_f64x2(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_maskz_broadcast_f64x2(simde__mmask8 k, simde__m128d a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm256_maskz_broadcast_f64x2(k, a); - #else - return simde_mm256_maskz_mov_pd(k, simde_mm256_broadcast_f64x2(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_broadcast_f64x2 - #define _mm256_maskz_broadcast_f64x2(k, a) simde_mm256_maskz_broadcast_f64x2(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_broadcast_f32x4 (simde__m128 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_broadcast_f32x4(a); - #else - simde__m512_private r_; - - #if defined(SIMDE_X86_AVX2_NATIVE) - r_.m256[1] = r_.m256[0] = simde_mm256_castsi256_ps(simde_mm256_broadcastsi128_si256(simde_mm_castps_si128(a))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = a; - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_broadcast_f32x4 - #define _mm512_broadcast_f32x4(a) simde_mm512_broadcast_f32x4(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_broadcast_f32x4(simde__m512 src, simde__mmask16 k, simde__m128 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_broadcast_f32x4(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_broadcast_f32x4(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_broadcast_f32x4 - #define _mm512_mask_broadcast_f32x4(src, k, a) simde_mm512_mask_broadcast_f32x4(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_broadcast_f32x4(simde__mmask16 k, simde__m128 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_broadcast_f32x4(k, a); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_broadcast_f32x4(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_broadcast_f32x4 - #define _mm512_maskz_broadcast_f32x4(k, a) simde_mm512_maskz_broadcast_f32x4(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_broadcast_f64x4 (simde__m256d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return 
_mm512_broadcast_f64x4(a); - #else - simde__m512d_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = a; - } - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_broadcast_f64x4 - #define _mm512_broadcast_f64x4(a) simde_mm512_broadcast_f64x4(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_broadcast_f64x4(simde__m512d src, simde__mmask8 k, simde__m256d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_broadcast_f64x4(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_broadcast_f64x4(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_broadcast_f64x4 - #define _mm512_mask_broadcast_f64x4(src, k, a) simde_mm512_mask_broadcast_f64x4(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_broadcast_f64x4(simde__mmask8 k, simde__m256d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_broadcast_f64x4(k, a); - #else - return simde_mm512_maskz_mov_pd(k, simde_mm512_broadcast_f64x4(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_broadcast_f64x4 - #define _mm512_maskz_broadcast_f64x4(k, a) simde_mm512_maskz_broadcast_f64x4(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_broadcast_i32x4 (simde__m128i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_broadcast_i32x4(a); - #else - simde__m512i_private r_; - - #if defined(SIMDE_X86_AVX2_NATIVE) - r_.m256i[1] = r_.m256i[0] = simde_mm256_broadcastsi128_si256(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - r_.m128i[3] = r_.m128i[2] = r_.m128i[1] = r_.m128i[0] = a; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = a; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_broadcast_i32x4 - #define _mm512_broadcast_i32x4(a) simde_mm512_broadcast_i32x4(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_broadcast_i32x4(simde__m512i src, simde__mmask16 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_broadcast_i32x4(src, k, a); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_broadcast_i32x4(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_broadcast_i32x4 - #define _mm512_mask_broadcast_i32x4(src, k, a) simde_mm512_mask_broadcast_i32x4(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_broadcast_i32x4(simde__mmask16 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_broadcast_i32x4(k, a); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_broadcast_i32x4(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_broadcast_i32x4 - #define _mm512_maskz_broadcast_i32x4(k, a) simde_mm512_maskz_broadcast_i32x4(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_broadcast_i64x4 (simde__m256i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_broadcast_i64x4(a); - #else - simde__m512i_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = a; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef 
_mm512_broadcast_i64x4 - #define _mm512_broadcast_i64x4(a) simde_mm512_broadcast_i64x4(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_broadcast_i64x4(simde__m512i src, simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_broadcast_i64x4(src, k, a); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_broadcast_i64x4(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_broadcast_i64x4 - #define _mm512_mask_broadcast_i64x4(src, k, a) simde_mm512_mask_broadcast_i64x4(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_broadcast_i64x4(simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_broadcast_i64x4(k, a); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_broadcast_i64x4(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_broadcast_i64x4 - #define _mm512_maskz_broadcast_i64x4(k, a) simde_mm512_maskz_broadcast_i64x4(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_broadcastd_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_broadcastd_epi32(a); - #else - simde__m512i_private r_; - simde__m128i_private a_= simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[0]; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_broadcastd_epi32 - #define _mm512_broadcastd_epi32(a) simde_mm512_broadcastd_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_broadcastd_epi32(simde__m512i src, simde__mmask16 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_broadcastd_epi32(src, k, a); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_broadcastd_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_broadcastd_epi32 - #define _mm512_mask_broadcastd_epi32(src, k, a) simde_mm512_mask_broadcastd_epi32(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_broadcastd_epi32(simde__mmask16 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_broadcastd_epi32(k, a); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_broadcastd_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_broadcastd_epi32 - #define _mm512_maskz_broadcastd_epi32(k, a) simde_mm512_maskz_broadcastd_epi32(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_broadcastq_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_broadcastq_epi64(a); - #else - simde__m512i_private r_; - simde__m128i_private a_= simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[0]; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_broadcastq_epi64 - #define _mm512_broadcastq_epi64(a) simde_mm512_broadcastq_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_broadcastq_epi64(simde__m512i src, simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_broadcastq_epi64(src, k, a); - #else - return simde_mm512_mask_mov_epi64(src, k, 
simde_mm512_broadcastq_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_broadcastq_epi64 - #define _mm512_mask_broadcastq_epi64(src, k, a) simde_mm512_mask_broadcastq_epi64(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_broadcastq_epi64(simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_broadcastq_epi64(k, a); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_broadcastq_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_broadcastq_epi64 - #define _mm512_maskz_broadcastq_epi64(k, a) simde_mm512_maskz_broadcastq_epi64(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_broadcastss_ps (simde__m128 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_broadcastss_ps(a); - #else - simde__m512_private r_; - simde__m128_private a_= simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[0]; - } - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_broadcastss_ps - #define _mm512_broadcastss_ps(a) simde_mm512_broadcastss_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_broadcastss_ps(simde__m512 src, simde__mmask16 k, simde__m128 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_broadcastss_ps(src, k, a); - #else - simde__m512_private - src_ = simde__m512_to_private(src), - r_; - simde__m128_private - a_ = simde__m128_to_private(a); - - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = ((k >> i) & 1) ? a_.f32[0] : src_.f32[i]; - } - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_broadcastss_ps - #define _mm512_mask_broadcastss_ps(src, k, a) simde_mm512_mask_broadcastss_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_broadcastss_ps(simde__mmask16 k, simde__m128 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_broadcastss_ps(k, a); - #else - simde__m512_private - r_; - simde__m128_private - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = ((k >> i) & 1) ? 
a_.f32[0] : INT32_C(0); - } - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_broadcastss_ps - #define _mm512_maskz_broadcastss_ps(k, a) simde_mm512_maskz_broadcastss_ps(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_broadcastsd_pd (simde__m128d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_broadcastsd_pd(a); - #else - simde__m512d_private r_; - simde__m128d_private a_= simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[0]; - } - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_broadcastsd_pd - #define _mm512_broadcastsd_pd(a) simde_mm512_broadcastsd_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_broadcastsd_pd(simde__m512d src, simde__mmask8 k, simde__m128d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_broadcastsd_pd(src, k, a); - #else - simde__m512d_private - src_ = simde__m512d_to_private(src), - r_; - simde__m128d_private - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = ((k >> i) & 1) ? a_.f64[0] : src_.f64[i]; - } - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_broadcastsd_pd - #define _mm512_mask_broadcastsd_pd(src, k, a) simde_mm512_mask_broadcastsd_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_broadcastsd_pd(simde__mmask8 k, simde__m128d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_broadcastsd_pd(k, a); - #else - simde__m512d_private - r_; - simde__m128d_private - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = ((k >> i) & 1) ? 
a_.f64[0] : INT64_C(0); - } - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_broadcastsd_pd - #define _mm512_maskz_broadcastsd_pd(k, a) simde_mm512_maskz_broadcastsd_pd(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_broadcastb_epi8 (simde__m128i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_broadcastb_epi8(a); - #else - simde__m128i_private a_= simde__m128i_to_private(a); - return simde_mm512_set1_epi8(a_.i8[0]); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_broadcastb_epi8 - #define _mm512_broadcastb_epi8(a) simde_mm512_broadcastb_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_broadcastb_epi8 (simde__m512i src, simde__mmask64 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_broadcastb_epi8(src, k, a); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_broadcastb_epi8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_broadcastb_epi8 - #define _mm512_mask_broadcastb_epi8(src, k, a) simde_mm512_mask_broadcastb_epi8(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_broadcastb_epi8 (simde__mmask64 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_broadcastb_epi8(k, a); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_broadcastb_epi8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_broadcastb_epi8 - #define _mm512_maskz_broadcastb_epi8(k, a) simde_mm512_maskz_broadcastb_epi8(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_broadcastw_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_broadcastw_epi16(a); - #else - simde__m128i_private a_= simde__m128i_to_private(a); - return simde_mm512_set1_epi16(a_.i16[0]); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_broadcastw_epi16 - #define _mm512_broadcastw_epi16(a) simde_mm512_broadcastw_epi16(a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_BROADCAST_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/cast.h b/ffi-deps/simde/simde/x86/avx512/cast.h deleted file mode 100644 index 7f67a57..0000000 --- a/ffi-deps/simde/simde/x86/avx512/cast.h +++ /dev/null @@ -1,357 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - * 2020 Hidayat Khan - * 2020 Christopher Moore - */ - -#if !defined(SIMDE_X86_AVX512_CAST_H) -#define SIMDE_X86_AVX512_CAST_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_castpd_ps (simde__m512d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_castpd_ps(a); - #else - simde__m512 r; - simde_memcpy(&r, &a, sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_castpd_ps - #define _mm512_castpd_ps(a) simde_mm512_castpd_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_castpd_si512 (simde__m512d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_castpd_si512(a); - #else - simde__m512i r; - simde_memcpy(&r, &a, sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_castpd_si512 - #define _mm512_castpd_si512(a) simde_mm512_castpd_si512(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_castps_pd (simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_castps_pd(a); - #else - simde__m512d r; - simde_memcpy(&r, &a, sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_castps_pd - #define _mm512_castps_pd(a) simde_mm512_castps_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_castps_si512 (simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_castps_si512(a); - #else - simde__m512i r; - simde_memcpy(&r, &a, sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_castps_si512 - #define _mm512_castps_si512(a) simde_mm512_castps_si512(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_castph_si512 (simde__m512h a) { - #if defined(SIMDE_X86_AVX512FP16_NATIVE) - return _mm512_castph_si512(a); - #else - simde__m512i r; - simde_memcpy(&r, &a, sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) - #undef _mm512_castph_si512 - #define _mm512_castph_si512(a) simde_mm512_castph_si512(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512h -simde_mm512_castsi512_ph (simde__m512i a) { - #if defined(SIMDE_X86_AVX512FP16_NATIVE) - return _mm512_castsi512_ph(a); - #else - simde__m512h r; - simde_memcpy(&r, &a, sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) - #undef _mm512_castsi512_ph - #define _mm512_castsi512_ph(a) simde_mm512_castsi512_ph(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_castsi512_ps (simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_castsi512_ps(a); - #else - simde__m512 r; - simde_memcpy(&r, &a, sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_castsi512_ps - #define _mm512_castsi512_ps(a) simde_mm512_castsi512_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_castsi512_pd (simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_castsi512_pd(a); - #else - simde__m512d r; - simde_memcpy(&r, &a, 
sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_castsi512_pd - #define _mm512_castsi512_pd(a) simde_mm512_castsi512_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_castpd128_pd512 (simde__m128d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_castpd128_pd512(a); - #else - simde__m512d_private r_; - r_.m128d[0] = a; - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_castpd128_pd512 - #define _mm512_castpd128_pd512(a) simde_mm512_castpd128_pd512(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_castpd256_pd512 (simde__m256d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_castpd256_pd512(a); - #else - simde__m512d_private r_; - r_.m256d[0] = a; - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_castpd256_pd512 - #define _mm512_castpd256_pd512(a) simde_mm512_castpd256_pd512(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm512_castpd512_pd128 (simde__m512d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_castpd512_pd128(a); - #else - simde__m512d_private a_ = simde__m512d_to_private(a); - return a_.m128d[0]; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_castpd512_pd128 - #define _mm512_castpd512_pd128(a) simde_mm512_castpd512_pd128(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm512_castpd512_pd256 (simde__m512d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_castpd512_pd256(a); - #else - simde__m512d_private a_ = simde__m512d_to_private(a); - return a_.m256d[0]; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_castpd512_pd256 - #define _mm512_castpd512_pd256(a) simde_mm512_castpd512_pd256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_castps128_ps512 (simde__m128 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_castps128_ps512(a); - #else - simde__m512_private r_; - r_.m128[0] = a; - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_castps128_ps512 - #define _mm512_castps128_ps512(a) simde_mm512_castps128_ps512(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_castps256_ps512 (simde__m256 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_castps256_ps512(a); - #else - simde__m512_private r_; - r_.m256[0] = a; - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_castps256_ps512 - #define _mm512_castps256_ps512(a) simde_mm512_castps256_ps512(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm512_castps512_ps128 (simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_castps512_ps128(a); - #else - simde__m512_private a_ = simde__m512_to_private(a); - return a_.m128[0]; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_castps512_ps128 - #define _mm512_castps512_ps128(a) simde_mm512_castps512_ps128(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm512_castps512_ps256 (simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_castps512_ps256(a); - #else - simde__m512_private a_ = simde__m512_to_private(a); - return a_.m256[0]; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_castps512_ps256 - #define 
_mm512_castps512_ps256(a) simde_mm512_castps512_ps256(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_castsi128_si512 (simde__m128i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_castsi128_si512(a); - #else - simde__m512i_private r_; - r_.m128i[0] = a; - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_castsi128_si512 - #define _mm512_castsi128_si512(a) simde_mm512_castsi128_si512(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_castsi256_si512 (simde__m256i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_castsi256_si512(a); - #else - simde__m512i_private r_; - r_.m256i[0] = a; - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_castsi256_si512 - #define _mm512_castsi256_si512(a) simde_mm512_castsi256_si512(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm512_castsi512_si128 (simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_castsi512_si128(a); - #else - simde__m512i_private a_ = simde__m512i_to_private(a); - return a_.m128i[0]; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_castsi512_si128 - #define _mm512_castsi512_si128(a) simde_mm512_castsi512_si128(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm512_castsi512_si256 (simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_castsi512_si256(a); - #else - simde__m512i_private a_ = simde__m512i_to_private(a); - return a_.m256i[0]; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_castsi512_si256 - #define _mm512_castsi512_si256(a) simde_mm512_castsi512_si256(a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_CAST_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/cmp.h b/ffi-deps/simde/simde/x86/avx512/cmp.h deleted file mode 100644 index 2a3b99c..0000000 --- a/ffi-deps/simde/simde/x86/avx512/cmp.h +++ /dev/null @@ -1,1714 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020-2021 Evan Nemerson - * 2020 Himanshi Mathur - */ - -#if !defined(SIMDE_X86_AVX512_CMP_H) -#define SIMDE_X86_AVX512_CMP_H - -#include "types.h" -#include "mov.h" -#include "mov_mask.h" -#include "setzero.h" -#include "setone.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -#if defined(__clang__) && SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16 -SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_ -#endif - -SIMDE_HUGE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_cmp_epi8_mask (simde__m512i a, simde__m512i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - switch (imm8) { - case SIMDE_MM_CMPINT_EQ: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_LT: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_LE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 <= b_.i8)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] <= b_.i8[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_FALSE: - r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); - break; - - - case SIMDE_MM_CMPINT_NE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 != b_.i8)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] != b_.i8[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_NLT: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), ~(a_.i8 < b_.i8)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = !(a_.i8[i] < b_.i8[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_NLE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), ~(a_.i8 <= b_.i8)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = !(a_.i8[i] <= b_.i8[i]) ? 
~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_TRUE: - r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); - break; - - default: - HEDLEY_UNREACHABLE(); - } - - return simde_mm512_movepi8_mask(simde__m512i_from_private(r_)); -} -#if defined(SIMDE_X86_AVX512BW_NATIVE) - #define simde_mm512_cmp_epi8_mask(a, b, imm8) _mm512_cmp_epi8_mask((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmp_epi8_mask - #define _mm512_cmp_epi8_mask(a, b, imm8) simde_mm512_cmp_epi8_mask((a), (b), (imm8)) -#endif - -SIMDE_HUGE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_cmp_epi32_mask (simde__m256i a, simde__m256i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - switch (imm8) { - case SIMDE_MM_CMPINT_EQ: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 == b_.i32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_LT: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_LE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 <= b_.i32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] <= b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_FALSE: - r_ = simde__m256i_to_private(simde_mm256_setzero_si256()); - break; - - - case SIMDE_MM_CMPINT_NE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 != b_.i32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] != b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_NLT: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.i32 < b_.i32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = !(a_.i32[i] < b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_NLE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.i32 <= b_.i32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = !(a_.i32[i] <= b_.i32[i]) ? 
~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_TRUE: - r_ = simde__m256i_to_private(simde_x_mm256_setone_si256()); - break; - - default: - HEDLEY_UNREACHABLE(); - } - - return simde_mm256_movepi32_mask(simde__m256i_from_private(r_)); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_cmp_epi32_mask(a, b, imm8) _mm256_cmp_epi32_mask((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmp_epi32_mask - #define _mm256_cmp_epi32_mask(a, b, imm8) simde_mm256_cmp_epi32_mask((a), (b), (imm8)) -#endif - -SIMDE_HUGE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_cmp_ps_mask (simde__m512 a, simde__m512 b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) { - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b); - - switch (imm8) { - case SIMDE_CMP_EQ_OQ: - case SIMDE_CMP_EQ_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 == b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = (a_.f32[i] == b_.f32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_LT_OQ: - case SIMDE_CMP_LT_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 < b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = (a_.f32[i] < b_.f32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_LE_OQ: - case SIMDE_CMP_LE_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 <= b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = (a_.f32[i] <= b_.f32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_UNORD_Q: - case SIMDE_CMP_UNORD_S: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 != a_.f32) | (b_.f32 != b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = ((a_.f32[i] != a_.f32[i]) || (b_.f32[i] != b_.f32[i])) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_NEQ_UQ: - case SIMDE_CMP_NEQ_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 != b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = (a_.f32[i] != b_.f32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_NEQ_OQ: - case SIMDE_CMP_NEQ_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 == a_.f32) & (b_.f32 == b_.f32) & (a_.f32 != b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = ((a_.f32[i] == a_.f32[i]) & (b_.f32[i] == b_.f32[i]) & (a_.f32[i] != b_.f32[i])) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_NLT_UQ: - case SIMDE_CMP_NLT_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.f32 < b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = !(a_.f32[i] < b_.f32[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_NLE_UQ: - case SIMDE_CMP_NLE_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.f32 <= b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = !(a_.f32[i] <= b_.f32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_ORD_Q: - case SIMDE_CMP_ORD_S: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ((a_.f32 == a_.f32) & (b_.f32 == b_.f32))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = ((a_.f32[i] == a_.f32[i]) & (b_.f32[i] == b_.f32[i])) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_EQ_UQ: - case SIMDE_CMP_EQ_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 != a_.f32) | (b_.f32 != b_.f32) | (a_.f32 == b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = ((a_.f32[i] != a_.f32[i]) | (b_.f32[i] != b_.f32[i]) | (a_.f32[i] == b_.f32[i])) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_NGE_UQ: - case SIMDE_CMP_NGE_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.f32 >= b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = !(a_.f32[i] >= b_.f32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_NGT_UQ: - case SIMDE_CMP_NGT_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.f32 > b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = !(a_.f32[i] > b_.f32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_FALSE_OQ: - case SIMDE_CMP_FALSE_OS: - r_ = simde__m512_to_private(simde_mm512_setzero_ps()); - break; - - case SIMDE_CMP_GE_OQ: - case SIMDE_CMP_GE_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 >= b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = (a_.f32[i] >= b_.f32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_GT_OQ: - case SIMDE_CMP_GT_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 > b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.i32[i] = (a_.f32[i] > b_.f32[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - break; - - case SIMDE_CMP_TRUE_UQ: - case SIMDE_CMP_TRUE_US: - r_ = simde__m512_to_private(simde_x_mm512_setone_ps()); - break; - - default: - HEDLEY_UNREACHABLE(); - } - - return simde_mm512_movepi32_mask(simde_mm512_castps_si512(simde__m512_from_private(r_))); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_cmp_ps_mask(a, b, imm8) _mm512_cmp_ps_mask((a), (b), (imm8)) -#elif defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(128) - #define simde_mm512_cmp_ps_mask(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512_private \ - simde_mm512_cmp_ps_mask_r_ = simde__m512_to_private(simde_mm512_setzero_ps()), \ - simde_mm512_cmp_ps_mask_a_ = simde__m512_to_private((a)), \ - simde_mm512_cmp_ps_mask_b_ = simde__m512_to_private((b)); \ - \ - for (size_t i = 0 ; i < (sizeof(simde_mm512_cmp_ps_mask_r_.m128) / sizeof(simde_mm512_cmp_ps_mask_r_.m128[0])) ; i++) { \ - simde_mm512_cmp_ps_mask_r_.m128[i] = simde_mm_cmp_ps(simde_mm512_cmp_ps_mask_a_.m128[i], simde_mm512_cmp_ps_mask_b_.m128[i], (imm8)); \ - } \ - \ - simde_mm512_movepi32_mask(simde_mm512_castps_si512(simde__m512_from_private(simde_mm512_cmp_ps_mask_r_))); \ - })) -#elif defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(256) - #define simde_mm512_cmp_ps_mask(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512_private \ - simde_mm512_cmp_ps_mask_r_ = simde__m512_to_private(simde_mm512_setzero_ps()), \ - simde_mm512_cmp_ps_mask_a_ = simde__m512_to_private((a)), \ - simde_mm512_cmp_ps_mask_b_ = simde__m512_to_private((b)); \ - \ - for (size_t i = 0 ; i < (sizeof(simde_mm512_cmp_ps_mask_r_.m256) / sizeof(simde_mm512_cmp_ps_mask_r_.m256[0])) ; i++) { \ - simde_mm512_cmp_ps_mask_r_.m256[i] = simde_mm256_cmp_ps(simde_mm512_cmp_ps_mask_a_.m256[i], simde_mm512_cmp_ps_mask_b_.m256[i], (imm8)); \ - } \ - \ - simde_mm512_movepi32_mask(simde_mm512_castps_si512(simde__m512_from_private(simde_mm512_cmp_ps_mask_r_))); \ - })) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmp_ps_mask - #define _mm512_cmp_ps_mask(a, b, imm8) simde_mm512_cmp_ps_mask((a), (b), (imm8)) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_cmp_ps_mask(a, b, imm8) _mm256_cmp_ps_mask((a), (b), (imm8)) -#else - #define simde_mm256_cmp_ps_mask(a, b, imm8) simde_mm256_movepi32_mask(simde_mm256_castps_si256(simde_mm256_cmp_ps((a), (b), (imm8)))) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmp_ps_mask - #define _mm256_cmp_ps_mask(a, b, imm8) simde_mm256_cmp_ps_mask((a), (b), (imm8)) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_cmp_ps_mask(a, b, imm8) _mm_cmp_ps_mask((a), (b), (imm8)) -#else - #define simde_mm_cmp_ps_mask(a, b, imm8) simde_mm_movepi32_mask(simde_mm_castps_si128(simde_mm_cmp_ps((a), (b), (imm8)))) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cmp_ps_mask - #define _mm_cmp_ps_mask(a, b, imm8) simde_mm_cmp_ps_mask((a), (b), (imm8)) -#endif - -SIMDE_HUGE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_cmp_pd_mask (simde__m512d a, simde__m512d b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) { - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b); - - switch (imm8) { - case SIMDE_CMP_EQ_OQ: - case SIMDE_CMP_EQ_OS: - #if 
defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = (a_.f64[i] == b_.f64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - break; - - case SIMDE_CMP_LT_OQ: - case SIMDE_CMP_LT_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = (a_.f64[i] < b_.f64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - break; - - case SIMDE_CMP_LE_OQ: - case SIMDE_CMP_LE_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = (a_.f64[i] <= b_.f64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - break; - - case SIMDE_CMP_UNORD_Q: - case SIMDE_CMP_UNORD_S: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 != a_.f64) | (b_.f64 != b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = ((a_.f64[i] != a_.f64[i]) || (b_.f64[i] != b_.f64[i])) ? ~INT64_C(0) : INT64_C(0); - } - #endif - break; - - case SIMDE_CMP_NEQ_UQ: - case SIMDE_CMP_NEQ_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = (a_.f64[i] != b_.f64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - break; - - case SIMDE_CMP_NEQ_OQ: - case SIMDE_CMP_NEQ_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 == a_.f64) & (b_.f64 == b_.f64) & (a_.f64 != b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = ((a_.f64[i] == a_.f64[i]) & (b_.f64[i] == b_.f64[i]) & (a_.f64[i] != b_.f64[i])) ? ~INT64_C(0) : INT64_C(0); - } - #endif - break; - - case SIMDE_CMP_NLT_UQ: - case SIMDE_CMP_NLT_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.f64 < b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = !(a_.f64[i] < b_.f64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - break; - - case SIMDE_CMP_NLE_UQ: - case SIMDE_CMP_NLE_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.f64 <= b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = !(a_.f64[i] <= b_.f64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - break; - - case SIMDE_CMP_ORD_Q: - case SIMDE_CMP_ORD_S: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ((a_.f64 == a_.f64) & (b_.f64 == b_.f64))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = ((a_.f64[i] == a_.f64[i]) & (b_.f64[i] == b_.f64[i])) ? 
~INT64_C(0) : INT64_C(0); - } - #endif - break; - - case SIMDE_CMP_EQ_UQ: - case SIMDE_CMP_EQ_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 != a_.f64) | (b_.f64 != b_.f64) | (a_.f64 == b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = ((a_.f64[i] != a_.f64[i]) | (b_.f64[i] != b_.f64[i]) | (a_.f64[i] == b_.f64[i])) ? ~INT64_C(0) : INT64_C(0); - } - #endif - break; - - case SIMDE_CMP_NGE_UQ: - case SIMDE_CMP_NGE_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.f64 >= b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = !(a_.f64[i] >= b_.f64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - break; - - case SIMDE_CMP_NGT_UQ: - case SIMDE_CMP_NGT_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.f64 > b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = !(a_.f64[i] > b_.f64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - break; - - case SIMDE_CMP_FALSE_OQ: - case SIMDE_CMP_FALSE_OS: - r_ = simde__m512d_to_private(simde_mm512_setzero_pd()); - break; - - case SIMDE_CMP_GE_OQ: - case SIMDE_CMP_GE_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = (a_.f64[i] >= b_.f64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - break; - - case SIMDE_CMP_GT_OQ: - case SIMDE_CMP_GT_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.i64[i] = (a_.f64[i] > b_.f64[i]) ? 
~INT64_C(0) : INT64_C(0); - } - #endif - break; - - case SIMDE_CMP_TRUE_UQ: - case SIMDE_CMP_TRUE_US: - r_ = simde__m512d_to_private(simde_x_mm512_setone_pd()); - break; - - default: - HEDLEY_UNREACHABLE(); - } - - return simde_mm512_movepi64_mask(simde_mm512_castpd_si512(simde__m512d_from_private(r_))); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_cmp_pd_mask(a, b, imm8) _mm512_cmp_pd_mask((a), (b), (imm8)) -#elif defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(128) - #define simde_mm512_cmp_pd_mask(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512d_private \ - simde_mm512_cmp_pd_mask_r_ = simde__m512d_to_private(simde_mm512_setzero_pd()), \ - simde_mm512_cmp_pd_mask_a_ = simde__m512d_to_private((a)), \ - simde_mm512_cmp_pd_mask_b_ = simde__m512d_to_private((b)); \ - \ - for (size_t simde_mm512_cmp_pd_mask_i = 0 ; simde_mm512_cmp_pd_mask_i < (sizeof(simde_mm512_cmp_pd_mask_r_.m128d) / sizeof(simde_mm512_cmp_pd_mask_r_.m128d[0])) ; simde_mm512_cmp_pd_mask_i++) { \ - simde_mm512_cmp_pd_mask_r_.m128d[simde_mm512_cmp_pd_mask_i] = simde_mm_cmp_pd(simde_mm512_cmp_pd_mask_a_.m128d[simde_mm512_cmp_pd_mask_i], simde_mm512_cmp_pd_mask_b_.m128d[simde_mm512_cmp_pd_mask_i], (imm8)); \ - } \ - \ - simde_mm512_movepi64_mask(simde_mm512_castpd_si512(simde__m512d_from_private(simde_mm512_cmp_pd_mask_r_))); \ - })) -#elif defined(SIMDE_STATEMENT_EXPR_) && SIMDE_NATURAL_VECTOR_SIZE_LE(256) - #define simde_mm512_cmp_pd_mask(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512d_private \ - simde_mm512_cmp_pd_mask_r_ = simde__m512d_to_private(simde_mm512_setzero_pd()), \ - simde_mm512_cmp_pd_mask_a_ = simde__m512d_to_private((a)), \ - simde_mm512_cmp_pd_mask_b_ = simde__m512d_to_private((b)); \ - \ - for (size_t simde_mm512_cmp_pd_mask_i = 0 ; simde_mm512_cmp_pd_mask_i < (sizeof(simde_mm512_cmp_pd_mask_r_.m256d) / sizeof(simde_mm512_cmp_pd_mask_r_.m256d[0])) ; simde_mm512_cmp_pd_mask_i++) { \ - simde_mm512_cmp_pd_mask_r_.m256d[simde_mm512_cmp_pd_mask_i] = simde_mm256_cmp_pd(simde_mm512_cmp_pd_mask_a_.m256d[simde_mm512_cmp_pd_mask_i], simde_mm512_cmp_pd_mask_b_.m256d[simde_mm512_cmp_pd_mask_i], (imm8)); \ - } \ - \ - simde_mm512_movepi64_mask(simde_mm512_castpd_si512(simde__m512d_from_private(simde_mm512_cmp_pd_mask_r_))); \ - })) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmp_pd_mask - #define _mm512_cmp_pd_mask(a, b, imm8) simde_mm512_cmp_pd_mask((a), (b), (imm8)) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_cmp_pd_mask(a, b, imm8) _mm256_cmp_pd_mask((a), (b), (imm8)) -#else - #define simde_mm256_cmp_pd_mask(a, b, imm8) simde_mm256_movepi64_mask(simde_mm256_castpd_si256(simde_mm256_cmp_pd((a), (b), (imm8)))) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmp_pd_mask - #define _mm256_cmp_pd_mask(a, b, imm8) simde_mm256_cmp_pd_mask((a), (b), (imm8)) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_cmp_pd_mask(a, b, imm8) _mm_cmp_pd_mask((a), (b), (imm8)) -#else - #define simde_mm_cmp_pd_mask(a, b, imm8) simde_mm_movepi64_mask(simde_mm_castpd_si128(simde_mm_cmp_pd((a), (b), (imm8)))) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cmp_pd_mask - #define _mm_cmp_pd_mask(a, b, imm8) simde_mm_cmp_pd_mask((a), (b), (imm8)) -#endif - -SIMDE_HUGE_FUNCTION_ATTRIBUTES 
-simde__mmask32 -simde_mm512_cmp_ph_mask (simde__m512h a, simde__m512h b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 31) { - simde__m512h_private - r_, - a_ = simde__m512h_to_private(a), - b_ = simde__m512h_to_private(b); - - switch (imm8) { - case SIMDE_CMP_EQ_OQ: - case SIMDE_CMP_EQ_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 == b_.f16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { - r_.i16[i] = ( - simde_float16_as_uint16(a_.f16[i]) == simde_float16_as_uint16(b_.f16[i]) - && !simde_isnanhf(a_.f16[i]) && !simde_isnanhf(b_.f16[i]) - ) ? ~INT16_C(0) : INT16_C(0); - } - #endif - break; - - case SIMDE_CMP_LT_OQ: - case SIMDE_CMP_LT_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 < b_.f16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { - r_.i16[i] = (simde_float16_to_float32(a_.f16[i]) < simde_float16_to_float32(b_.f16[i])) ? ~INT16_C(0) : INT16_C(0); - } - #endif - break; - - case SIMDE_CMP_LE_OQ: - case SIMDE_CMP_LE_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 <= b_.f16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { - r_.i16[i] = (simde_float16_to_float32(a_.f16[i]) <= simde_float16_to_float32(b_.f16[i])) ? ~INT16_C(0) : INT16_C(0); - } - #endif - break; - - case SIMDE_CMP_UNORD_Q: - case SIMDE_CMP_UNORD_S: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 != a_.f16) | (b_.f16 != b_.f16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { - r_.i16[i] = ( - (simde_float16_to_float32(a_.f16[i]) != simde_float16_to_float32(a_.f16[i])) - || (simde_float16_to_float32(b_.f16[i]) != simde_float16_to_float32(b_.f16[i])) - ) ? ~INT16_C(0) : INT16_C(0); - } - #endif - break; - - case SIMDE_CMP_NEQ_UQ: - case SIMDE_CMP_NEQ_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 != b_.f16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { - r_.i16[i] = ( - (simde_float16_as_uint16(a_.f16[i]) != simde_float16_as_uint16(b_.f16[i])) - || simde_isnanhf(a_.f16[i]) || simde_isnanhf(b_.f16[i]) - ) ? ~INT16_C(0) : INT16_C(0); - } - #endif - break; - - case SIMDE_CMP_NEQ_OQ: - case SIMDE_CMP_NEQ_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 == a_.f16) & (b_.f16 == b_.f16) & (a_.f16 != b_.f16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { - r_.i16[i] = ( - !(simde_isnanhf(a_.f16[i]) || simde_isnanhf(b_.f16[i])) - && (simde_float16_as_uint16(a_.f16[i]) != simde_float16_as_uint16(b_.f16[i])) - ) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - break; - - case SIMDE_CMP_NLT_UQ: - case SIMDE_CMP_NLT_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ~(a_.f16 < b_.f16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { - r_.i16[i] = !( - simde_float16_to_float32(a_.f16[i]) < simde_float16_to_float32(b_.f16[i]) - ) ? ~INT16_C(0) : INT16_C(0); - } - #endif - break; - - case SIMDE_CMP_NLE_UQ: - case SIMDE_CMP_NLE_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ~(a_.f16 <= b_.f16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { - r_.i16[i] = !( - simde_float16_to_float32(a_.f16[i]) <= simde_float16_to_float32(b_.f16[i]) - ) ? ~INT16_C(0) : INT16_C(0); - } - #endif - break; - - case SIMDE_CMP_ORD_Q: - case SIMDE_CMP_ORD_S: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ((a_.f16 == a_.f16) & (b_.f16 == b_.f16))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { - r_.i16[i] = (simde_isnanhf(a_.f16[i]) || simde_isnanhf(b_.f16[i])) ? INT16_C(0) : ~INT16_C(0); - } - #endif - break; - - case SIMDE_CMP_EQ_UQ: - case SIMDE_CMP_EQ_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 != a_.f16) | (b_.f16 != b_.f16) | (a_.f16 == b_.f16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { - r_.i16[i] = ( - (simde_isnanhf(a_.f16[i]) || simde_isnanhf(b_.f16[i])) - || (simde_float16_as_uint16(a_.f16[i]) == simde_float16_as_uint16(b_.f16[i])) - ) ? ~INT16_C(0) : INT16_C(0); - } - #endif - break; - - case SIMDE_CMP_NGE_UQ: - case SIMDE_CMP_NGE_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ~(a_.f16 >= b_.f16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { - r_.i16[i] = !( - simde_float16_to_float32(a_.f16[i]) >= simde_float16_to_float32(b_.f16[i]) - ) ? ~INT16_C(0) : INT16_C(0); - } - #endif - break; - - case SIMDE_CMP_NGT_UQ: - case SIMDE_CMP_NGT_US: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ~(a_.f16 > b_.f16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { - r_.i16[i] = !( - simde_float16_to_float32(a_.f16[i]) > simde_float16_to_float32(b_.f16[i]) - ) ? ~INT16_C(0) : INT16_C(0); - } - #endif - break; - - case SIMDE_CMP_FALSE_OQ: - case SIMDE_CMP_FALSE_OS: - r_ = simde__m512h_to_private(simde_mm512_setzero_ph()); - break; - - case SIMDE_CMP_GE_OQ: - case SIMDE_CMP_GE_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 >= b_.f16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { - r_.i16[i] = ( - simde_float16_to_float32(a_.f16[i]) >= simde_float16_to_float32(b_.f16[i]) - ) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - break; - - case SIMDE_CMP_GT_OQ: - case SIMDE_CMP_GT_OS: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_FLOAT16_VECTOR) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.f16 > b_.f16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { - r_.i16[i] = ( - simde_float16_to_float32(a_.f16[i]) > simde_float16_to_float32(b_.f16[i]) - ) ? ~INT16_C(0) : INT16_C(0); - } - #endif - break; - - case SIMDE_CMP_TRUE_UQ: - case SIMDE_CMP_TRUE_US: - r_ = simde__m512h_to_private(simde_x_mm512_setone_ph()); - break; - - default: - HEDLEY_UNREACHABLE(); - } - - return simde_mm512_movepi16_mask(simde_mm512_castph_si512(simde__m512h_from_private(r_))); -} -#if defined(SIMDE_X86_AVX512FP16_NATIVE) - #define simde_mm512_cmp_ph_mask(a, b, imm8) _mm512_cmp_ph_mask((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmp_ph_mask - #define _mm512_cmp_ph_mask(a, b, imm8) simde_mm512_cmp_ph_mask((a), (b), (imm8)) -#endif - -SIMDE_HUGE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_cmp_epi16_mask (simde__m512i a, simde__m512i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - switch (imm8) { - case SIMDE_MM_CMPINT_EQ: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 == b_.i16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~UINT16_C(0) : UINT16_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_LT: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? ~UINT16_C(0) : UINT16_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_LE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 <= b_.i16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] <= b_.i16[i]) ? ~UINT16_C(0) : UINT16_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_FALSE: - r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); - break; - - - case SIMDE_MM_CMPINT_NE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 != b_.i16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] != b_.i16[i]) ? ~UINT16_C(0) : UINT16_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_NLT: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ~(a_.i16 < b_.i16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = !(a_.i16[i] < b_.i16[i]) ? ~UINT16_C(0) : UINT16_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_NLE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), ~(a_.i16 <= b_.i16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = !(a_.i16[i] <= b_.i16[i]) ? 
~UINT16_C(0) : UINT16_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_TRUE: - r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); - break; - - default: - HEDLEY_UNREACHABLE(); - } - - return simde_mm512_movepi16_mask(simde__m512i_from_private(r_)); -} -#if defined(SIMDE_X86_AVX512BW_NATIVE) - #define simde_mm512_cmp_epi16_mask(a, b, imm8) _mm512_cmp_epi16_mask((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmp_epi16_mask - #define _mm512_cmp_epi16_mask(a, b, imm8) simde_mm512_cmp_epi16_mask((a), (b), (imm8)) -#endif - -#if defined(SIMDE_X86_AVX512BW_NATIVE) - #define simde_mm512_mask_cmp_epi16_mask(k1, a, b, imm8) _mm512_mask_cmp_epi16_mask(k1, a, b, imm8) -#else - #define simde_mm512_mask_cmp_epi16_mask(k1, a, b, imm8) (k1) & simde_mm512_cmp_epi16_mask(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmp_epi16_mask -#define _mm512_mask_cmp_epi16_mask(k1, a, b, imm8) simde_mm512_mask_cmp_epi16_mask((k1), (a), (b), (imm8)) -#endif - -SIMDE_HUGE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_cmp_epi32_mask (simde__m512i a, simde__m512i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - switch (imm8) { - case SIMDE_MM_CMPINT_EQ: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 == b_.i32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_LT: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_LE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 <= b_.i32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] <= b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_FALSE: - r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); - break; - - - case SIMDE_MM_CMPINT_NE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 != b_.i32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] != b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_NLT: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.i32 < b_.i32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = !(a_.i32[i] < b_.i32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_NLE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), ~(a_.i32 <= b_.i32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = !(a_.i32[i] <= b_.i32[i]) ? 
~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_TRUE: - r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); - break; - - default: - HEDLEY_UNREACHABLE(); - } - - return simde_mm512_movepi32_mask(simde__m512i_from_private(r_)); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_cmp_epi32_mask(a, b, imm8) _mm512_cmp_epi32_mask((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmp_epi32_mask - #define _mm512_cmp_epi32_mask(a, b, imm8) simde_mm512_cmp_epi32_mask((a), (b), (imm8)) -#endif - -SIMDE_HUGE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_cmp_epi64_mask (simde__m512i a, simde__m512i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - switch (imm8) { - case SIMDE_MM_CMPINT_EQ: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.i64 == b_.i64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] == b_.i64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_LT: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.i64 < b_.i64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] < b_.i64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_LE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.i64 <= b_.i64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] <= b_.i64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_FALSE: - r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); - break; - - - case SIMDE_MM_CMPINT_NE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.i64 != b_.i64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] != b_.i64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_NLT: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.i64 < b_.i64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = !(a_.i64[i] < b_.i64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_NLE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), ~(a_.i64 <= b_.i64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = !(a_.i64[i] <= b_.i64[i]) ? 
~UINT64_C(0) : UINT64_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_TRUE: - r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); - break; - - default: - HEDLEY_UNREACHABLE(); - } - - return simde_mm512_movepi64_mask(simde__m512i_from_private(r_)); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_cmp_epi64_mask(a, b, imm8) _mm512_cmp_epi64_mask((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmp_epi64_mask - #define _mm512_cmp_epi64_mask(a, b, imm8) simde_mm512_cmp_epi64_mask((a), (b), (imm8)) -#endif - -SIMDE_HUGE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_cmp_epu16_mask (simde__m512i a, simde__m512i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - switch (imm8) { - case SIMDE_MM_CMPINT_EQ: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), (a_.u16 == b_.u16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] == b_.u16[i]) ? ~UINT16_C(0) : UINT16_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_LT: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), (a_.u16 < b_.u16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] < b_.u16[i]) ? ~UINT16_C(0) : UINT16_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_LE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), (a_.u16 <= b_.u16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] <= b_.u16[i]) ? ~UINT16_C(0) : UINT16_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_FALSE: - r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); - break; - - - case SIMDE_MM_CMPINT_NE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), (a_.u16 != b_.u16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] != b_.u16[i]) ? ~UINT16_C(0) : UINT16_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_NLT: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), ~(a_.u16 < b_.u16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = !(a_.u16[i] < b_.u16[i]) ? ~UINT16_C(0) : UINT16_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_NLE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), ~(a_.u16 <= b_.u16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = !(a_.u16[i] <= b_.u16[i]) ? 
~UINT16_C(0) : UINT16_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_TRUE: - r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); - break; - - default: - HEDLEY_UNREACHABLE(); - } - - return simde_mm512_movepi16_mask(simde__m512i_from_private(r_)); -} -#if defined(SIMDE_X86_AVX512BW_NATIVE) - #define simde_mm512_cmp_epu16_mask(a, b, imm8) _mm512_cmp_epu16_mask((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmp_epu16_mask - #define _mm512_cmp_epu16_mask(a, b, imm8) simde_mm512_cmp_epu16_mask((a), (b), (imm8)) -#endif - -#if defined(SIMDE_X86_AVX512BW_NATIVE) - #define simde_mm512_mask_cmp_epu16_mask(k1, a, b, imm8) _mm512_mask_cmp_epu16_mask(k1, a, b, imm8) -#else - #define simde_mm512_mask_cmp_epu16_mask(k1, a, b, imm8) (k1) & simde_mm512_cmp_epu16_mask(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmp_epu16_mask -#define _mm512_mask_cmp_epu16_mask(k1, a, b, imm8) simde_mm512_mask_cmp_epu16_mask((k1), (a), (b), (imm8)) -#endif - -SIMDE_HUGE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_cmp_epu32_mask (simde__m256i a, simde__m256i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - switch (imm8) { - case SIMDE_MM_CMPINT_EQ: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 == b_.u32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] == b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_LT: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 < b_.u32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] < b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_LE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 <= b_.u32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] <= b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_FALSE: - r_ = simde__m256i_to_private(simde_mm256_setzero_si256()); - break; - - - case SIMDE_MM_CMPINT_NE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 != b_.u32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] != b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_NLT: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), ~(a_.u32 < b_.u32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = !(a_.u32[i] < b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_NLE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), ~(a_.u32 <= b_.u32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = !(a_.u32[i] <= b_.u32[i]) ? 
~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_TRUE: - r_ = simde__m256i_to_private(simde_x_mm256_setone_si256()); - break; - - default: - HEDLEY_UNREACHABLE(); - } - - return simde_mm256_movepi32_mask(simde__m256i_from_private(r_)); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_cmp_epu32_mask(a, b, imm8) _mm256_cmp_epu32_mask((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmp_epu32_mask - #define _mm256_cmp_epu32_mask(a, b, imm8) simde_mm256_cmp_epu32_mask((a), (b), (imm8)) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_mask_cmp_epu32_mask(k1, a, b, imm8) _mm256_mask_cmp_epu32_mask(k1, a, b, imm8) -#else - #define simde_mm256_mask_cmp_epu32_mask(k1, a, b, imm8) (k1) & simde_mm256_cmp_epu32_mask((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmp_epu32_mask -#define _mm256_mask_cmp_epu32_mask(a, b, imm8) simde_mm256_mask_cmp_epu32_mask((a), (b), (imm8)) -#endif - -SIMDE_HUGE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_cmp_epu32_mask (simde__m512i a, simde__m512i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - switch (imm8) { - case SIMDE_MM_CMPINT_EQ: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 == b_.u32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] == b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_LT: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 < b_.u32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] < b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_LE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 <= b_.u32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] <= b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_FALSE: - r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); - break; - - - case SIMDE_MM_CMPINT_NE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (a_.u32 != b_.u32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] != b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_NLT: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), ~(a_.u32 < b_.u32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = !(a_.u32[i] < b_.u32[i]) ? 
~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_NLE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), ~(a_.u32 <= b_.u32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = !(a_.u32[i] <= b_.u32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_TRUE: - r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); - break; - - default: - HEDLEY_UNREACHABLE(); - } - - return simde_mm512_movepi32_mask(simde__m512i_from_private(r_)); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_cmp_epu32_mask(a, b, imm8) _mm512_cmp_epu32_mask((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmp_epu32_mask - #define _mm512_cmp_epu32_mask(a, b, imm8) simde_mm512_cmp_epu32_mask((a), (b), (imm8)) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_cmp_epu32_mask(k1, a, b, imm8) _mm512_mask_cmp_epu32_mask(k1, a, b, imm8) -#else - #define simde_mm512_mask_cmp_epu32_mask(k1, a, b, imm8) (k1) & simde_mm512_cmp_epu32_mask(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmp_epu32_mask -#define _mm512_mask_cmp_epu32_mask(k1, a, b, imm8) simde_mm512_mask_cmp_epu32_mask((k1), (a), (b), (imm8)) -#endif - -SIMDE_HUGE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_cmp_epu64_mask (simde__m512i a, simde__m512i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - switch (imm8) { - case SIMDE_MM_CMPINT_EQ: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (a_.u64 == b_.u64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] == b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_LT: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (a_.u64 < b_.u64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] < b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_LE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (a_.u64 <= b_.u64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] <= b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_FALSE: - r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); - break; - - - case SIMDE_MM_CMPINT_NE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (a_.u64 != b_.u64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] != b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_NLT: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), ~(a_.u64 < b_.u64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = !(a_.u64[i] < b_.u64[i]) ? 
~UINT64_C(0) : UINT64_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_NLE: - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), ~(a_.u64 <= b_.u64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = !(a_.u64[i] <= b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - break; - - case SIMDE_MM_CMPINT_TRUE: - r_ = simde__m512i_to_private(simde_x_mm512_setone_si512()); - break; - - default: - HEDLEY_UNREACHABLE(); - } - - return simde_mm512_movepi64_mask(simde__m512i_from_private(r_)); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_cmp_epu64_mask(a, b, imm8) _mm512_cmp_epu64_mask((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmp_epu64_mask - #define _mm512_cmp_epu64_mask(a, b, imm8) simde_mm512_cmp_epu64_mask((a), (b), (imm8)) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_cmp_epu64_mask(k1, a, b, imm8) _mm512_mask_cmp_epu64_mask(k1, a, b, imm8) -#else - #define simde_mm512_mask_cmp_epu64_mask(k1, a, b, imm8) (k1) & simde_mm512_cmp_epu64_mask(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmp_epu64_mask -#define _mm512_mask_cmp_epu64_mask(k1, a, b, imm8) simde_mm512_mask_cmp_epu64_mask((k1), (a), (b), (imm8)) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_CMP_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/cmpeq.h b/ffi-deps/simde/simde/x86/avx512/cmpeq.h deleted file mode 100644 index 41f90b3..0000000 --- a/ffi-deps/simde/simde/x86/avx512/cmpeq.h +++ /dev/null @@ -1,241 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020-2021 Evan Nemerson - * 2020 Himanshi Mathur - */ - -#if !defined(SIMDE_X86_AVX512_CMPEQ_H) -#define SIMDE_X86_AVX512_CMPEQ_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" -#include "mov_mask.h" -#include "cmp.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_cmpeq_epi8_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cmpeq_epi8_mask(a, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - simde__mmask64 r; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.m256i) / sizeof(a_.m256i[0])) ; i++) { - const uint32_t t = HEDLEY_STATIC_CAST(uint32_t, simde_mm256_movemask_epi8(simde_mm256_cmpeq_epi8(a_.m256i[i], b_.m256i[i]))); - r |= HEDLEY_STATIC_CAST(uint64_t, t) << (i * 32); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - simde__m512i_private tmp; - - tmp.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i8), a_.i8 == b_.i8); - r = simde_mm512_movepi8_mask(simde__m512i_from_private(tmp)); - #else - r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { - r |= (a_.u8[i] == b_.u8[i]) ? (UINT64_C(1) << i) : 0; - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpeq_epi8_mask - #define _mm512_cmpeq_epi8_mask(a, b) simde_mm512_cmpeq_epi8_mask(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_mask_cmpeq_epi8_mask(simde__mmask64 k1, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_cmpeq_epi8_mask(k1, a, b); - #else - return simde_mm512_cmpeq_epi8_mask(a, b) & k1; - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmpeq_epi8_mask - #define _mm512_mask_cmpeq_epi8_mask(k1, a, b) simde_mm512_mask_cmpeq_epi8_mask((k1), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_cmpeq_epi32_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cmpeq_epi32_mask(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_cmpeq_epi32(a_.m256i[i], b_.m256i[i]); - } - - return simde_mm512_movepi32_mask(simde__m512i_from_private(r_)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpeq_epi32_mask - #define _mm512_cmpeq_epi32_mask(a, b) simde_mm512_cmpeq_epi32_mask(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_mask_cmpeq_epi32_mask (simde__mmask16 k1, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cmpeq_epi32_mask(k1, a, b); - #else - return simde_mm512_cmpeq_epi32_mask(a, b) & k1; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmpeq_epi32_mask - #define _mm512_mask_cmpeq_epi32_mask(k1, a, b) simde_mm512_mask_cmpeq_epi32_mask(k1, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_cmpeq_epi64_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cmpeq_epi64_mask(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); 
- - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_cmpeq_epi64(a_.m256i[i], b_.m256i[i]); - } - - return simde_mm512_movepi64_mask(simde__m512i_from_private(r_)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpeq_epi64_mask - #define _mm512_cmpeq_epi64_mask(a, b) simde_mm512_cmpeq_epi64_mask(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_mask_cmpeq_epi64_mask (simde__mmask8 k1, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cmpeq_epi64_mask(k1, a, b); - #else - return simde_mm512_cmpeq_epi64_mask(a, b) & k1; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmpeq_epi64_mask - #define _mm512_mask_cmpeq_epi64_mask(k1, a, b) simde_mm512_mask_cmpeq_epi64_mask(k1, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_cmpeq_epu16_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cmpeq_epu16_mask(a, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - simde__mmask32 r; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - simde__m512i_private tmp; - - tmp.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.u16), a_.u16 == b_.u16); - r = simde_mm512_movepi16_mask(simde__m512i_from_private(tmp)); - #else - r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { - r |= (a_.u16[i] == b_.u16[i]) ? (UINT16_C(1) << i) : 0; - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpeq_epu16_mask - #define _mm512_cmpeq_epu16_mask(a, b) simde_mm512_cmpeq_epu16_mask((a), (b)) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_mask_cmpeq_epu16_mask(simde__mmask32 k1, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_cmpeq_epu16_mask(k1, a, b); - #else - return k1 & simde_mm512_cmpeq_epu16_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmpeq_epu16_mask - #define _mm512_mask_cmpeq_epu16_mask(k1, a, b) simde_mm512_mask_cmpeq_epu16_mask(k1, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_cmpeq_ps_mask (simde__m512 a, simde__m512 b) { - return simde_mm512_cmp_ps_mask(a, b, SIMDE_CMP_EQ_OQ); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpeq_ps_mask - #define _mm512_cmpeq_ps_mask(a, b) simde_mm512_cmp_ps_mask(a, b, SIMDE_CMP_EQ_OQ) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_cmpeq_pd_mask (simde__m512d a, simde__m512d b) { - return simde_mm512_cmp_pd_mask(a, b, SIMDE_CMP_EQ_OQ); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpeq_pd_mask - #define _mm512_cmpeq_pd_mask(a, b) simde_mm512_cmp_pd_mask(a, b, SIMDE_CMP_EQ_OQ) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_CMPEQ_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/cmpge.h b/ffi-deps/simde/simde/x86/avx512/cmpge.h deleted file mode 100644 index d0d4287..0000000 --- a/ffi-deps/simde/simde/x86/avx512/cmpge.h +++ /dev/null @@ -1,1434 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including 
without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020-2021 Evan Nemerson - * 2020 Christopher Moore - * 2021 Andrew Rodriguez - */ - -#if !defined(SIMDE_X86_AVX512_CMPGE_H) -#define SIMDE_X86_AVX512_CMPGE_H - -#include "types.h" -#include "mov.h" -#include "mov_mask.h" -#include "movm.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_cmpge_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_movm_epi8(_mm_cmpge_epi8_mask(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vcgeq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_ge(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpge(a_.altivec_i8, b_.altivec_i8)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), a_.i8 >= b_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] >= b_.i8[i]) ? 
~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm_cmpge_epi8_mask (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_cmpge_epi8_mask(a, b); - #else - return simde_mm_movepi8_mask(simde_x_mm_cmpge_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpge_epi8_mask - #define _mm_cmpge_epi8_mask(a, b) simde_mm_cmpge_epi8_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm_mask_cmpge_epi8_mask(simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_cmpge_epi8_mask(k, a, b); - #else - return k & simde_mm_cmpge_epi8_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VBW_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpge_epi8_mask - #define _mm_mask_cmpge_epi8_mask(k, a, b) simde_mm_mask_cmpge_epi8_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_cmpge_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return simde_mm256_movm_epi8(_mm256_cmpge_epi8_mask(a, b)); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmpge_epi8(a_.m128i[i], b_.m128i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), a_.i8 >= b_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] >= b_.i8[i]) ? 
~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm256_cmpge_epi8_mask (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_cmpge_epi8_mask(a, b); - #else - return simde_mm256_movepi8_mask(simde_x_mm256_cmpge_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VBW_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpge_epi8_mask - #define _mm256_cmpge_epi8_mask(a, b) simde_mm256_cmpge_epi8_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm256_mask_cmpge_epi8_mask(simde__mmask32 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_cmpge_epi8_mask(k, a, b); - #else - return k & simde_mm256_cmpge_epi8_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpge_epi8_mask - #define _mm256_mask_cmpge_epi8_mask(k, a, b) simde_mm256_mask_cmpge_epi8_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_cmpge_epi8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return simde_mm512_movm_epi8(_mm512_cmpge_epi8_mask(a, b)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmpge_epi8(a_.m128i[i], b_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_cmpge_epi8(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), a_.i8 >= b_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] >= b_.i8[i]) ? 
~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_cmpge_epi8_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cmpge_epi8_mask(a, b); - #else - return simde_mm512_movepi8_mask(simde_x_mm512_cmpge_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epi8_mask - #define _mm512_cmpge_epi8_mask(a, b) simde_mm512_cmpge_epi8_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_mask_cmpge_epi8_mask(simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_cmpge_epi8_mask(k, a, b); - #else - return k & simde_mm512_cmpge_epi8_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmpge_epi8_mask - #define _mm512_mask_cmpge_epi8_mask(k, a, b) simde_mm512_mask_cmpge_epi8_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_cmpge_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_movm_epi8(_mm_cmpge_epu8_mask(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vcgeq_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u8x16_ge(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_cmpge(a_.altivec_u8, b_.altivec_u8)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u8), a_.u8 >= b_.u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] >= b_.u8[i]) ? 
~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm_cmpge_epu8_mask (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_cmpge_epu8_mask(a, b); - #else - return simde_mm_movepi8_mask(simde_x_mm_cmpge_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpge_epu8_mask - #define _mm_cmpge_epu8_mask(a, b) simde_mm_cmpge_epu8_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm_mask_cmpge_epu8_mask(simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_cmpge_epu8_mask(k, a, b); - #else - return k & simde_mm_cmpge_epu8_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpge_epu8_mask - #define _mm_mask_cmpge_epu8_mask(k, a, b) simde_mm_mask_cmpge_epu8_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_cmpge_epu8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return simde_mm256_movm_epi8(_mm256_cmpge_epu8_mask(a, b)); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmpge_epu8(a_.m128i[i], b_.m128i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u8), a_.u8 >= b_.u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] >= b_.u8[i]) ? 
~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm256_cmpge_epu8_mask (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_cmpge_epu8_mask(a, b); - #else - return simde_mm256_movepi8_mask(simde_x_mm256_cmpge_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpge_epu8_mask - #define _mm256_cmpge_epu8_mask(a, b) simde_mm256_cmpge_epu8_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm256_mask_cmpge_epu8_mask(simde__mmask32 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_cmpge_epu8_mask(k, a, b); - #else - return k & simde_mm256_cmpge_epu8_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpge_epu8_mask - #define _mm256_mask_cmpge_epu8_mask(k, a, b) simde_mm256_mask_cmpge_epu8_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_cmpge_epu8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return simde_mm512_movm_epi8(_mm512_cmpge_epu8_mask(a, b)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmpge_epu8(a_.m128i[i], b_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_cmpge_epu8(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u8), a_.u8 >= b_.u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] >= b_.u8[i]) ? 
~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_cmpge_epu8_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cmpge_epu8_mask(a, b); - #else - return simde_mm512_movepi8_mask(simde_x_mm512_cmpge_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu8_mask - #define _mm512_cmpge_epu8_mask(a, b) simde_mm512_cmpge_epu8_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_mask_cmpge_epu8_mask(simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_cmpge_epu8_mask(k, a, b); - #else - return k & simde_mm512_cmpge_epu8_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmpge_epu8_mask - #define _mm512_mask_cmpge_epu8_mask(k, a, b) simde_mm512_mask_cmpge_epu8_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_cmpge_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_movm_epi16(_mm_cmpge_epi16_mask(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vcgeq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_ge(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpge(a_.altivec_i16, b_.altivec_i16)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), a_.i16 >= b_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i16) / sizeof(a_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] >= b_.i16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmpge_epi16_mask (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_cmpge_epi16_mask(a, b); - #else - return simde_mm_movepi16_mask(simde_x_mm_cmpge_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpge_epi16_mask - #define _mm_cmpge_epi16_mask(a, b) simde_mm_cmpge_epi16_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmpge_epi16_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_cmpge_epi16_mask(k, a, b); - #else - return k & simde_mm_cmpge_epi16_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpge_epi16_mask - #define _mm_mask_cmpge_epi16_mask(k, a, b) simde_mm_mask_cmpge_epi16_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_cmpge_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return simde_mm256_movm_epi16(_mm256_cmpge_epi16_mask(a, b)); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmpge_epi16(a_.m128i[i], b_.m128i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), a_.i16 >= b_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i16) / sizeof(a_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] >= b_.i16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm256_cmpge_epi16_mask (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_cmpge_epi16_mask(a, b); - #else - return simde_mm256_movepi16_mask(simde_x_mm256_cmpge_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpge_epi16_mask - #define _mm256_cmpge_epi16_mask(a, b) simde_mm256_cmpge_epi16_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm256_mask_cmpge_epi16_mask(simde__mmask16 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_cmpge_epi16_mask(k, a, b); - #else - return k & simde_mm256_cmpge_epi16_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpge_epi16_mask - #define _mm256_mask_cmpge_epi16_mask(k, a, b) simde_mm256_mask_cmpge_epi16_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_cmpge_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return simde_mm512_movm_epi16(_mm512_cmpge_epi16_mask(a, b)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmpge_epi16(a_.m128i[i], b_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_cmpge_epi16(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), a_.i16 >= b_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i16) / sizeof(a_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] >= b_.i16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_cmpge_epi16_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cmpge_epi16_mask(a, b); - #else - return simde_mm512_movepi16_mask(simde_x_mm512_cmpge_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epi16_mask - #define _mm512_cmpge_epi16_mask(a, b) simde_mm512_cmpge_epi16_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_mask_cmpge_epi16_mask(simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_cmpge_epi16_mask(k, a, b); - #else - return k & simde_mm512_cmpge_epi16_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmpge_epi16_mask - #define _mm512_mask_cmpge_epi16_mask(k, a, b) simde_mm512_mask_cmpge_epi16_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_cmpge_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_movm_epi16(_mm_cmpge_epu16_mask(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vcgeq_u16(a_.neon_u16, b_.neon_u16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_ge(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), vec_cmpge(a_.altivec_u16, b_.altivec_u16)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), a_.u16 >= b_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] >= b_.u16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmpge_epu16_mask (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_cmpge_epu16_mask(a, b); - #else - return simde_mm_movepi16_mask(simde_x_mm_cmpge_epu16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpge_epu16_mask - #define _mm_cmpge_epu16_mask(a, b) simde_mm_cmpge_epu16_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmpge_epu16_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_cmpge_epu16_mask(k, a, b); - #else - return k & simde_mm_cmpge_epu16_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpge_epu16_mask - #define _mm_mask_cmpge_epu16_mask(k, a, b) simde_mm_mask_cmpge_epu16_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_cmpge_epu16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return simde_mm256_movm_epi16(_mm256_cmpge_epu16_mask(a, b)); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmpge_epu16(a_.m128i[i], b_.m128i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), a_.u16 >= b_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] >= b_.u16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm256_cmpge_epu16_mask (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_cmpge_epu16_mask(a, b); - #else - return simde_mm256_movepi16_mask(simde_x_mm256_cmpge_epu16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpge_epu16_mask - #define _mm256_cmpge_epu16_mask(a, b) simde_mm256_cmpge_epu16_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm256_mask_cmpge_epu16_mask(simde__mmask16 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_cmpge_epu16_mask(k, a, b); - #else - return k & simde_mm256_cmpge_epu16_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpge_epu16_mask - #define _mm256_mask_cmpge_epu16_mask(k, a, b) simde_mm256_mask_cmpge_epu16_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_cmpge_epu16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return simde_mm512_movm_epi16(_mm512_cmpge_epu16_mask(a, b)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmpge_epu16(a_.m128i[i], b_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_cmpge_epu16(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), a_.u16 >= b_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] >= b_.u16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_cmpge_epu16_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cmpge_epu16_mask(a, b); - #else - return simde_mm512_movepi16_mask(simde_x_mm512_cmpge_epu16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu16_mask - #define _mm512_cmpge_epu16_mask(a, b) simde_mm512_cmpge_epu16_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_mask_cmpge_epu16_mask(simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_cmpge_epu16_mask(k, a, b); - #else - return k & simde_mm512_cmpge_epu16_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmpge_epu16_mask - #define _mm512_mask_cmpge_epu16_mask(k, a, b) simde_mm512_mask_cmpge_epu16_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_cmpge_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return simde_mm_movm_epi32(_mm_cmpge_epi32_mask(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcgeq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_ge(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpge(a_.altivec_i32, b_.altivec_i32)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 >= b_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] >= b_.i32[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmpge_epi32_mask (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_cmpge_epi32_mask(a, b); - #else - return simde_mm_movepi32_mask(simde_x_mm_cmpge_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpge_epi32_mask - #define _mm_cmpge_epi32_mask(a, b) simde_mm_cmpge_epi32_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmpge_epi32_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_cmpge_epi32_mask(k, a, b); - #else - return k & simde_mm_cmpge_epi32_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpge_epi32_mask - #define _mm_mask_cmpge_epi32_mask(k, a, b) simde_mm_mask_cmpge_epi32_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_cmpge_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return simde_mm256_movm_epi32(_mm256_cmpge_epi32_mask(a, b)); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmpge_epi32(a_.m128i[i], b_.m128i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 >= b_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] >= b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_cmpge_epi32_mask (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_cmpge_epi32_mask(a, b); - #else - return simde_mm256_movepi32_mask(simde_x_mm256_cmpge_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpge_epi32_mask - #define _mm256_cmpge_epi32_mask(a, b) simde_mm256_cmpge_epi32_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_mask_cmpge_epi32_mask(simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_cmpge_epi32_mask(k, a, b); - #else - return k & simde_mm256_cmpge_epi32_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpge_epi32_mask - #define _mm256_mask_cmpge_epi32_mask(k, a, b) simde_mm256_mask_cmpge_epi32_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_cmpge_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return simde_mm512_movm_epi32(_mm512_cmpge_epi32_mask(a, b)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmpge_epi32(a_.m128i[i], b_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_cmpge_epi32(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = 
HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 >= b_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] >= b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_cmpge_epi32_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cmpge_epi32_mask(a, b); - #else - return simde_mm512_movepi32_mask(simde_x_mm512_cmpge_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epi32_mask - #define _mm512_cmpge_epi32_mask(a, b) simde_mm512_cmpge_epi32_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_mask_cmpge_epi32_mask(simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cmpge_epi32_mask(k, a, b); - #else - return k & simde_mm512_cmpge_epi32_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmpge_epi32_mask - #define _mm512_mask_cmpge_epi32_mask(k, a, b) simde_mm512_mask_cmpge_epi32_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_cmpge_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return simde_mm_movm_epi32(_mm_cmpge_epu32_mask(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcgeq_u32(a_.neon_u32, b_.neon_u32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u32x4_ge(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), vec_cmpge(a_.altivec_u32, b_.altivec_u32)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), a_.u32 >= b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] >= b_.u32[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmpge_epu32_mask (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_cmpge_epu32_mask(a, b); - #else - return simde_mm_movepi32_mask(simde_x_mm_cmpge_epu32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpge_epu32_mask - #define _mm_cmpge_epu32_mask(a, b) simde_mm_cmpge_epu32_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmpge_epu32_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_cmpge_epu32_mask(k, a, b); - #else - return k & simde_mm_cmpge_epu32_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpge_epu32_mask - #define _mm_mask_cmpge_epu32_mask(k, a, b) simde_mm_mask_cmpge_epu32_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_cmpge_epu32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return simde_mm256_movm_epi32(_mm256_cmpge_epu32_mask(a, b)); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmpge_epu32(a_.m128i[i], b_.m128i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), a_.u32 >= b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] >= b_.u32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_cmpge_epu32_mask (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_cmpge_epu32_mask(a, b); - #else - return simde_mm256_movepi32_mask(simde_x_mm256_cmpge_epu32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpge_epu32_mask - #define _mm256_cmpge_epu32_mask(a, b) simde_mm256_cmpge_epu32_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_mask_cmpge_epu32_mask(simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_cmpge_epu32_mask(k, a, b); - #else - return k & simde_mm256_cmpge_epu32_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpge_epu32_mask - #define _mm256_mask_cmpge_epu32_mask(k, a, b) simde_mm256_mask_cmpge_epu32_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_cmpge_epu32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return simde_mm512_movm_epi32(_mm512_cmpge_epu32_mask(a, b)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmpge_epu32(a_.m128i[i], b_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_cmpge_epu32(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = 
HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), a_.u32 >= b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] >= b_.u32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_cmpge_epu32_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cmpge_epu32_mask(a, b); - #else - return simde_mm512_movepi32_mask(simde_x_mm512_cmpge_epu32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu32_mask - #define _mm512_cmpge_epu32_mask(a, b) simde_mm512_cmpge_epu32_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_mask_cmpge_epu32_mask(simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cmpge_epu32_mask(k, a, b); - #else - return k & simde_mm512_cmpge_epu32_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmpge_epu32_mask - #define _mm512_mask_cmpge_epu32_mask(k, a, b) simde_mm512_mask_cmpge_epu32_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_cmpge_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return simde_mm_movm_epi64(_mm_cmpge_epi64_mask(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcgeq_s64(a_.neon_i64, b_.neon_i64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_ge(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), vec_cmpge(a_.altivec_i64, b_.altivec_i64)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 >= b_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] >= b_.i64[i]) ? 
~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmpge_epi64_mask (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_cmpge_epi64_mask(a, b); - #else - return simde_mm_movepi64_mask(simde_x_mm_cmpge_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpge_epi64_mask - #define _mm_cmpge_epi64_mask(a, b) simde_mm_cmpge_epi64_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmpge_epi64_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_cmpge_epi64_mask(k, a, b); - #else - return k & simde_mm_cmpge_epi64_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpge_epi64_mask - #define _mm_mask_cmpge_epi64_mask(k, a, b) simde_mm_mask_cmpge_epi64_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_cmpge_epi64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return simde_mm256_movm_epi64(_mm256_cmpge_epi64_mask(a, b)); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmpge_epi64(a_.m128i[i], b_.m128i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 >= b_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] >= b_.i64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_cmpge_epi64_mask (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_cmpge_epi64_mask(a, b); - #else - return simde_mm256_movepi64_mask(simde_x_mm256_cmpge_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpge_epi64_mask - #define _mm256_cmpge_epi64_mask(a, b) simde_mm256_cmpge_epi64_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_mask_cmpge_epi64_mask(simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_cmpge_epi64_mask(k, a, b); - #else - return k & simde_mm256_cmpge_epi64_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpge_epi64_mask - #define _mm256_mask_cmpge_epi64_mask(k, a, b) simde_mm256_mask_cmpge_epi64_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_cmpge_epi64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return simde_mm512_movm_epi64(_mm512_cmpge_epi64_mask(a, b)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmpge_epi64(a_.m128i[i], b_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_cmpge_epi64(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = 
HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 >= b_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] >= b_.i64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_cmpge_epi64_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cmpge_epi64_mask(a, b); - #else - return simde_mm512_movepi64_mask(simde_x_mm512_cmpge_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epi64_mask - #define _mm512_cmpge_epi64_mask(a, b) simde_mm512_cmpge_epi64_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_mask_cmpge_epi64_mask(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cmpge_epi64_mask(k, a, b); - #else - return k & simde_mm512_cmpge_epi64_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmpge_epi64_mask - #define _mm512_mask_cmpge_epi64_mask(k, a, b) simde_mm512_mask_cmpge_epi64_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_cmpge_epu64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return simde_mm_movm_epi64(_mm_cmpge_epu64_mask(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcgeq_u64(a_.neon_u64, b_.neon_u64); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), vec_cmpge(a_.altivec_u64, b_.altivec_u64)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u64 >= b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] >= b_.u64[i]) ? 
~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmpge_epu64_mask (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_cmpge_epu64_mask(a, b); - #else - return simde_mm_movepi64_mask(simde_x_mm_cmpge_epu64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpge_epu64_mask - #define _mm_cmpge_epu64_mask(a, b) simde_mm_cmpge_epu64_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmpge_epu64_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_cmpge_epu64_mask(k, a, b); - #else - return k & simde_mm_cmpge_epu64_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpge_epu64_mask - #define _mm_mask_cmpge_epu64_mask(k, a, b) simde_mm_mask_cmpge_epu64_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_cmpge_epu64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return simde_mm256_movm_epi64(_mm256_cmpge_epu64_mask(a, b)); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmpge_epu64(a_.m128i[i], b_.m128i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u64 >= b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] >= b_.u64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_cmpge_epu64_mask (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_cmpge_epu64_mask(a, b); - #else - return simde_mm256_movepi64_mask(simde_x_mm256_cmpge_epu64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpge_epu64_mask - #define _mm256_cmpge_epu64_mask(a, b) simde_mm256_cmpge_epu64_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_mask_cmpge_epu64_mask(simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_cmpge_epu64_mask(k, a, b); - #else - return k & simde_mm256_cmpge_epu64_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpge_epu64_mask - #define _mm256_mask_cmpge_epu64_mask(k, a, b) simde_mm256_mask_cmpge_epu64_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_cmpge_epu64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return simde_mm512_movm_epi64(_mm512_cmpge_epu64_mask(a, b)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmpge_epu64(a_.m128i[i], b_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_cmpge_epu64(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = 
HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u64 >= b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] >= b_.u64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_cmpge_epu64_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cmpge_epu64_mask(a, b); - #else - return simde_mm512_movepi64_mask(simde_x_mm512_cmpge_epu64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpge_epu64_mask - #define _mm512_cmpge_epu64_mask(a, b) simde_mm512_cmpge_epu64_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_mask_cmpge_epu64_mask(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cmpge_epu64_mask(k, a, b); - #else - return k & simde_mm512_cmpge_epu64_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmpge_epu64_mask - #define _mm512_mask_cmpge_epu64_mask(k, a, b) simde_mm512_mask_cmpge_epu64_mask((k), (a), (b)) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_CMPGE_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/cmpgt.h b/ffi-deps/simde/simde/x86/avx512/cmpgt.h deleted file mode 100644 index 15245f9..0000000 --- a/ffi-deps/simde/simde/x86/avx512/cmpgt.h +++ /dev/null @@ -1,212 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Christopher Moore - */ - -#if !defined(SIMDE_X86_AVX512_CMPGT_H) -#define SIMDE_X86_AVX512_CMPGT_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" -#include "mov_mask.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_cmpgt_epi8_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cmpgt_epi8_mask(a, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - simde__mmask64 r; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) && !defined(HEDLEY_INTEL_VERSION) - r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.m256i) / sizeof(a_.m256i[0])) ; i++) { - const uint32_t t = HEDLEY_STATIC_CAST(uint32_t, simde_mm256_movemask_epi8(simde_mm256_cmpgt_epi8(a_.m256i[i], b_.m256i[i]))); - r |= HEDLEY_STATIC_CAST(uint64_t, t) << HEDLEY_STATIC_CAST(uint64_t, i * 32); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - simde__m512i_private tmp; - - tmp.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i8), a_.i8 > b_.i8); - r = simde_mm512_movepi8_mask(simde__m512i_from_private(tmp)); - #else - r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) { - r |= (a_.i8[i] > b_.i8[i]) ? (UINT64_C(1) << i) : 0; - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpgt_epi8_mask - #define _mm512_cmpgt_epi8_mask(a, b) simde_mm512_cmpgt_epi8_mask(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_cmpgt_epu8_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cmpgt_epu8_mask(a, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - simde__mmask64 r = 0; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - simde__m512i_private tmp; - - tmp.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i8), a_.u8 > b_.u8); - r = simde_mm512_movepi8_mask(simde__m512i_from_private(tmp)); - #else - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { - r |= (a_.u8[i] > b_.u8[i]) ? 
(UINT64_C(1) << i) : 0; - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpgt_epu8_mask - #define _mm512_cmpgt_epu8_mask(a, b) simde_mm512_cmpgt_epu8_mask(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_cmpgt_epi16_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cmpgt_epi16_mask(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_cmpgt_epi16(a_.m256i[i], b_.m256i[i]); - } - - return simde_mm512_movepi16_mask(simde__m512i_from_private(r_)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpgt_epi16_mask - #define _mm512_cmpgt_epi16_mask(a, b) simde_mm512_cmpgt_epi16_mask(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_cmpgt_epi32_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cmpgt_epi32_mask(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_cmpgt_epi32(a_.m256i[i], b_.m256i[i]); - } - - return simde_mm512_movepi32_mask(simde__m512i_from_private(r_)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpgt_epi32_mask - #define _mm512_cmpgt_epi32_mask(a, b) simde_mm512_cmpgt_epi32_mask(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_mask_cmpgt_epi32_mask (simde__mmask16 k1, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cmpgt_epi32_mask(k1, a, b); - #else - return simde_mm512_cmpgt_epi32_mask(a, b) & k1; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmpgt_epi32_mask - #define _mm512_mask_cmpgt_epi32_mask(k1, a, b) simde_mm512_mask_cmpgt_epi32_mask(k1, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_cmpgt_epi64_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cmpgt_epi64_mask(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_cmpgt_epi64(a_.m256i[i], b_.m256i[i]); - } - - return simde_mm512_movepi64_mask(simde__m512i_from_private(r_)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmpgt_epi64_mask - #define _mm512_cmpgt_epi64_mask(a, b) simde_mm512_cmpgt_epi64_mask(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_mask_cmpgt_epi64_mask (simde__mmask8 k1, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cmpgt_epi64_mask(k1, a, b); - #else - return simde_mm512_cmpgt_epi64_mask(a, b) & k1; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmpgt_epi64_mask - #define _mm512_mask_cmpgt_epi64_mask(k1, a, b) simde_mm512_mask_cmpgt_epi64_mask(k1, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_CMPGT_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/cmple.h b/ffi-deps/simde/simde/x86/avx512/cmple.h deleted file mode 100644 index 9b3c3aa..0000000 --- 
a/ffi-deps/simde/simde/x86/avx512/cmple.h +++ /dev/null @@ -1,1432 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020-2021 Evan Nemerson - */ - -#if !defined(SIMDE_X86_AVX512_CMPLE_H) -#define SIMDE_X86_AVX512_CMPLE_H - -#include "types.h" -#include "mov.h" -#include "mov_mask.h" -#include "movm.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_cmple_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_movm_epi8(_mm_cmple_epi8_mask(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vcleq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_le(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmple(a_.altivec_i8, b_.altivec_i8)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), a_.i8 <= b_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] <= b_.i8[i]) ? 
~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm_cmple_epi8_mask (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_cmple_epi8_mask(a, b); - #else - return simde_mm_movepi8_mask(simde_x_mm_cmple_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_cmple_epi8_mask - #define _mm_cmple_epi8_mask(a, b) simde_mm_cmple_epi8_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm_mask_cmple_epi8_mask(simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_cmple_epi8_mask(k, a, b); - #else - return k & simde_mm_cmple_epi8_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VBW_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmple_epi8_mask - #define _mm_mask_cmple_epi8_mask(k, a, b) simde_mm_mask_cmple_epi8_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_cmple_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return simde_mm256_movm_epi8(_mm256_cmple_epi8_mask(a, b)); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmple_epi8(a_.m128i[i], b_.m128i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), a_.i8 <= b_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] <= b_.i8[i]) ? 
~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm256_cmple_epi8_mask (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_cmple_epi8_mask(a, b); - #else - return simde_mm256_movepi8_mask(simde_x_mm256_cmple_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VBW_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmple_epi8_mask - #define _mm256_cmple_epi8_mask(a, b) simde_mm256_cmple_epi8_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm256_mask_cmple_epi8_mask(simde__mmask32 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_cmple_epi8_mask(k, a, b); - #else - return k & simde_mm256_cmple_epi8_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmple_epi8_mask - #define _mm256_mask_cmple_epi8_mask(k, a, b) simde_mm256_mask_cmple_epi8_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_cmple_epi8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return simde_mm512_movm_epi8(_mm512_cmple_epi8_mask(a, b)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmple_epi8(a_.m128i[i], b_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_cmple_epi8(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), a_.i8 <= b_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] <= b_.i8[i]) ? 
~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_cmple_epi8_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cmple_epi8_mask(a, b); - #else - return simde_mm512_movepi8_mask(simde_x_mm512_cmple_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epi8_mask - #define _mm512_cmple_epi8_mask(a, b) simde_mm512_cmple_epi8_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_mask_cmple_epi8_mask(simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_cmple_epi8_mask(k, a, b); - #else - return k & simde_mm512_cmple_epi8_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmple_epi8_mask - #define _mm512_mask_cmple_epi8_mask(k, a, b) simde_mm512_mask_cmple_epi8_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_cmple_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_movm_epi8(_mm_cmple_epu8_mask(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vcleq_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u8x16_le(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_cmple(a_.altivec_u8, b_.altivec_u8)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u8), a_.u8 <= b_.u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] <= b_.u8[i]) ? 
~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm_cmple_epu8_mask (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_cmple_epu8_mask(a, b); - #else - return simde_mm_movepi8_mask(simde_x_mm_cmple_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_cmple_epu8_mask - #define _mm_cmple_epu8_mask(a, b) simde_mm_cmple_epu8_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm_mask_cmple_epu8_mask(simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_cmple_epu8_mask(k, a, b); - #else - return k & simde_mm_cmple_epu8_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmple_epu8_mask - #define _mm_mask_cmple_epu8_mask(k, a, b) simde_mm_mask_cmple_epu8_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_cmple_epu8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return simde_mm256_movm_epi8(_mm256_cmple_epu8_mask(a, b)); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmple_epu8(a_.m128i[i], b_.m128i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u8), a_.u8 <= b_.u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] <= b_.u8[i]) ? 
~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm256_cmple_epu8_mask (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_cmple_epu8_mask(a, b); - #else - return simde_mm256_movepi8_mask(simde_x_mm256_cmple_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmple_epu8_mask - #define _mm256_cmple_epu8_mask(a, b) simde_mm256_cmple_epu8_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm256_mask_cmple_epu8_mask(simde__mmask32 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_cmple_epu8_mask(k, a, b); - #else - return k & simde_mm256_cmple_epu8_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmple_epu8_mask - #define _mm256_mask_cmple_epu8_mask(k, a, b) simde_mm256_mask_cmple_epu8_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_cmple_epu8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return simde_mm512_movm_epi8(_mm512_cmple_epu8_mask(a, b)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmple_epu8(a_.m128i[i], b_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_cmple_epu8(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u8), a_.u8 <= b_.u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] <= b_.u8[i]) ? 
~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_cmple_epu8_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cmple_epu8_mask(a, b); - #else - return simde_mm512_movepi8_mask(simde_x_mm512_cmple_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu8_mask - #define _mm512_cmple_epu8_mask(a, b) simde_mm512_cmple_epu8_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_mask_cmple_epu8_mask(simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_cmple_epu8_mask(k, a, b); - #else - return k & simde_mm512_cmple_epu8_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmple_epu8_mask - #define _mm512_mask_cmple_epu8_mask(k, a, b) simde_mm512_mask_cmple_epu8_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_cmple_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_movm_epi16(_mm_cmple_epi16_mask(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vcleq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_le(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmple(a_.altivec_i16, b_.altivec_i16)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), a_.i16 <= b_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i16) / sizeof(a_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] <= b_.i16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmple_epi16_mask (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_cmple_epi16_mask(a, b); - #else - return simde_mm_movepi16_mask(simde_x_mm_cmple_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_cmple_epi16_mask - #define _mm_cmple_epi16_mask(a, b) simde_mm_cmple_epi16_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmple_epi16_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_cmple_epi16_mask(k, a, b); - #else - return k & simde_mm_cmple_epi16_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmple_epi16_mask - #define _mm_mask_cmple_epi16_mask(k, a, b) simde_mm_mask_cmple_epi16_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_cmple_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return simde_mm256_movm_epi16(_mm256_cmple_epi16_mask(a, b)); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmple_epi16(a_.m128i[i], b_.m128i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), a_.i16 <= b_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i16) / sizeof(a_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] <= b_.i16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm256_cmple_epi16_mask (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_cmple_epi16_mask(a, b); - #else - return simde_mm256_movepi16_mask(simde_x_mm256_cmple_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmple_epi16_mask - #define _mm256_cmple_epi16_mask(a, b) simde_mm256_cmple_epi16_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm256_mask_cmple_epi16_mask(simde__mmask16 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_cmple_epi16_mask(k, a, b); - #else - return k & simde_mm256_cmple_epi16_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmple_epi16_mask - #define _mm256_mask_cmple_epi16_mask(k, a, b) simde_mm256_mask_cmple_epi16_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_cmple_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return simde_mm512_movm_epi16(_mm512_cmple_epi16_mask(a, b)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmple_epi16(a_.m128i[i], b_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_cmple_epi16(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), a_.i16 <= b_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i16) / sizeof(a_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] <= b_.i16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_cmple_epi16_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cmple_epi16_mask(a, b); - #else - return simde_mm512_movepi16_mask(simde_x_mm512_cmple_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epi16_mask - #define _mm512_cmple_epi16_mask(a, b) simde_mm512_cmple_epi16_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_mask_cmple_epi16_mask(simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_cmple_epi16_mask(k, a, b); - #else - return k & simde_mm512_cmple_epi16_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmple_epi16_mask - #define _mm512_mask_cmple_epi16_mask(k, a, b) simde_mm512_mask_cmple_epi16_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_cmple_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_movm_epi16(_mm_cmple_epu16_mask(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vcleq_u16(a_.neon_u16, b_.neon_u16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_le(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), vec_cmple(a_.altivec_u16, b_.altivec_u16)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), a_.u16 <= b_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] <= b_.u16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmple_epu16_mask (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_cmple_epu16_mask(a, b); - #else - return simde_mm_movepi16_mask(simde_x_mm_cmple_epu16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_cmple_epu16_mask - #define _mm_cmple_epu16_mask(a, b) simde_mm_cmple_epu16_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmple_epu16_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_cmple_epu16_mask(k, a, b); - #else - return k & simde_mm_cmple_epu16_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmple_epu16_mask - #define _mm_mask_cmple_epu16_mask(k, a, b) simde_mm_mask_cmple_epu16_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_cmple_epu16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return simde_mm256_movm_epi16(_mm256_cmple_epu16_mask(a, b)); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmple_epu16(a_.m128i[i], b_.m128i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), a_.u16 <= b_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] <= b_.u16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm256_cmple_epu16_mask (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_cmple_epu16_mask(a, b); - #else - return simde_mm256_movepi16_mask(simde_x_mm256_cmple_epu16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmple_epu16_mask - #define _mm256_cmple_epu16_mask(a, b) simde_mm256_cmple_epu16_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm256_mask_cmple_epu16_mask(simde__mmask16 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_cmple_epu16_mask(k, a, b); - #else - return k & simde_mm256_cmple_epu16_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmple_epu16_mask - #define _mm256_mask_cmple_epu16_mask(k, a, b) simde_mm256_mask_cmple_epu16_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_cmple_epu16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return simde_mm512_movm_epi16(_mm512_cmple_epu16_mask(a, b)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmple_epu16(a_.m128i[i], b_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_cmple_epu16(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), a_.u16 <= b_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] <= b_.u16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_cmple_epu16_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cmple_epu16_mask(a, b); - #else - return simde_mm512_movepi16_mask(simde_x_mm512_cmple_epu16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu16_mask - #define _mm512_cmple_epu16_mask(a, b) simde_mm512_cmple_epu16_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_mask_cmple_epu16_mask(simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_cmple_epu16_mask(k, a, b); - #else - return k & simde_mm512_cmple_epu16_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmple_epu16_mask - #define _mm512_mask_cmple_epu16_mask(k, a, b) simde_mm512_mask_cmple_epu16_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_cmple_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return simde_mm_movm_epi32(_mm_cmple_epi32_mask(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcleq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_le(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmple(a_.altivec_i32, b_.altivec_i32)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 <= b_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] <= b_.i32[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmple_epi32_mask (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_cmple_epi32_mask(a, b); - #else - return simde_mm_movepi32_mask(simde_x_mm_cmple_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cmple_epi32_mask - #define _mm_cmple_epi32_mask(a, b) simde_mm_cmple_epi32_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmple_epi32_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_cmple_epi32_mask(k, a, b); - #else - return k & simde_mm_cmple_epi32_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmple_epi32_mask - #define _mm_mask_cmple_epi32_mask(k, a, b) simde_mm_mask_cmple_epi32_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_cmple_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return simde_mm256_movm_epi32(_mm256_cmple_epi32_mask(a, b)); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmple_epi32(a_.m128i[i], b_.m128i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 <= b_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] <= b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_cmple_epi32_mask (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_cmple_epi32_mask(a, b); - #else - return simde_mm256_movepi32_mask(simde_x_mm256_cmple_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmple_epi32_mask - #define _mm256_cmple_epi32_mask(a, b) simde_mm256_cmple_epi32_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_mask_cmple_epi32_mask(simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_cmple_epi32_mask(k, a, b); - #else - return k & simde_mm256_cmple_epi32_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmple_epi32_mask - #define _mm256_mask_cmple_epi32_mask(k, a, b) simde_mm256_mask_cmple_epi32_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_cmple_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return simde_mm512_movm_epi32(_mm512_cmple_epi32_mask(a, b)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmple_epi32(a_.m128i[i], b_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_cmple_epi32(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = 
HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 <= b_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] <= b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_cmple_epi32_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cmple_epi32_mask(a, b); - #else - return simde_mm512_movepi32_mask(simde_x_mm512_cmple_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epi32_mask - #define _mm512_cmple_epi32_mask(a, b) simde_mm512_cmple_epi32_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_mask_cmple_epi32_mask(simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cmple_epi32_mask(k, a, b); - #else - return k & simde_mm512_cmple_epi32_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmple_epi32_mask - #define _mm512_mask_cmple_epi32_mask(k, a, b) simde_mm512_mask_cmple_epi32_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_cmple_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return simde_mm_movm_epi32(_mm_cmple_epu32_mask(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcleq_u32(a_.neon_u32, b_.neon_u32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u32x4_le(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), vec_cmple(a_.altivec_u32, b_.altivec_u32)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), a_.u32 <= b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] <= b_.u32[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmple_epu32_mask (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_cmple_epu32_mask(a, b); - #else - return simde_mm_movepi32_mask(simde_x_mm_cmple_epu32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cmple_epu32_mask - #define _mm_cmple_epu32_mask(a, b) simde_mm_cmple_epu32_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmple_epu32_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_cmple_epu32_mask(k, a, b); - #else - return k & simde_mm_cmple_epu32_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmple_epu32_mask - #define _mm_mask_cmple_epu32_mask(k, a, b) simde_mm_mask_cmple_epu32_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_cmple_epu32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return simde_mm256_movm_epi32(_mm256_cmple_epu32_mask(a, b)); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmple_epu32(a_.m128i[i], b_.m128i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), a_.u32 <= b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] <= b_.u32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_cmple_epu32_mask (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_cmple_epu32_mask(a, b); - #else - return simde_mm256_movepi32_mask(simde_x_mm256_cmple_epu32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmple_epu32_mask - #define _mm256_cmple_epu32_mask(a, b) simde_mm256_cmple_epu32_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_mask_cmple_epu32_mask(simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_cmple_epu32_mask(k, a, b); - #else - return k & simde_mm256_cmple_epu32_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmple_epu32_mask - #define _mm256_mask_cmple_epu32_mask(k, a, b) simde_mm256_mask_cmple_epu32_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_cmple_epu32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return simde_mm512_movm_epi32(_mm512_cmple_epu32_mask(a, b)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmple_epu32(a_.m128i[i], b_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_cmple_epu32(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = 
HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), a_.u32 <= b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] <= b_.u32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_cmple_epu32_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cmple_epu32_mask(a, b); - #else - return simde_mm512_movepi32_mask(simde_x_mm512_cmple_epu32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu32_mask - #define _mm512_cmple_epu32_mask(a, b) simde_mm512_cmple_epu32_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_mask_cmple_epu32_mask(simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cmple_epu32_mask(k, a, b); - #else - return k & simde_mm512_cmple_epu32_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmple_epu32_mask - #define _mm512_mask_cmple_epu32_mask(k, a, b) simde_mm512_mask_cmple_epu32_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_cmple_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return simde_mm_movm_epi64(_mm_cmple_epi64_mask(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcleq_s64(a_.neon_i64, b_.neon_i64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_le(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), vec_cmple(a_.altivec_i64, b_.altivec_i64)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 <= b_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] <= b_.i64[i]) ? 
~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmple_epi64_mask (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_cmple_epi64_mask(a, b); - #else - return simde_mm_movepi64_mask(simde_x_mm_cmple_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cmple_epi64_mask - #define _mm_cmple_epi64_mask(a, b) simde_mm_cmple_epi64_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmple_epi64_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_cmple_epi64_mask(k, a, b); - #else - return k & simde_mm_cmple_epi64_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmple_epi64_mask - #define _mm_mask_cmple_epi64_mask(k, a, b) simde_mm_mask_cmple_epi64_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_cmple_epi64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return simde_mm256_movm_epi64(_mm256_cmple_epi64_mask(a, b)); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmple_epi64(a_.m128i[i], b_.m128i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 <= b_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] <= b_.i64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_cmple_epi64_mask (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_cmple_epi64_mask(a, b); - #else - return simde_mm256_movepi64_mask(simde_x_mm256_cmple_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmple_epi64_mask - #define _mm256_cmple_epi64_mask(a, b) simde_mm256_cmple_epi64_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_mask_cmple_epi64_mask(simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_cmple_epi64_mask(k, a, b); - #else - return k & simde_mm256_cmple_epi64_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmple_epi64_mask - #define _mm256_mask_cmple_epi64_mask(k, a, b) simde_mm256_mask_cmple_epi64_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_cmple_epi64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return simde_mm512_movm_epi64(_mm512_cmple_epi64_mask(a, b)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmple_epi64(a_.m128i[i], b_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_cmple_epi64(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = 
HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 <= b_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] <= b_.i64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_cmple_epi64_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cmple_epi64_mask(a, b); - #else - return simde_mm512_movepi64_mask(simde_x_mm512_cmple_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epi64_mask - #define _mm512_cmple_epi64_mask(a, b) simde_mm512_cmple_epi64_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_mask_cmple_epi64_mask(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cmple_epi64_mask(k, a, b); - #else - return k & simde_mm512_cmple_epi64_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmple_epi64_mask - #define _mm512_mask_cmple_epi64_mask(k, a, b) simde_mm512_mask_cmple_epi64_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_cmple_epu64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return simde_mm_movm_epi64(_mm_cmple_epu64_mask(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcleq_u64(a_.neon_u64, b_.neon_u64); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), vec_cmple(a_.altivec_u64, b_.altivec_u64)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u64 <= b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] <= b_.u64[i]) ? 
~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmple_epu64_mask (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_cmple_epu64_mask(a, b); - #else - return simde_mm_movepi64_mask(simde_x_mm_cmple_epu64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cmple_epu64_mask - #define _mm_cmple_epu64_mask(a, b) simde_mm_cmple_epu64_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmple_epu64_mask(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_cmple_epu64_mask(k, a, b); - #else - return k & simde_mm_cmple_epu64_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmple_epu64_mask - #define _mm_mask_cmple_epu64_mask(k, a, b) simde_mm_mask_cmple_epu64_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_cmple_epu64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return simde_mm256_movm_epi64(_mm256_cmple_epu64_mask(a, b)); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmple_epu64(a_.m128i[i], b_.m128i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u64 <= b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] <= b_.u64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_cmple_epu64_mask (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_cmple_epu64_mask(a, b); - #else - return simde_mm256_movepi64_mask(simde_x_mm256_cmple_epu64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmple_epu64_mask - #define _mm256_cmple_epu64_mask(a, b) simde_mm256_cmple_epu64_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_mask_cmple_epu64_mask(simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_cmple_epu64_mask(k, a, b); - #else - return k & simde_mm256_cmple_epu64_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmple_epu64_mask - #define _mm256_mask_cmple_epu64_mask(k, a, b) simde_mm256_mask_cmple_epu64_mask((k), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_cmple_epu64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return simde_mm512_movm_epi64(_mm512_cmple_epu64_mask(a, b)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_cmple_epu64(a_.m128i[i], b_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_cmple_epu64(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = 
HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u64 <= b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] <= b_.u64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_cmple_epu64_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cmple_epu64_mask(a, b); - #else - return simde_mm512_movepi64_mask(simde_x_mm512_cmple_epu64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmple_epu64_mask - #define _mm512_cmple_epu64_mask(a, b) simde_mm512_cmple_epu64_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_mask_cmple_epu64_mask(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cmple_epu64_mask(k, a, b); - #else - return k & simde_mm512_cmple_epu64_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cmple_epu64_mask - #define _mm512_mask_cmple_epu64_mask(k, a, b) simde_mm512_mask_cmple_epu64_mask((k), (a), (b)) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_CMPLE_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/cmplt.h b/ffi-deps/simde/simde/x86/avx512/cmplt.h deleted file mode 100644 index 550e901..0000000 --- a/ffi-deps/simde/simde/x86/avx512/cmplt.h +++ /dev/null @@ -1,123 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - */ - -#if !defined(SIMDE_X86_AVX512_CMPLT_H) -#define SIMDE_X86_AVX512_CMPLT_H - -#include "types.h" -#include "mov.h" -#include "cmp.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_cmplt_ps_mask (simde__m512 a, simde__m512 b) { - return simde_mm512_cmp_ps_mask(a, b, SIMDE_CMP_LT_OQ); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmplt_ps_mask - #define _mm512_cmplt_ps_mask(a, b) simde_mm512_cmp_ps_mask(a, b, SIMDE_CMP_LT_OQ) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_cmplt_pd_mask (simde__m512d a, simde__m512d b) { - return simde_mm512_cmp_pd_mask(a, b, SIMDE_CMP_LT_OQ); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmplt_pd_mask - #define _mm512_cmplt_pd_mask(a, b) simde_mm512_cmp_pd_mask(a, b, SIMDE_CMP_LT_OQ) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_cmplt_epi8_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cmplt_epi8_mask(a, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - simde__mmask64 r = 0; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - simde__m512i_private tmp; - - tmp.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i8), a_.i8 < b_.i8); - r = simde_mm512_movepi8_mask(simde__m512i_from_private(tmp)); - #else - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) { - r |= (a_.i8[i] < b_.i8[i]) ? (UINT64_C(1) << i) : 0; - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmplt_epi8_mask - #define _mm512_cmplt_epi8_mask(a, b) simde_mm512_cmplt_epi8_mask(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_cmplt_epu8_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cmplt_epu8_mask(a, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - simde__mmask64 r = 0; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - simde__m512i_private tmp; - - tmp.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp.i8), a_.u8 < b_.u8); - r = simde_mm512_movepi8_mask(simde__m512i_from_private(tmp)); - #else - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { - r |= (a_.u8[i] < b_.u8[i]) ? 
(UINT64_C(1) << i) : 0; - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cmplt_epu8_mask - #define _mm512_cmplt_epu8_mask(a, b) simde_mm512_cmplt_epu8_mask(a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_CMPLT_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/cmpneq.h b/ffi-deps/simde/simde/x86/avx512/cmpneq.h deleted file mode 100644 index 6e9bf33..0000000 --- a/ffi-deps/simde/simde/x86/avx512/cmpneq.h +++ /dev/null @@ -1,490 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2021 Evan Nemerson - */ - -#if !defined(SIMDE_X86_AVX512_CMPNEQ_H) -#define SIMDE_X86_AVX512_CMPNEQ_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" -#include "mov_mask.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm_cmpneq_epi8_mask(simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_cmpneq_epi8_mask(a, b); - #else - return ~simde_mm_movepi8_mask(simde_mm_cmpeq_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpneq_epi8_mask - #define _mm_cmpneq_epi8_mask(a, b) simde_mm_cmpneq_epi8_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm_mask_cmpneq_epi8_mask(simde__mmask16 k1, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_cmpneq_epi8_mask(k1, a, b); - #else - return simde_mm_cmpneq_epi8_mask(a, b) & k1; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpneq_epi8_mask - #define _mm_mask_cmpneq_epi8_mask(k1, a, b) simde_mm_mask_cmpneq_epi8_mask((k1), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm_cmpneq_epu8_mask(simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_cmpneq_epu8_mask(a, b); - #else - return simde_mm_cmpneq_epi8_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpneq_epu8_mask - #define _mm_cmpneq_epu8_mask(a, b) simde_mm_cmpneq_epu8_mask((a), 
(b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm_mask_cmpneq_epu8_mask(simde__mmask16 k1, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_cmpneq_epu8_mask(k1, a, b); - #else - return simde_mm_mask_cmpneq_epi8_mask(k1, a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpneq_epu8_mask - #define _mm_mask_cmpneq_epu8_mask(k1, a, b) simde_mm_mask_cmpneq_epu8_mask((k1), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmpneq_epi16_mask(simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_cmpneq_epi16_mask(a, b); - #else - return ~simde_mm_movepi16_mask(simde_mm_cmpeq_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpneq_epi16_mask - #define _mm_cmpneq_epi16_mask(a, b) simde_mm_cmpneq_epi16_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmpneq_epi16_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_cmpneq_epi16_mask(k1, a, b); - #else - return simde_mm_cmpneq_epi16_mask(a, b) & k1; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpneq_epi16_mask - #define _mm_mask_cmpneq_epi16_mask(k1, a, b) simde_mm_mask_cmpneq_epi16_mask((k1), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmpneq_epu16_mask(simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_cmpneq_epu16_mask(a, b); - #else - return simde_mm_cmpneq_epi16_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpneq_epu16_mask - #define _mm_cmpneq_epu16_mask(a, b) simde_mm_cmpneq_epu16_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmpneq_epu16_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_cmpneq_epu16_mask(k1, a, b); - #else - return simde_mm_mask_cmpneq_epi16_mask(k1, a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpneq_epu16_mask - #define _mm_mask_cmpneq_epu16_mask(k1, a, b) simde_mm_mask_cmpneq_epu16_mask((k1), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmpneq_epi32_mask(simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_cmpneq_epi32_mask(a, b); - #else - return (~simde_mm_movepi32_mask(simde_mm_cmpeq_epi32(a, b))) & 15; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpneq_epi32_mask - #define _mm_cmpneq_epi32_mask(a, b) simde_mm_cmpneq_epi32_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmpneq_epi32_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_cmpneq_epi32_mask(k1, a, b); - #else - return simde_mm_cmpneq_epi32_mask(a, b) & k1; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef 
_mm_mask_cmpneq_epi32_mask - #define _mm_mask_cmpneq_epi32_mask(k1, a, b) simde_mm_mask_cmpneq_epi32_mask((k1), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmpneq_epu32_mask(simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_cmpneq_epu32_mask(a, b); - #else - return simde_mm_cmpneq_epi32_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpneq_epu32_mask - #define _mm_cmpneq_epu32_mask(a, b) simde_mm_cmpneq_epu32_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmpneq_epu32_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_cmpneq_epu32_mask(k1, a, b); - #else - return simde_mm_mask_cmpneq_epi32_mask(k1, a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpneq_epu32_mask - #define _mm_mask_cmpneq_epu32_mask(k1, a, b) simde_mm_mask_cmpneq_epu32_mask((k1), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmpneq_epi64_mask(simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_cmpneq_epi64_mask(a, b); - #else - return (~simde_mm_movepi64_mask(simde_mm_cmpeq_epi64(a, b))) & 3; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpneq_epi64_mask - #define _mm_cmpneq_epi64_mask(a, b) simde_mm_cmpneq_epi64_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmpneq_epi64_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_cmpneq_epi64_mask(k1, a, b); - #else - return simde_mm_cmpneq_epi64_mask(a, b) & k1; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpneq_epi64_mask - #define _mm_mask_cmpneq_epi64_mask(k1, a, b) simde_mm_mask_cmpneq_epi64_mask((k1), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_cmpneq_epu64_mask(simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_cmpneq_epu64_mask(a, b); - #else - return simde_mm_cmpneq_epi64_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpneq_epu64_mask - #define _mm_cmpneq_epu64_mask(a, b) simde_mm_cmpneq_epu64_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_mask_cmpneq_epu64_mask(simde__mmask8 k1, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_cmpneq_epu64_mask(k1, a, b); - #else - return simde_mm_mask_cmpneq_epi64_mask(k1, a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cmpneq_epu64_mask - #define _mm_mask_cmpneq_epu64_mask(k1, a, b) simde_mm_mask_cmpneq_epu64_mask((k1), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm256_cmpneq_epi8_mask(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_cmpneq_epi8_mask(a, b); - #else - return ~simde_mm256_movepi8_mask(simde_mm256_cmpeq_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpneq_epi8_mask - #define _mm256_cmpneq_epi8_mask(a, b) simde_mm256_cmpneq_epi8_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm256_mask_cmpneq_epi8_mask(simde__mmask32 k1, simde__m256i a, simde__m256i b) { - #if 
defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_cmpneq_epi8_mask(k1, a, b); - #else - return simde_mm256_cmpneq_epi8_mask(a, b) & k1; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpneq_epi8_mask - #define _mm256_mask_cmpneq_epi8_mask(k1, a, b) simde_mm256_mask_cmpneq_epi8_mask((k1), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm256_cmpneq_epu8_mask(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_cmpneq_epu8_mask(a, b); - #else - return simde_mm256_cmpneq_epi8_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpneq_epu8_mask - #define _mm256_cmpneq_epu8_mask(a, b) simde_mm256_cmpneq_epu8_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm256_mask_cmpneq_epu8_mask(simde__mmask32 k1, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_cmpneq_epu8_mask(k1, a, b); - #else - return simde_mm256_mask_cmpneq_epi8_mask(k1, a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpneq_epu8_mask - #define _mm256_mask_cmpneq_epu8_mask(k1, a, b) simde_mm256_mask_cmpneq_epu8_mask((k1), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm256_cmpneq_epi16_mask(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_cmpneq_epi16_mask(a, b); - #else - return ~simde_mm256_movepi16_mask(simde_mm256_cmpeq_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpneq_epi16_mask - #define _mm256_cmpneq_epi16_mask(a, b) simde_mm256_cmpneq_epi16_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm256_mask_cmpneq_epi16_mask(simde__mmask16 k1, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_cmpneq_epi16_mask(k1, a, b); - #else - return simde_mm256_cmpneq_epi16_mask(a, b) & k1; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpneq_epi16_mask - #define _mm256_mask_cmpneq_epi16_mask(k1, a, b) simde_mm256_mask_cmpneq_epi16_mask((k1), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm256_cmpneq_epu16_mask(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_cmpneq_epu16_mask(a, b); - #else - return simde_mm256_cmpneq_epi16_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpneq_epu16_mask - #define _mm256_cmpneq_epu16_mask(a, b) simde_mm256_cmpneq_epu16_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm256_mask_cmpneq_epu16_mask(simde__mmask16 k1, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_mask_cmpneq_epu16_mask(k1, a, b); - #else - return simde_mm256_mask_cmpneq_epi16_mask(k1, a, b); - #endif 
-} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpneq_epu16_mask - #define _mm256_mask_cmpneq_epu16_mask(k1, a, b) simde_mm256_mask_cmpneq_epu16_mask((k1), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_cmpneq_epi32_mask(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_cmpneq_epi32_mask(a, b); - #else - return (~simde_mm256_movepi32_mask(simde_mm256_cmpeq_epi32(a, b))); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpneq_epi32_mask - #define _mm256_cmpneq_epi32_mask(a, b) simde_mm256_cmpneq_epi32_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_mask_cmpneq_epi32_mask(simde__mmask8 k1, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_cmpneq_epi32_mask(k1, a, b); - #else - return simde_mm256_cmpneq_epi32_mask(a, b) & k1; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpneq_epi32_mask - #define _mm256_mask_cmpneq_epi32_mask(k1, a, b) simde_mm256_mask_cmpneq_epi32_mask((k1), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_cmpneq_epu32_mask(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_cmpneq_epu32_mask(a, b); - #else - return simde_mm256_cmpneq_epi32_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpneq_epu32_mask - #define _mm256_cmpneq_epu32_mask(a, b) simde_mm256_cmpneq_epu32_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_mask_cmpneq_epu32_mask(simde__mmask8 k1, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_cmpneq_epu32_mask(k1, a, b); - #else - return simde_mm256_mask_cmpneq_epi32_mask(k1, a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpneq_epu32_mask - #define _mm256_mask_cmpneq_epu32_mask(k1, a, b) simde_mm256_mask_cmpneq_epu32_mask((k1), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_cmpneq_epi64_mask(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_cmpneq_epi64_mask(a, b); - #else - return (~simde_mm256_movepi64_mask(simde_mm256_cmpeq_epi64(a, b))) & 15; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpneq_epi64_mask - #define _mm256_cmpneq_epi64_mask(a, b) simde_mm256_cmpneq_epi64_mask((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_mask_cmpneq_epi64_mask(simde__mmask8 k1, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_cmpneq_epi64_mask(k1, a, b); - #else - return simde_mm256_cmpneq_epi64_mask(a, b) & k1; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpneq_epi64_mask - #define _mm256_mask_cmpneq_epi64_mask(k1, a, b) simde_mm256_mask_cmpneq_epi64_mask((k1), (a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_cmpneq_epu64_mask(simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_cmpneq_epu64_mask(a, b); - #else - return simde_mm256_cmpneq_epi64_mask(a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cmpneq_epu64_mask - #define _mm256_cmpneq_epu64_mask(a, b) simde_mm256_cmpneq_epu64_mask((a), (b)) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_mask_cmpneq_epu64_mask(simde__mmask8 k1, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_cmpneq_epu64_mask(k1, a, b); - #else - return simde_mm256_mask_cmpneq_epi64_mask(k1, a, b); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_cmpneq_epu64_mask - #define _mm256_mask_cmpneq_epu64_mask(k1, a, b) simde_mm256_mask_cmpneq_epu64_mask((k1), (a), (b)) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_CMPNEQ_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/compress.h b/ffi-deps/simde/simde/x86/avx512/compress.h deleted file mode 100644 index 06fffc7..0000000 --- a/ffi-deps/simde/simde/x86/avx512/compress.h +++ /dev/null @@ -1,755 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_COMPRESS_H) -#define SIMDE_X86_AVX512_COMPRESS_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_mask_compress_pd (simde__m256d src, simde__mmask8 k, simde__m256d a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm256_mask_compress_pd(src, k, a); - #else - simde__m256d_private - a_ = simde__m256d_to_private(a), - src_ = simde__m256d_to_private(src); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - if ((k >> i) & 1) { - a_.f64[ri++] = a_.f64[i]; - } - } - - for ( ; ri < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; ri++) { - a_.f64[ri] = src_.f64[ri]; - } - - return simde__m256d_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_compress_pd - #define _mm256_mask_compress_pd(src, k, a) simde_mm256_mask_compress_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_mask_compressstoreu_pd (void* base_addr, simde__mmask8 k, simde__m256d a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) - _mm256_mask_compressstoreu_pd(base_addr, k, a); - #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) - simde__mmask8 store_mask = _pext_u32(-1, k); - _mm256_mask_storeu_pd(base_addr, store_mask, _mm256_maskz_compress_pd(k, a)); - #else - simde__m256d_private - a_ = simde__m256d_to_private(a); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - if ((k >> i) & 1) { - a_.f64[ri++] = a_.f64[i]; - } - } - - simde_memcpy(base_addr, &a_, ri * sizeof(a_.f64[0])); - - return; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_compressstoreu_pd - #define _mm256_mask_compressstoreu_pd(base_addr, k, a) simde_mm256_mask_compressstoreu_pd(base_addr, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_maskz_compress_pd (simde__mmask8 k, simde__m256d a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm256_maskz_compress_pd(k, a); - #else - simde__m256d_private - a_ = simde__m256d_to_private(a); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - if ((k >> i) & 1) { - a_.f64[ri++] = a_.f64[i]; - } - } - - for ( ; ri < (sizeof(a_.f64) / sizeof(a_.f64[0])); ri++) { - a_.f64[ri] = 
SIMDE_FLOAT64_C(0.0); - } - - return simde__m256d_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_compress_pd - #define _mm256_maskz_compress_pd(k, a) simde_mm256_maskz_compress_pd(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_mask_compress_ps (simde__m256 src, simde__mmask8 k, simde__m256 a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm256_mask_compress_ps(src, k, a); - #else - simde__m256_private - a_ = simde__m256_to_private(a), - src_ = simde__m256_to_private(src); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - if ((k >> i) & 1) { - a_.f32[ri++] = a_.f32[i]; - } - } - - for ( ; ri < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; ri++) { - a_.f32[ri] = src_.f32[ri]; - } - - return simde__m256_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_compress_ps - #define _mm256_mask_compress_ps(src, k, a) simde_mm256_mask_compress_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_mask_compressstoreu_ps (void* base_addr, simde__mmask8 k, simde__m256 a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) - _mm256_mask_compressstoreu_ps(base_addr, k, a); - #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) - simde__mmask8 store_mask = _pext_u32(-1, k); - _mm256_mask_storeu_ps(base_addr, store_mask, _mm256_maskz_compress_ps(k, a)); - #else - simde__m256_private - a_ = simde__m256_to_private(a); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - if ((k >> i) & 1) { - a_.f32[ri++] = a_.f32[i]; - } - } - - simde_memcpy(base_addr, &a_, ri * sizeof(a_.f32[0])); - - return; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_compressstoreu_ps - #define _mm256_mask_compressstoreu_ps(base_addr, k, a) simde_mm256_mask_compressstoreu_ps(base_addr, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_maskz_compress_ps (simde__mmask8 k, simde__m256 a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm256_maskz_compress_ps(k, a); - #else - simde__m256_private - a_ = simde__m256_to_private(a); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - if ((k >> i) & 1) { - a_.f32[ri++] = a_.f32[i]; - } - } - - for ( ; ri < (sizeof(a_.f32) / sizeof(a_.f32[0])); ri++) { - a_.f32[ri] = SIMDE_FLOAT32_C(0.0); - } - - return simde__m256_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_compress_ps - #define _mm256_maskz_compress_ps(k, a) simde_mm256_maskz_compress_ps(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_compress_epi32 (simde__m256i src, simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm256_mask_compress_epi32(src, k, a); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - src_ = simde__m256i_to_private(src); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < 
(sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - if ((k >> i) & 1) { - a_.i32[ri++] = a_.i32[i]; - } - } - - for ( ; ri < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; ri++) { - a_.i32[ri] = src_.i32[ri]; - } - - return simde__m256i_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_compress_epi32 - #define _mm256_mask_compress_epi32(src, k, a) simde_mm256_mask_compress_epi32(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_mask_compressstoreu_epi32 (void* base_addr, simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) - _mm256_mask_compressstoreu_epi32(base_addr, k, a); - #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) - simde__mmask8 store_mask = _pext_u32(-1, k); - _mm256_mask_storeu_epi32(base_addr, store_mask, _mm256_maskz_compress_epi32(k, a)); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - if ((k >> i) & 1) { - a_.i32[ri++] = a_.i32[i]; - } - } - - simde_memcpy(base_addr, &a_, ri * sizeof(a_.i32[0])); - - return; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_compressstoreu_epi32 - #define _mm256_mask_compressstoreu_epi32(base_addr, k, a) simde_mm256_mask_compressstoreu_epi32(base_addr, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_compress_epi32 (simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm256_maskz_compress_epi32(k, a); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - if ((k >> i) & 1) { - a_.i32[ri++] = a_.i32[i]; - } - } - - for ( ; ri < (sizeof(a_.i32) / sizeof(a_.i32[0])); ri++) { - a_.f32[ri] = INT32_C(0); - } - - return simde__m256i_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_compress_epi32 - #define _mm256_maskz_compress_epi32(k, a) simde_mm256_maskz_compress_epi32(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_compress_epi64 (simde__m256i src, simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm256_mask_compress_epi64(src, k, a); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - src_ = simde__m256i_to_private(src); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - if ((k >> i) & 1) { - a_.i64[ri++] = a_.i64[i]; - } - } - - for ( ; ri < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; ri++) { - a_.i64[ri] = src_.i64[ri]; - } - - return simde__m256i_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_compress_epi64 - #define _mm256_mask_compress_epi64(src, k, a) simde_mm256_mask_compress_epi64(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_mask_compressstoreu_epi64 (void* base_addr, simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && 
defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) - _mm256_mask_compressstoreu_epi64(base_addr, k, a); - #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) - simde__mmask8 store_mask = _pext_u32(-1, k); - _mm256_mask_storeu_epi64(base_addr, store_mask, _mm256_maskz_compress_epi64(k, a)); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - if ((k >> i) & 1) { - a_.i64[ri++] = a_.i64[i]; - } - } - - simde_memcpy(base_addr, &a_, ri * sizeof(a_.i64[0])); - - return; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_compressstoreu_epi64 - #define _mm256_mask_compressstoreu_epi64(base_addr, k, a) simde_mm256_mask_compressstoreu_epi64(base_addr, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_compress_epi64 (simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm256_maskz_compress_epi64(k, a); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - if ((k >> i) & 1) { - a_.i64[ri++] = a_.i64[i]; - } - } - - for ( ; ri < (sizeof(a_.i64) / sizeof(a_.i64[0])); ri++) { - a_.i64[ri] = INT64_C(0); - } - - return simde__m256i_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_compress_epi64 - #define _mm256_maskz_compress_epi64(k, a) simde_mm256_maskz_compress_epi64(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_compress_pd (simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_compress_pd(src, k, a); - #else - simde__m512d_private - a_ = simde__m512d_to_private(a), - src_ = simde__m512d_to_private(src); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - if ((k >> i) & 1) { - a_.f64[ri++] = a_.f64[i]; - } - } - - for ( ; ri < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; ri++) { - a_.f64[ri] = src_.f64[ri]; - } - - return simde__m512d_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_compress_pd - #define _mm512_mask_compress_pd(src, k, a) simde_mm512_mask_compress_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_mask_compressstoreu_pd (void* base_addr, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) - _mm512_mask_compressstoreu_pd(base_addr, k, a); - #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) - simde__mmask8 store_mask = _pext_u32(-1, k); - _mm512_mask_storeu_pd(base_addr, store_mask, _mm512_maskz_compress_pd(k, a)); - #else - simde__m512d_private - a_ = simde__m512d_to_private(a); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - if ((k >> i) & 1) { - a_.f64[ri++] = a_.f64[i]; - } - } - - simde_memcpy(base_addr, &a_, ri * sizeof(a_.f64[0])); - - return; - #endif -} -#if 
defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_compressstoreu_pd - #define _mm512_mask_compressstoreu_pd(base_addr, k, a) simde_mm512_mask_compressstoreu_pd(base_addr, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_compress_pd (simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_compress_pd(k, a); - #else - simde__m512d_private - a_ = simde__m512d_to_private(a); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - if ((k >> i) & 1) { - a_.f64[ri++] = a_.f64[i]; - } - } - - for ( ; ri < (sizeof(a_.f64) / sizeof(a_.f64[0])); ri++) { - a_.f64[ri] = SIMDE_FLOAT64_C(0.0); - } - - return simde__m512d_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_compress_pd - #define _mm512_maskz_compress_pd(k, a) simde_mm512_maskz_compress_pd(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_compress_ps (simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_compress_ps(src, k, a); - #else - simde__m512_private - a_ = simde__m512_to_private(a), - src_ = simde__m512_to_private(src); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - if ((k >> i) & 1) { - a_.f32[ri++] = a_.f32[i]; - } - } - - for ( ; ri < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; ri++) { - a_.f32[ri] = src_.f32[ri]; - } - - return simde__m512_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_compress_ps - #define _mm512_mask_compress_ps(src, k, a) simde_mm512_mask_compress_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_mask_compressstoreu_ps (void* base_addr, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) - _mm512_mask_compressstoreu_ps(base_addr, k, a); - #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) - simde__mmask16 store_mask = _pext_u32(-1, k); - _mm512_mask_storeu_ps(base_addr, store_mask, _mm512_maskz_compress_ps(k, a)); - #else - simde__m512_private - a_ = simde__m512_to_private(a); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - if ((k >> i) & 1) { - a_.f32[ri++] = a_.f32[i]; - } - } - - simde_memcpy(base_addr, &a_, ri * sizeof(a_.f32[0])); - - return; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_compressstoreu_ps - #define _mm512_mask_compressstoreu_ps(base_addr, k, a) simde_mm512_mask_compressstoreu_ps(base_addr, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_compress_ps (simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_compress_ps(k, a); - #else - simde__m512_private - a_ = simde__m512_to_private(a); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - if ((k >> i) & 1) { - a_.f32[ri++] = a_.f32[i]; - } - } - - for 
( ; ri < (sizeof(a_.f32) / sizeof(a_.f32[0])); ri++) { - a_.f32[ri] = SIMDE_FLOAT32_C(0.0); - } - - return simde__m512_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_compress_ps - #define _mm512_maskz_compress_ps(k, a) simde_mm512_maskz_compress_ps(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_compress_epi32 (simde__m512i src, simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_compress_epi32(src, k, a); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - src_ = simde__m512i_to_private(src); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - if ((k >> i) & 1) { - a_.i32[ri++] = a_.i32[i]; - } - } - - for ( ; ri < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; ri++) { - a_.i32[ri] = src_.i32[ri]; - } - - return simde__m512i_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_compress_epi32 - #define _mm512_mask_compress_epi32(src, k, a) simde_mm512_mask_compress_epi32(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_mask_compressstoreu_epi16 (void* base_addr, simde__mmask32 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512VBMI2_NATIVE) && !defined(__znver4__) - _mm512_mask_compressstoreu_epi16(base_addr, k, a); - #elif defined(SIMDE_X86_AVX512VBMI2_NATIVE) && defined(__znver4__) - simde__mmask32 store_mask = _pext_u32(-1, k); - _mm512_mask_storeu_epi16(base_addr, store_mask, _mm512_maskz_compress_epi16(k, a)); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i16) / sizeof(a_.i16[0])) ; i++) { - if ((k >> i) & 1) { - a_.i16[ri++] = a_.i16[i]; - } - } - - simde_memcpy(base_addr, &a_, ri * sizeof(a_.i16[0])); - - return; - #endif -} -#if defined(SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_compressstoreu_epi16 - #define _mm512_mask_compressstoreu_epi16(base_addr, k, a) simde_mm512_mask_compressstoreu_epi16(base_addr, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_mask_compressstoreu_epi32 (void* base_addr, simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) - _mm512_mask_compressstoreu_epi32(base_addr, k, a); - #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) - simde__mmask16 store_mask = _pext_u32(-1, k); - _mm512_mask_storeu_epi32(base_addr, store_mask, _mm512_maskz_compress_epi32(k, a)); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - if ((k >> i) & 1) { - a_.i32[ri++] = a_.i32[i]; - } - } - - simde_memcpy(base_addr, &a_, ri * sizeof(a_.i32[0])); - - return; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_compressstoreu_epi32 - #define _mm512_mask_compressstoreu_epi32(base_addr, k, a) simde_mm512_mask_compressstoreu_epi32(base_addr, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_compress_epi32 (simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && 
defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_compress_epi32(k, a); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - if ((k >> i) & 1) { - a_.i32[ri++] = a_.i32[i]; - } - } - - for ( ; ri < (sizeof(a_.i32) / sizeof(a_.i32[0])); ri++) { - a_.f32[ri] = INT32_C(0); - } - - return simde__m512i_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_compress_epi32 - #define _mm512_maskz_compress_epi32(k, a) simde_mm512_maskz_compress_epi32(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_compress_epi64 (simde__m512i src, simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_compress_epi64(src, k, a); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - src_ = simde__m512i_to_private(src); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - if ((k >> i) & 1) { - a_.i64[ri++] = a_.i64[i]; - } - } - - for ( ; ri < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; ri++) { - a_.i64[ri] = src_.i64[ri]; - } - - return simde__m512i_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_compress_epi64 - #define _mm512_mask_compress_epi64(src, k, a) simde_mm512_mask_compress_epi64(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_mask_compressstoreu_epi64 (void* base_addr, simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && !defined(__znver4__) - _mm512_mask_compressstoreu_epi64(base_addr, k, a); - #elif defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) && defined(__znver4__) - simde__mmask8 store_mask = _pext_u32(-1, k); - _mm512_mask_storeu_epi64(base_addr, store_mask, _mm512_maskz_compress_epi64(k, a)); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - if ((k >> i) & 1) { - a_.i64[ri++] = a_.i64[i]; - } - } - - simde_memcpy(base_addr, &a_, ri * sizeof(a_.i64[0])); - - return; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_compressstoreu_epi64 - #define _mm512_mask_compressstoreu_epi64(base_addr, k, a) simde_mm512_mask_compressstoreu_epi64(base_addr, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_compress_epi64 (simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_compress_epi64(k, a); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a); - size_t ri = 0; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - if ((k >> i) & 1) { - a_.i64[ri++] = a_.i64[i]; - } - } - - for ( ; ri < (sizeof(a_.i64) / sizeof(a_.i64[0])); ri++) { - a_.i64[ri] = INT64_C(0); - } - - return simde__m512i_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_compress_epi64 - #define _mm512_maskz_compress_epi64(k, a) 
simde_mm512_maskz_compress_epi64(k, a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_COMPRESS_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/conflict.h b/ffi-deps/simde/simde/x86/avx512/conflict.h deleted file mode 100644 index 239aef9..0000000 --- a/ffi-deps/simde/simde/x86/avx512/conflict.h +++ /dev/null @@ -1,351 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_CONFLICT_H) -#define SIMDE_X86_AVX512_CONFLICT_H - -#include "types.h" -#include "mov_mask.h" -#include "mov.h" -#include "cmpeq.h" -#include "set1.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_conflict_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm_conflict_epi32(a); - #else - simde__m128i_private - r_ = simde__m128i_to_private(simde_mm_setzero_si128()), - a_ = simde__m128i_to_private(a); - - for (size_t i = 1 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = - simde_mm_movemask_ps( - simde_mm_castsi128_ps( - simde_mm_cmpeq_epi32(simde_mm_set1_epi32(a_.i32[i]), a) - ) - ) & ((1 << i) - 1); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm_conflict_epi32 - #define _mm_conflict_epi32(a) simde_mm_conflict_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_conflict_epi32 (simde__m128i src, simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm_mask_conflict_epi32(src, k, a); - #else - return simde_mm_mask_mov_epi32(src, k, simde_mm_conflict_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_conflict_epi32 - #define _mm_mask_conflict_epi32(src, k, a) simde_mm_mask_conflict_epi32(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_conflict_epi32 (simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm_maskz_conflict_epi32(k, a); - #else - return simde_mm_maskz_mov_epi32(k, simde_mm_conflict_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_conflict_epi32 - #define _mm_maskz_conflict_epi32(k, a) simde_mm_maskz_conflict_epi32(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_conflict_epi32 (simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm256_conflict_epi32(a); - #else - simde__m256i_private - r_ = simde__m256i_to_private(simde_mm256_setzero_si256()), - a_ = simde__m256i_to_private(a); - - for (size_t i = 1 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = - simde_mm256_movemask_ps( - simde_mm256_castsi256_ps( - simde_mm256_cmpeq_epi32(simde_mm256_set1_epi32(a_.i32[i]), a) - ) - ) & ((1 << i) - 1); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm256_conflict_epi32 - #define _mm256_conflict_epi32(a) simde_mm256_conflict_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_conflict_epi32 (simde__m256i src, simde__mmask8 k, simde__m256i a) { - #if 
defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm256_mask_conflict_epi32(src, k, a); - #else - return simde_mm256_mask_mov_epi32(src, k, simde_mm256_conflict_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_conflict_epi32 - #define _mm256_mask_conflict_epi32(src, k, a) simde_mm256_mask_conflict_epi32(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_conflict_epi32 (simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm256_maskz_conflict_epi32(k, a); - #else - return simde_mm256_maskz_mov_epi32(k, simde_mm256_conflict_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_conflict_epi32 - #define _mm256_maskz_conflict_epi32(k, a) simde_mm256_maskz_conflict_epi32(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_conflict_epi32 (simde__m512i a) { - #if defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm512_conflict_epi32(a); - #else - simde__m512i_private - r_ = simde__m512i_to_private(simde_mm512_setzero_si512()), - a_ = simde__m512i_to_private(a); - - for (size_t i = 1 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = - HEDLEY_STATIC_CAST( - int32_t, - simde_mm512_cmpeq_epi32_mask(simde_mm512_set1_epi32(a_.i32[i]), a) - ) & ((1 << i) - 1); - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm512_conflict_epi32 - #define _mm512_conflict_epi32(a) simde_mm512_conflict_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_conflict_epi32 (simde__m512i src, simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm512_mask_conflict_epi32(src, k, a); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_conflict_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_conflict_epi32 - #define _mm512_mask_conflict_epi32(src, k, a) simde_mm512_mask_conflict_epi32(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_conflict_epi32 (simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm512_maskz_conflict_epi32(k, a); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_conflict_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_conflict_epi32 - #define _mm512_maskz_conflict_epi32(k, a) simde_mm512_maskz_conflict_epi32(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_conflict_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm_conflict_epi64(a); - #else - simde__m128i_private - r_ = simde__m128i_to_private(simde_mm_setzero_si128()), - a_ = simde__m128i_to_private(a); - - for (size_t i = 1 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = - HEDLEY_STATIC_CAST( - int64_t, - simde_mm_movemask_pd( - simde_mm_castsi128_pd( - simde_mm_cmpeq_epi64(simde_mm_set1_epi64x(a_.i64[i]), a) - ) - ) - ) & ((1 << i) - 1); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm_conflict_epi64 - #define _mm_conflict_epi64(a) 
simde_mm_conflict_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_conflict_epi64 (simde__m128i src, simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm_mask_conflict_epi64(src, k, a); - #else - return simde_mm_mask_mov_epi64(src, k, simde_mm_conflict_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_conflict_epi64 - #define _mm_mask_conflict_epi64(src, k, a) simde_mm_mask_conflict_epi64(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_conflict_epi64 (simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm_maskz_conflict_epi64(k, a); - #else - return simde_mm_maskz_mov_epi64(k, simde_mm_conflict_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_conflict_epi64 - #define _mm_maskz_conflict_epi64(k, a) simde_mm_maskz_conflict_epi64(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_conflict_epi64 (simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm256_conflict_epi64(a); - #else - simde__m256i_private - r_ = simde__m256i_to_private(simde_mm256_setzero_si256()), - a_ = simde__m256i_to_private(a); - - for (size_t i = 1 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = - HEDLEY_STATIC_CAST( - int64_t, - simde_mm256_movemask_pd( - simde_mm256_castsi256_pd( - simde_mm256_cmpeq_epi64(simde_mm256_set1_epi64x(a_.i64[i]), a) - ) - ) - ) & ((1 << i) - 1); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm256_conflict_epi64 - #define _mm256_conflict_epi64(a) simde_mm256_conflict_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_conflict_epi64 (simde__m256i src, simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm256_mask_conflict_epi64(src, k, a); - #else - return simde_mm256_mask_mov_epi64(src, k, simde_mm256_conflict_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_conflict_epi64 - #define _mm256_mask_conflict_epi64(src, k, a) simde_mm256_mask_conflict_epi64(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_conflict_epi64 (simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm256_maskz_conflict_epi64(k, a); - #else - return simde_mm256_maskz_mov_epi64(k, simde_mm256_conflict_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_conflict_epi64 - #define _mm256_maskz_conflict_epi64(k, a) simde_mm256_maskz_conflict_epi64(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_conflict_epi64 (simde__m512i a) { - #if defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm512_conflict_epi64(a); - #else - simde__m512i_private - r_ = simde__m512i_to_private(simde_mm512_setzero_si512()), - a_ = simde__m512i_to_private(a); - - for (size_t i = 1 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - 
r_.i64[i] = - HEDLEY_STATIC_CAST( - int64_t, - simde_mm512_cmpeq_epi64_mask(simde_mm512_set1_epi64(a_.i64[i]), a) - ) & ((1 << i) - 1); - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm512_conflict_epi64 - #define _mm512_conflict_epi64(a) simde_mm512_conflict_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_conflict_epi64 (simde__m512i src, simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm512_mask_conflict_epi64(src, k, a); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_conflict_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_conflict_epi64 - #define _mm512_mask_conflict_epi64(src, k, a) simde_mm512_mask_conflict_epi64(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_conflict_epi64 (simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm512_maskz_conflict_epi64(k, a); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_conflict_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_conflict_epi64 - #define _mm512_maskz_conflict_epi64(k, a) simde_mm512_maskz_conflict_epi64(k, a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_CONFLICT_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/copysign.h b/ffi-deps/simde/simde/x86/avx512/copysign.h deleted file mode 100644 index eba37e9..0000000 --- a/ffi-deps/simde/simde/x86/avx512/copysign.h +++ /dev/null @@ -1,86 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - */ - -#if !defined(SIMDE_X86_AVX512_COPYSIGN_H) -#define SIMDE_X86_AVX512_COPYSIGN_H - -#include "types.h" -#include "mov.h" -#include "and.h" -#include "andnot.h" -#include "xor.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_x_mm512_copysign_ps(simde__m512 dest, simde__m512 src) { - simde__m512_private - r_, - dest_ = simde__m512_to_private(dest), - src_ = simde__m512_to_private(src); - - #if defined(simde_math_copysignf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_copysignf(dest_.f32[i], src_.f32[i]); - } - #else - simde__m512 sgnbit = simde_mm512_xor_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C(0.0)), simde_mm512_set1_ps(-SIMDE_FLOAT32_C(0.0))); - return simde_mm512_xor_ps(simde_mm512_and_ps(sgnbit, src), simde_mm512_andnot_ps(sgnbit, dest)); - #endif - - return simde__m512_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_x_mm512_copysign_pd(simde__m512d dest, simde__m512d src) { - simde__m512d_private - r_, - dest_ = simde__m512d_to_private(dest), - src_ = simde__m512d_to_private(src); - - #if defined(simde_math_copysign) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_copysign(dest_.f64[i], src_.f64[i]); - } - #else - simde__m512d sgnbit = simde_mm512_xor_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C(0.0)), simde_mm512_set1_pd(-SIMDE_FLOAT64_C(0.0))); - return simde_mm512_xor_pd(simde_mm512_and_pd(sgnbit, src), simde_mm512_andnot_pd(sgnbit, dest)); - #endif - - return simde__m512d_from_private(r_); -} - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_COPYSIGN_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/cvt.h b/ffi-deps/simde/simde/x86/avx512/cvt.h deleted file mode 100644 index 579bcac..0000000 --- a/ffi-deps/simde/simde/x86/avx512/cvt.h +++ /dev/null @@ -1,402 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020-2021 Evan Nemerson - * 2020 Himanshi Mathur - * 2020 Hidayat Khan - * 2021 Andrew Rodriguez - */ - -#if !defined(SIMDE_X86_AVX512_CVT_H) -#define SIMDE_X86_AVX512_CVT_H - -#include "types.h" -#include "mov.h" -#include "../../simde-f16.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cvtepi64_pd (simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm_cvtepi64_pd(a); - #else - simde__m128d_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_X86_SSE2_NATIVE) - /* https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx */ - __m128i xH = _mm_srai_epi32(a_.n, 16); - #if defined(SIMDE_X86_SSE4_2_NATIVE) - xH = _mm_blend_epi16(xH, _mm_setzero_si128(), 0x33); - #else - xH = _mm_and_si128(xH, _mm_set_epi16(~INT16_C(0), ~INT16_C(0), INT16_C(0), INT16_C(0), ~INT16_C(0), ~INT16_C(0), INT16_C(0), INT16_C(0))); - #endif - xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.0))); - const __m128i e = _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)); - #if defined(SIMDE_X86_SSE4_2_NATIVE) - __m128i xL = _mm_blend_epi16(a_.n, e, 0x88); - #else - __m128i m = _mm_set_epi16(INT16_C(0), ~INT16_C(0), ~INT16_C(0), ~INT16_C(0), INT16_C(0), ~INT16_C(0), ~INT16_C(0), ~INT16_C(0)); - __m128i xL = _mm_or_si128(_mm_and_si128(m, a_.n), _mm_andnot_si128(m, e)); - #endif - __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.0)); - return _mm_add_pd(f, _mm_castsi128_pd(xL)); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.f64, a_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = HEDLEY_STATIC_CAST(simde_float64, a_.i64[i]); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepi64_pd - #define _mm_cvtepi64_pd(a) simde_mm_cvtepi64_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mask_cvtepi64_pd(simde__m128d src, simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm_mask_cvtepi64_pd(src, k, a); - #else - return simde_mm_mask_mov_pd(src, k, simde_mm_cvtepi64_pd(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cvtepi64_pd - #define _mm_mask_cvtepi64_pd(src, k, a) simde_mm_mask_cvtepi64_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_maskz_cvtepi64_pd(simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm_maskz_cvtepi64_pd(k, a); - #else - return simde_mm_maskz_mov_pd(k, simde_mm_cvtepi64_pd(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_cvtepi64_pd - #define _mm_maskz_cvtepi64_pd(k, a) simde_mm_maskz_cvtepi64_pd((k), (a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_cvtepi16_epi32 (simde__m256i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cvtepi16_epi32(a); - #else - simde__m512i_private r_; - simde__m256i_private a_ = simde__m256i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i32, a_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / 
sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i16[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cvtepi16_epi32 - #define _mm512_cvtepi16_epi32(a) simde_mm512_cvtepi16_epi32(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm512_cvtepi16_epi8 (simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cvtepi16_epi8(a); - #else - simde__m256i_private r_; - simde__m512i_private a_ = simde__m512i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i8, a_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, a_.i16[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_cvtepi16_epi8 - #define _mm512_cvtepi16_epi8(a) simde_mm512_cvtepi16_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm512_mask_cvtepi16_epi8 (simde__m256i src, simde__mmask32 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_cvtepi16_epi8(src, k, a); - #else - return simde_mm256_mask_mov_epi8(src, k, simde_mm512_cvtepi16_epi8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cvtepi16_epi8 - #define _mm512_mask_cvtepi16_epi8(src, k, a) simde_mm512_mask_cvtepi16_epi8(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm512_maskz_cvtepi16_epi8 (simde__mmask32 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_cvtepi16_epi8(k, a); - #else - return simde_mm256_maskz_mov_epi8(k, simde_mm512_cvtepi16_epi8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_cvtepi16_epi8 - #define _mm512_maskz_cvtepi16_epi8(k, a) simde_mm512_maskz_cvtepi16_epi8(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_cvtepi8_epi16 (simde__m256i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cvtepi8_epi16(a); - #else - simde__m512i_private r_; - simde__m256i_private a_ = simde__m256i_to_private(a); - - #if defined(SIMDE_X86_AVX2_NATIVE) - r_.m256i[0] = _mm256_cvtepi8_epi16(a_.m128i[0]); - r_.m256i[1] = _mm256_cvtepi8_epi16(a_.m128i[1]); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i16, a_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i8[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cvtepi8_epi16 - #define _mm512_cvtepi8_epi16(a) simde_mm512_cvtepi8_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_cvtepi32_ps (simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cvtepi32_ps(a); - #else - simde__m512_private r_; - simde__m512i_private a_ = simde__m512i_to_private(a); - - #if defined(SIMDE_X86_AVX_NATIVE) - r_.m256[0] = _mm256_cvtepi32_ps(a_.m256i[0]); - r_.m256[1] = _mm256_cvtepi32_ps(a_.m256i[1]); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.f32[i] = HEDLEY_STATIC_CAST(simde_float32, a_.i32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if 
defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cvtepi32_ps - #define _mm512_cvtepi32_ps(a) simde_mm512_cvtepi32_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm512_cvtepi64_epi32 (simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cvtepi64_epi32(a); - #else - simde__m256i_private r_; - simde__m512i_private a_ = simde__m512i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i32, a_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.i64[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cvtepi64_epi32 - #define _mm512_cvtepi64_epi32(a) simde_mm512_cvtepi64_epi32((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_cvtepu16_epi32 (simde__m256i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cvtepu16_epi32(a); - #else - simde__m512i_private r_; - simde__m256i_private a_ = simde__m256i_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i32, a_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.u16[i]); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cvtepu16_epi32 - #define _mm512_cvtepu16_epi32(a) simde_mm512_cvtepu16_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_cvtepu32_ps (simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cvtepu32_ps(a); - #else - simde__m512_private r_; - simde__m512i_private a_ = simde__m512i_to_private(a); - - #if defined(SIMDE_X86_SSE2_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - /* https://stackoverflow.com/a/34067907/501126 */ - const __m128 tmp = _mm_cvtepi32_ps(_mm_srli_epi32(a_.m128i[i], 1)); - r_.m128[i] = - _mm_add_ps( - _mm_add_ps(tmp, tmp), - _mm_cvtepi32_ps(_mm_and_si128(a_.m128i[i], _mm_set1_epi32(1))) - ); - } - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.f32, a_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.f32[i] = HEDLEY_STATIC_CAST(float, a_.u32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cvtepu32_ps - #define _mm512_cvtepu32_ps(a) simde_mm512_cvtepu32_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_cvtph_ps(simde__m256i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cvtph_ps(a); - #endif - simde__m256i_private a_ = simde__m256i_to_private(a); - simde__m512_private r_; - - #if defined(SIMDE_X86_F16C_NATIVE) - r_.m256[0] = _mm256_cvtph_ps(a_.m128i[0]); - r_.m256[1] = _mm256_cvtph_ps(a_.m128i[1]); - #elif defined(SIMDE_FLOAT16_VECTOR) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_float16_to_float32(a_.f16[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_float16_to_float32(simde_uint16_as_float16(a_.u16[i])); - } - #endif - - return simde__m512_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cvtph_ps - #define _mm512_cvtph_ps(a) simde_mm512_cvtph_ps(a) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_cvtps_epi32(simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cvtps_epi32(a); - #endif - simde__m512_private a_ = simde__m512_to_private(a); - simde__m512i_private r_; - - #if defined(SIMDE_X86_AVX_NATIVE) - r_.m256i[0] = _mm256_cvtps_epi32(a_.m256[0]); - r_.m256i[1] = _mm256_cvtps_epi32(a_.m256[1]); - #elif defined(simde_math_nearbyintf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_nearbyintf(a_.f32[i])); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cvtps_epi32 - #define _mm512_cvtps_epi32(a) simde_mm512_cvtps_epi32(a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_CVT_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/cvts.h b/ffi-deps/simde/simde/x86/avx512/cvts.h deleted file mode 100644 index 0194889..0000000 --- a/ffi-deps/simde/simde/x86/avx512/cvts.h +++ /dev/null @@ -1,781 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_CVTS_H) -#define SIMDE_X86_AVX512_CVTS_H - -#include "types.h" -#include "mov.h" -#include "storeu.h" -#include "loadu.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtsepi16_epi8 (simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_cvtsepi16_epi8(a); - #else - simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - simde__m128i_private a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i16) / sizeof(a_.i16[0])) ; i++) { - r_.i8[i] = - (a_.i16[i] < INT8_MIN) - ? (INT8_MIN) - : ((a_.i16[i] > INT8_MAX) - ? 
(INT8_MAX) - : HEDLEY_STATIC_CAST(int8_t, a_.i16[i])); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtsepi16_epi8 - #define _mm_cvtsepi16_epi8(a) simde_mm_cvtsepi16_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm256_cvtsepi16_epi8 (simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_cvtsepi16_epi8(a); - #else - simde__m128i_private r_; - simde__m256i_private a_ = simde__m256i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = - (a_.i16[i] < INT8_MIN) - ? (INT8_MIN) - : ((a_.i16[i] > INT8_MAX) - ? (INT8_MAX) - : HEDLEY_STATIC_CAST(int8_t, a_.i16[i])); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtsepi16_epi8 - #define _mm256_cvtsepi16_epi8(a) simde_mm256_cvtsepi16_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtsepi32_epi8 (simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_cvtsepi32_epi8(a); - #else - simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - simde__m128i_private a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r_.i8[i] = - (a_.i32[i] < INT8_MIN) - ? (INT8_MIN) - : ((a_.i32[i] > INT8_MAX) - ? (INT8_MAX) - : HEDLEY_STATIC_CAST(int8_t, a_.i32[i])); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtsepi32_epi8 - #define _mm_cvtsepi32_epi8(a) simde_mm_cvtsepi32_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm256_cvtsepi32_epi8 (simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_cvtsepi32_epi8(a); - #else - simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - simde__m256i_private a_ = simde__m256i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r_.i8[i] = - (a_.i32[i] < INT8_MIN) - ? (INT8_MIN) - : ((a_.i32[i] > INT8_MAX) - ? (INT8_MAX) - : HEDLEY_STATIC_CAST(int8_t, a_.i32[i])); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtsepi32_epi8 - #define _mm256_cvtsepi32_epi8(a) simde_mm256_cvtsepi32_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtsepi32_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_cvtsepi32_epi16(a); - #else - simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - simde__m128i_private a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r_.i16[i] = - (a_.i32[i] < INT16_MIN) - ? (INT16_MIN) - : ((a_.i32[i] > INT16_MAX) - ? 
(INT16_MAX) - : HEDLEY_STATIC_CAST(int16_t, a_.i32[i])); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtsepi32_epi16 - #define _mm_cvtsepi32_epi16(a) simde_mm_cvtsepi32_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm256_cvtsepi32_epi16 (simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_cvtsepi32_epi16(a); - #else - simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - simde__m256i_private a_ = simde__m256i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r_.i16[i] = - (a_.i32[i] < INT16_MIN) - ? (INT16_MIN) - : ((a_.i32[i] > INT16_MAX) - ? (INT16_MAX) - : HEDLEY_STATIC_CAST(int16_t, a_.i32[i])); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtsepi32_epi16 - #define _mm256_cvtsepi32_epi16(a) simde_mm256_cvtsepi32_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtsepi64_epi8 (simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_cvtsepi64_epi8(a); - #else - simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - simde__m128i_private a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r_.i8[i] = - (a_.i64[i] < INT8_MIN) - ? (INT8_MIN) - : ((a_.i64[i] > INT8_MAX) - ? (INT8_MAX) - : HEDLEY_STATIC_CAST(int8_t, a_.i64[i])); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtsepi64_epi8 - #define _mm_cvtsepi64_epi8(a) simde_mm_cvtsepi64_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm256_cvtsepi64_epi8 (simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_cvtsepi64_epi8(a); - #else - simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - simde__m256i_private a_ = simde__m256i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r_.i8[i] = - (a_.i64[i] < INT8_MIN) - ? (INT8_MIN) - : ((a_.i64[i] > INT8_MAX) - ? (INT8_MAX) - : HEDLEY_STATIC_CAST(int8_t, a_.i64[i])); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_cvtsepi64_epi8 - #define _mm256_cvtsepi64_epi8(a) simde_mm256_cvtsepi64_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm512_cvtsepi16_epi8 (simde__m512i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cvtsepi16_epi8(a); - #else - simde__m256i_private r_; - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = - (a_.i16[i] < INT8_MIN) - ? (INT8_MIN) - : ((a_.i16[i] > INT8_MAX) - ? 
(INT8_MAX) - : HEDLEY_STATIC_CAST(int8_t, a_.i16[i])); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_cvtsepi16_epi8 - #define _mm512_cvtsepi16_epi8(a) simde_mm512_cvtsepi16_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm512_mask_cvtsepi16_epi8 (simde__m256i src, simde__mmask32 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_cvtsepi16_epi8(src, k, a); - #else - return simde_mm256_mask_mov_epi8(src, k, simde_mm512_cvtsepi16_epi8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cvtsepi16_epi8 - #define _mm512_mask_cvtsepi16_epi8(src, k, a) simde_mm512_mask_cvtsepi16_epi8(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm512_maskz_cvtsepi16_epi8 (simde__mmask32 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_cvtsepi16_epi8(k, a); - #else - return simde_mm256_maskz_mov_epi8(k, simde_mm512_cvtsepi16_epi8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_cvtsepi16_epi8 - #define _mm512_maskz_cvtsepi16_epi8(k, a) simde_mm512_maskz_cvtsepi16_epi8(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm512_cvtsepi32_epi8 (simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cvtsepi32_epi8(a); - #else - simde__m128i_private r_; - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r_.i8[i] = - (a_.i32[i] < INT8_MIN) - ? (INT8_MIN) - : ((a_.i32[i] > INT8_MAX) - ? (INT8_MAX) - : HEDLEY_STATIC_CAST(int8_t, a_.i32[i])); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cvtsepi32_epi8 - #define _mm512_cvtsepi32_epi8(a) simde_mm512_cvtsepi32_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm512_mask_cvtsepi32_epi8 (simde__m128i src, simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cvtsepi32_epi8(src, k, a); - #else - simde__m128i_private r_; - simde__m128i_private src_ = simde__m128i_to_private(src); - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r_.i8[i] = ((k>>i) &1 ) ? - ((a_.i32[i] < INT8_MIN) - ? (INT8_MIN) - : ((a_.i32[i] > INT8_MAX) - ? (INT8_MAX) - : HEDLEY_STATIC_CAST(int8_t, a_.i32[i]))) : src_.i8[i] ; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cvtsepi32_epi8 - #define _mm512_mask_cvtsepi32_epi8(src, k, a) simde_mm512_mask_cvtsepi32_epi8(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_mask_cvtsepi32_storeu_epi8 (void* base_addr, simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - _mm512_mask_cvtsepi32_storeu_epi8(base_addr, k, a); - #else - simde__m256i_private r_ = simde__m256i_to_private(simde_mm256_loadu_epi8(base_addr)); - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r_.i8[i] = ((k>>i) &1 ) ? - ((a_.i32[i] < INT8_MIN) - ? (INT8_MIN) - : ((a_.i32[i] > INT8_MAX) - ? 
(INT8_MAX) - : HEDLEY_STATIC_CAST(int8_t, a_.i32[i]))) : r_.i8[i]; - } - - simde_mm256_storeu_epi8(base_addr, simde__m256i_from_private(r_)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cvtsepi32_storeu_epi8 - #define _mm512_mask_cvtsepi32_storeu_epi8(base_addr, k, a) simde_mm512_mask_cvtsepi32_storeu_epi8(base_addr, k, a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm512_maskz_cvtsepi32_epi8 (simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_cvtsepi32_epi8(k, a); - #else - simde__m128i_private r_; - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r_.i8[i] = ((k>>i) &1 ) ? - ((a_.i32[i] < INT8_MIN) - ? (INT8_MIN) - : ((a_.i32[i] > INT8_MAX) - ? (INT8_MAX) - : HEDLEY_STATIC_CAST(int8_t, a_.i32[i]))) : INT8_C(0) ; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_cvtsepi32_epi8 - #define _mm512_maskz_cvtsepi32_epi8(k, a) simde_mm512_maskz_cvtsepi32_epi8(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm512_cvtsepi32_epi16 (simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cvtsepi32_epi16(a); - #else - simde__m256i_private r_; - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r_.i16[i] = - (a_.i32[i] < INT16_MIN) - ? (INT16_MIN) - : ((a_.i32[i] > INT16_MAX) - ? (INT16_MAX) - : HEDLEY_STATIC_CAST(int16_t, a_.i32[i])); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cvtsepi32_epi16 - #define _mm512_cvtsepi32_epi16(a) simde_mm512_cvtsepi32_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm512_mask_cvtsepi32_epi16 (simde__m256i src, simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cvtsepi32_epi16(src, k, a); - #else - simde__m256i_private r_; - simde__m256i_private src_ = simde__m256i_to_private(src); - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r_.i16[i] = ((k>>i) &1 ) ? - ((a_.i32[i] < INT16_MIN) - ? (INT16_MIN) - : ((a_.i32[i] > INT16_MAX) - ? (INT16_MAX) - : HEDLEY_STATIC_CAST(int16_t, a_.i32[i]))) : src_.i16[i]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cvtsepi32_epi16 - #define _mm512_mask_cvtsepi32_epi16(src, k, a) simde_mm512_mask_cvtsepi32_epi16(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_mask_cvtsepi32_storeu_epi16 (void* base_addr, simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - _mm512_mask_cvtsepi32_storeu_epi16(base_addr, k, a); - #else - simde__m256i_private r_; - simde__m256i_private src_ = simde__m256i_to_private(simde_mm256_loadu_epi16(base_addr)); - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r_.i16[i] = ((k>>i) &1 ) ? - ((a_.i32[i] < INT16_MIN) - ? (INT16_MIN) - : ((a_.i32[i] > INT16_MAX) - ? 
(INT16_MAX) - : HEDLEY_STATIC_CAST(int16_t, a_.i32[i]))) : src_.i16[i]; - } - - simde_mm256_storeu_epi16(base_addr, simde__m256i_from_private(r_)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cvtsepi32_storeu_epi16 - #define _mm512_mask_cvtsepi32_storeu_epi16(base_addr, k, a) simde_mm512_mask_cvtsepi32_storeu_epi16(base_addr, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm512_maskz_cvtsepi32_epi16 (simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_cvtsepi32_epi16(k, a); - #else - simde__m256i_private r_; - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r_.i16[i] = ((k>>i) &1 ) ? - ((a_.i32[i] < INT16_MIN) - ? (INT16_MIN) - : ((a_.i32[i] > INT16_MAX) - ? (INT16_MAX) - : HEDLEY_STATIC_CAST(int16_t, a_.i32[i]))) : INT16_C(0); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_cvtsepi32_epi16 - #define _mm512_maskz_cvtsepi32_epi16(k, a) simde_mm512_maskz_cvtsepi32_epi16(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm512_cvtsepi64_epi8 (simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cvtsepi64_epi8(a); - #else - simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r_.i8[i] = - (a_.i64[i] < INT8_MIN) - ? (INT8_MIN) - : ((a_.i64[i] > INT8_MAX) - ? (INT8_MAX) - : HEDLEY_STATIC_CAST(int8_t, a_.i64[i])); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cvtsepi64_epi8 - #define _mm512_cvtsepi64_epi8(a) simde_mm512_cvtsepi64_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm512_mask_cvtsepi64_epi8 (simde__m128i src, simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cvtsepi64_epi8(src, k, a); - #else - simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - simde__m128i_private src_ = simde__m128i_to_private(src); - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r_.i8[i] = ((k>>i) &1 ) ? - ((a_.i64[i] < INT8_MIN) - ? (INT8_MIN) - : ((a_.i64[i] > INT8_MAX) - ? (INT8_MAX) - : HEDLEY_STATIC_CAST(int8_t, a_.i64[i]))) : src_.i8[i]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cvtsepi64_epi8 - #define _mm512_mask_cvtsepi64_epi8(src, k, a) simde_mm512_mask_cvtsepi64_epi8(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm512_maskz_cvtsepi64_epi8 (simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_cvtsepi64_epi8(k, a); - #else - simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r_.i8[i] = ((k>>i) &1 ) ? - ((a_.i64[i] < INT8_MIN) - ? (INT8_MIN) - : ((a_.i64[i] > INT8_MAX) - ? 
(INT8_MAX) - : HEDLEY_STATIC_CAST(int8_t, a_.i64[i]))) : INT8_C(0); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_cvtsepi64_epi8 - #define _mm512_maskz_cvtsepi64_epi8(k, a) simde_mm512_maskz_cvtsepi64_epi8(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm512_cvtsepi64_epi16 (simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cvtsepi64_epi16(a); - #else - simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r_.i16[i] = - (a_.i64[i] < INT16_MIN) - ? (INT16_MIN) - : ((a_.i64[i] > INT16_MAX) - ? (INT16_MAX) - : HEDLEY_STATIC_CAST(int16_t, a_.i64[i])); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cvtsepi64_epi16 - #define _mm512_cvtsepi64_epi16(a) simde_mm512_cvtsepi64_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm512_mask_cvtsepi64_epi16 (simde__m128i src, simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cvtsepi64_epi16(src, k, a); - #else - simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - simde__m128i_private src_ = simde__m128i_to_private(src); - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r_.i16[i] = ((k>>i) & 1) ? - ((a_.i64[i] < INT16_MIN) - ? (INT16_MIN) - : ((a_.i64[i] > INT16_MAX) - ? (INT16_MAX) - : HEDLEY_STATIC_CAST(int16_t, a_.i64[i]))) : src_.i16[i]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cvtsepi64_epi16 - #define _mm512_mask_cvtsepi64_epi16(src, k, a) simde_mm512_mask_cvtsepi64_epi16(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm512_maskz_cvtsepi64_epi16 (simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_cvtsepi64_epi16(k, a); - #else - simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r_.i16[i] = ((k>>i) & 1) ? - ((a_.i64[i] < INT16_MIN) - ? (INT16_MIN) - : ((a_.i64[i] > INT16_MAX) - ? (INT16_MAX) - : HEDLEY_STATIC_CAST(int16_t, a_.i64[i]))) : INT16_C(0); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_cvtsepi64_epi16 - #define _mm512_maskz_cvtsepi64_epi16(k, a) simde_mm512_maskz_cvtsepi64_epi16(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm512_cvtsepi64_epi32 (simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cvtsepi64_epi32(a); - #else - simde__m256i_private r_; - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r_.i32[i] = - (a_.i64[i] < INT32_MIN) - ? (INT32_MIN) - : ((a_.i64[i] > INT32_MAX) - ? 
(INT32_MAX) - : HEDLEY_STATIC_CAST(int32_t, a_.i64[i])); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cvtsepi64_epi32 - #define _mm512_cvtsepi64_epi32(a) simde_mm512_cvtsepi64_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm512_mask_cvtsepi64_epi32 (simde__m256i src, simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cvtsepi64_epi32(src, k, a); - #else - simde__m256i_private r_; - simde__m256i_private src_ = simde__m256i_to_private(src); - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r_.i32[i] = ((k>>i) & 1) ? - ((a_.i64[i] < INT32_MIN) - ? (INT32_MIN) - : ((a_.i64[i] > INT32_MAX) - ? (INT32_MAX) - : HEDLEY_STATIC_CAST(int32_t, a_.i64[i]))) : src_.i32[i]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cvtsepi64_epi32 - #define _mm512_mask_cvtsepi64_epi32(src, k, a) simde_mm512_mask_cvtsepi64_epi32(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm512_maskz_cvtsepi64_epi32 (simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_cvtsepi64_epi32(k, a); - #else - simde__m256i_private r_; - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r_.i32[i] = ((k>>i) & 1) ? - ((a_.i64[i] < INT32_MIN) - ? (INT32_MIN) - : ((a_.i64[i] > INT32_MAX) - ? (INT32_MAX) - : HEDLEY_STATIC_CAST(int32_t, a_.i64[i]))) : INT32_C(0); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_cvtsepi64_epi32 - #define _mm512_maskz_cvtsepi64_epi32(k, a) simde_mm512_maskz_cvtsepi64_epi32(k, a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_CVTS_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/cvtt.h b/ffi-deps/simde/simde/x86/avx512/cvtt.h deleted file mode 100644 index 937f7fb..0000000 --- a/ffi-deps/simde/simde/x86/avx512/cvtt.h +++ /dev/null @@ -1,130 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2021 Evan Nemerson - */ - -#if !defined(SIMDE_X86_AVX512_CVTT_H) -#define SIMDE_X86_AVX512_CVTT_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvttpd_epi64 (simde__m128d a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm_cvttpd_epi64(a); - #else - simde__m128i_private r_; - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) - r_.n = - _mm_set_epi64x( - _mm_cvttsd_si64(_mm_unpackhi_pd(a_.n, a_.n)), - _mm_cvttsd_si64(a_.n) - ); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i64 = vcvtq_s64_f64(a_.neon_f64); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i64 = vec_signed(a_.altivec_f64); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i64, a_.f64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = HEDLEY_STATIC_CAST(int64_t, a_.f64[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm_cvttpd_epi64 - #define _mm_cvttpd_epi64(a) simde_mm_cvttpd_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_cvttpd_epi64(simde__m128i src, simde__mmask8 k, simde__m128d a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm_mask_cvttpd_epi64(src, k, a); - #else - return simde_mm_mask_mov_epi64(src, k, simde_mm_cvttpd_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_cvttpd_epi64 - #define _mm_mask_cvttpd_epi64(src, k, a) simde_mm_mask_cvttpd_epi64(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_cvttpd_epi64(simde__mmask8 k, simde__m128d a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm_maskz_cvttpd_epi64(k, a); - #else - return simde_mm_maskz_mov_epi64(k, simde_mm_cvttpd_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_cvttpd_epi64 - #define _mm_maskz_cvttpd_epi64(k, a) simde_mm_maskz_cvttpd_epi64(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_cvttps_epi32 (simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cvttps_epi32(a); - #else - simde__m512i_private r_; - simde__m512_private a_ = simde__m512_to_private(a); - - #if defined(simde_math_truncf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, simde_math_truncf(a_.f32[i])); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_cvttps_epi32 - #define _mm512_cvttps_epi32(a) simde_mm512_cvttps_epi32(a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_CVTT_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/cvtus.h b/ffi-deps/simde/simde/x86/avx512/cvtus.h deleted file mode 100644 index ce423f6..0000000 --- a/ffi-deps/simde/simde/x86/avx512/cvtus.h +++ /dev/null @@ -1,67 +0,0 @@ -/* SPDX-License-Identifier: 
MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2023 Michael R. Crusoe - */ - -#if !defined(SIMDE_X86_AVX512_CVTUS_H) -#define SIMDE_X86_AVX512_CVTUS_H - -#include "types.h" -#include "mov.h" -#include "storeu.h" -#include "loadu.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_mask_cvtusepi32_storeu_epi8 (void* base_addr, simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - _mm512_mask_cvtusepi32_storeu_epi8(base_addr, k, a); - #else - simde__m256i_private r_ = simde__m256i_to_private(simde_mm256_loadu_epi8(base_addr)); - simde__m512i_private a_ = simde__m512i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r_.i8[i] = ((k>>i) &1 ) ? - ((a_.u32[i] > UINT8_MAX) - ? 
(HEDLEY_STATIC_CAST(int8_t, UINT8_MAX)) - : HEDLEY_STATIC_CAST(int8_t, a_.u32[i])) : r_.i8[i]; - } - - simde_mm256_storeu_epi8(base_addr, simde__m256i_from_private(r_)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cvtusepi32_storeu_epi8 - #define _mm512_mask_cvtusepi32_storeu_epi8(base_addr, k, a) simde_mm512_mask_cvtusepi32_storeu_epi8((base_addr), (k), (a)) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_CVTUS_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/dbsad.h b/ffi-deps/simde/simde/x86/avx512/dbsad.h deleted file mode 100644 index c9a8e66..0000000 --- a/ffi-deps/simde/simde/x86/avx512/dbsad.h +++ /dev/null @@ -1,388 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_DBSAD_H) -#define SIMDE_X86_AVX512_DBSAD_H - -#include "types.h" -#include "mov.h" -#include "../avx2.h" -#include "shuffle.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_dbsad_epu8(a, b, imm8) _mm_dbsad_epu8((a), (b), (imm8)) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128i - simde_mm_dbsad_epu8_internal_ (simde__m128i a, simde__m128i b) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) - uint8_t a1 SIMDE_VECTOR(16) = - SIMDE_SHUFFLE_VECTOR_( - 8, 16, a_.u8, a_.u8, - 0, 1, 0, 1, - 4, 5, 4, 5, - 8, 9, 8, 9, - 12, 13, 12, 13); - uint8_t b1 SIMDE_VECTOR(16) = - SIMDE_SHUFFLE_VECTOR_( - 8, 16, b_.u8, b_.u8, - 0, 1, 1, 2, - 2, 3, 3, 4, - 8, 9, 9, 10, - 10, 11, 11, 12); - - __typeof__(r_.u8) abd1_mask = HEDLEY_REINTERPRET_CAST(__typeof__(abd1_mask), a1 < b1); - __typeof__(r_.u8) abd1 = (((b1 - a1) & abd1_mask) | ((a1 - b1) & ~abd1_mask)); - - r_.u16 = - __builtin_convertvector(__builtin_shufflevector(abd1, abd1, 0, 2, 4, 6, 8, 10, 12, 14), __typeof__(r_.u16)) + - __builtin_convertvector(__builtin_shufflevector(abd1, abd1, 1, 3, 5, 7, 9, 11, 13, 15), __typeof__(r_.u16)); - - uint8_t a2 SIMDE_VECTOR(16) = - SIMDE_SHUFFLE_VECTOR_( - 8, 16, a_.u8, a_.u8, - 2, 3, 2, 3, - 6, 7, 6, 7, - 10, 11, 10, 11, - 14, 15, 14, 15); - uint8_t b2 SIMDE_VECTOR(16) = - SIMDE_SHUFFLE_VECTOR_( - 8, 16, b_.u8, b_.u8, - 2, 3, 3, 4, - 4, 5, 5, 6, - 10, 11, 11, 12, - 12, 13, 13, 14); - - __typeof__(r_.u8) abd2_mask = HEDLEY_REINTERPRET_CAST(__typeof__(abd2_mask), a2 < b2); - __typeof__(r_.u8) abd2 = (((b2 - a2) & abd2_mask) | ((a2 - b2) & ~abd2_mask)); - - r_.u16 += - __builtin_convertvector(__builtin_shufflevector(abd2, abd2, 0, 2, 4, 6, 8, 10, 12, 14), __typeof__(r_.u16)) + - __builtin_convertvector(__builtin_shufflevector(abd2, abd2, 1, 3, 5, 7, 9, 11, 13, 15), __typeof__(r_.u16)); - #else - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = 0; - for (size_t j = 0 ; j < 4 ; j++) { - uint16_t A = HEDLEY_STATIC_CAST(uint16_t, a_.u8[((i << 1) & 12) + j]); - uint16_t B = HEDLEY_STATIC_CAST(uint16_t, b_.u8[((i & 3) | ((i << 1) & 8)) + j]); - r_.u16[i] += (A < B) ? 
(B - A) : (A - B); - } - } - #endif - - return simde__m128i_from_private(r_); - } - #define simde_mm_dbsad_epu8(a, b, imm8) simde_mm_dbsad_epu8_internal_((a), simde_mm_shuffle_epi32((b), (imm8))) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_dbsad_epu8 - #define _mm_dbsad_epu8(a, b, imm8) simde_mm_dbsad_epu8(a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_mask_dbsad_epu8(src, k, a, b, imm8) _mm_mask_dbsad_epu8((src), (k), (a), (b), (imm8)) -#else - #define simde_mm_mask_dbsad_epu8(src, k, a, b, imm8) simde_mm_mask_mov_epi16(src, k, simde_mm_dbsad_epu8(a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_dbsad_epu8 - #define _mm_mask_dbsad_epu8(src, k, a, b, imm8) simde_mm_mask_dbsad_epu8(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_maskz_dbsad_epu8(k, a, b, imm8) _mm_maskz_dbsad_epu8((k), (a), (b), (imm8)) -#else - #define simde_mm_maskz_dbsad_epu8(k, a, b, imm8) simde_mm_maskz_mov_epi16(k, simde_mm_dbsad_epu8(a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_dbsad_epu8 - #define _mm_maskz_dbsad_epu8(k, a, b, imm8) simde_mm_maskz_dbsad_epu8(k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_dbsad_epu8(a, b, imm8) _mm256_dbsad_epu8((a), (b), (imm8)) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm256_dbsad_epu8(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m256i_private \ - simde_mm256_dbsad_epu8_a_ = simde__m256i_to_private(a), \ - simde_mm256_dbsad_epu8_b_ = simde__m256i_to_private(b); \ - \ - simde_mm256_dbsad_epu8_a_.m128i[0] = simde_mm_dbsad_epu8(simde_mm256_dbsad_epu8_a_.m128i[0], simde_mm256_dbsad_epu8_b_.m128i[0], imm8); \ - simde_mm256_dbsad_epu8_a_.m128i[1] = simde_mm_dbsad_epu8(simde_mm256_dbsad_epu8_a_.m128i[1], simde_mm256_dbsad_epu8_b_.m128i[1], imm8); \ - \ - simde__m256i_from_private(simde_mm256_dbsad_epu8_a_); \ - })) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m256i - simde_mm256_dbsad_epu8_internal_ (simde__m256i a, simde__m256i b) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) - uint8_t a1 SIMDE_VECTOR(32) = - SIMDE_SHUFFLE_VECTOR_( - 8, 32, a_.u8, a_.u8, - 0, 1, 0, 1, - 4, 5, 4, 5, - 8, 9, 8, 9, - 12, 13, 12, 13, - 16, 17, 16, 17, - 20, 21, 20, 21, - 24, 25, 24, 25, - 28, 29, 28, 29); - uint8_t b1 SIMDE_VECTOR(32) = - SIMDE_SHUFFLE_VECTOR_( - 8, 16, b_.u8, b_.u8, - 0, 1, 1, 2, - 2, 3, 3, 4, - 8, 9, 9, 10, - 10, 11, 11, 12, - 16, 17, 17, 18, - 18, 19, 19, 20, - 24, 25, 25, 26, - 26, 27, 27, 28); - - __typeof__(r_.u8) abd1_mask = HEDLEY_REINTERPRET_CAST(__typeof__(abd1_mask), a1 < b1); - __typeof__(r_.u8) abd1 = (((b1 - a1) & abd1_mask) | ((a1 - b1) & ~abd1_mask)); - - r_.u16 = - __builtin_convertvector(__builtin_shufflevector(abd1, abd1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), __typeof__(r_.u16)) + - __builtin_convertvector(__builtin_shufflevector(abd1, abd1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), __typeof__(r_.u16)); - - uint8_t a2 
SIMDE_VECTOR(32) = - SIMDE_SHUFFLE_VECTOR_( - 8, 32, a_.u8, a_.u8, - 2, 3, 2, 3, - 6, 7, 6, 7, - 10, 11, 10, 11, - 14, 15, 14, 15, - 18, 19, 18, 19, - 22, 23, 22, 23, - 26, 27, 26, 27, - 30, 31, 30, 31); - uint8_t b2 SIMDE_VECTOR(32) = - SIMDE_SHUFFLE_VECTOR_( - 8, 16, b_.u8, b_.u8, - 2, 3, 3, 4, - 4, 5, 5, 6, - 10, 11, 11, 12, - 12, 13, 13, 14, - 18, 19, 19, 20, - 20, 21, 21, 22, - 26, 27, 27, 28, - 28, 29, 29, 30); - - __typeof__(r_.u8) abd2_mask = HEDLEY_REINTERPRET_CAST(__typeof__(abd2_mask), a2 < b2); - __typeof__(r_.u8) abd2 = (((b2 - a2) & abd2_mask) | ((a2 - b2) & ~abd2_mask)); - - r_.u16 += - __builtin_convertvector(__builtin_shufflevector(abd2, abd2, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30), __typeof__(r_.u16)) + - __builtin_convertvector(__builtin_shufflevector(abd2, abd2, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31), __typeof__(r_.u16)); - #else - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = 0; - for (size_t j = 0 ; j < 4 ; j++) { - uint16_t A = HEDLEY_STATIC_CAST(uint16_t, a_.u8[(((i << 1) & 12) | ((i & 8) << 1)) + j]); - uint16_t B = HEDLEY_STATIC_CAST(uint16_t, b_.u8[((i & 3) | ((i << 1) & 8) | ((i & 8) << 1)) + j]); - r_.u16[i] += (A < B) ? (B - A) : (A - B); - } - } - #endif - - return simde__m256i_from_private(r_); - } - #define simde_mm256_dbsad_epu8(a, b, imm8) simde_mm256_dbsad_epu8_internal_((a), simde_mm256_shuffle_epi32(b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_dbsad_epu8 - #define _mm256_dbsad_epu8(a, b, imm8) simde_mm256_dbsad_epu8(a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_mask_dbsad_epu8(src, k, a, b, imm8) _mm256_mask_dbsad_epu8((src), (k), (a), (b), (imm8)) -#else - #define simde_mm256_mask_dbsad_epu8(src, k, a, b, imm8) simde_mm256_mask_mov_epi16(src, k, simde_mm256_dbsad_epu8(a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_dbsad_epu8 - #define _mm256_mask_dbsad_epu8(src, k, a, b, imm8) simde_mm256_mask_dbsad_epu8(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_maskz_dbsad_epu8(k, a, b, imm8) _mm256_maskz_dbsad_epu8((k), (a), (b), (imm8)) -#else - #define simde_mm256_maskz_dbsad_epu8(k, a, b, imm8) simde_mm256_maskz_mov_epi16(k, simde_mm256_dbsad_epu8(a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_dbsad_epu8 - #define _mm256_maskz_dbsad_epu8(k, a, b, imm8) simde_mm256_maskz_dbsad_epu8(k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512BW_NATIVE) - #define simde_mm512_dbsad_epu8(a, b, imm8) _mm512_dbsad_epu8((a), (b), (imm8)) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_dbsad_epu8(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512i_private \ - simde_mm512_dbsad_epu8_a_ = simde__m512i_to_private(a), \ - simde_mm512_dbsad_epu8_b_ = simde__m512i_to_private(b); \ - \ - simde_mm512_dbsad_epu8_a_.m256i[0] = simde_mm256_dbsad_epu8(simde_mm512_dbsad_epu8_a_.m256i[0], simde_mm512_dbsad_epu8_b_.m256i[0], imm8); \ - simde_mm512_dbsad_epu8_a_.m256i[1] = simde_mm256_dbsad_epu8(simde_mm512_dbsad_epu8_a_.m256i[1], simde_mm512_dbsad_epu8_b_.m256i[1], imm8); \ - \ - 
simde__m512i_from_private(simde_mm512_dbsad_epu8_a_); \ - })) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512i - simde_mm512_dbsad_epu8_internal_ (simde__m512i a, simde__m512i b) { - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) - uint8_t a1 SIMDE_VECTOR(64) = - SIMDE_SHUFFLE_VECTOR_( - 8, 64, a_.u8, a_.u8, - 0, 1, 0, 1, - 4, 5, 4, 5, - 8, 9, 8, 9, - 12, 13, 12, 13, - 16, 17, 16, 17, - 20, 21, 20, 21, - 24, 25, 24, 25, - 28, 29, 28, 29, - 32, 33, 32, 33, - 36, 37, 36, 37, - 40, 41, 40, 41, - 44, 45, 44, 45, - 48, 49, 48, 49, - 52, 53, 52, 53, - 56, 57, 56, 57, - 60, 61, 60, 61); - uint8_t b1 SIMDE_VECTOR(64) = - SIMDE_SHUFFLE_VECTOR_( - 8, 64, b_.u8, b_.u8, - 0, 1, 1, 2, - 2, 3, 3, 4, - 8, 9, 9, 10, - 10, 11, 11, 12, - 16, 17, 17, 18, - 18, 19, 19, 20, - 24, 25, 25, 26, - 26, 27, 27, 28, - 32, 33, 33, 34, - 34, 35, 35, 36, - 40, 41, 41, 42, - 42, 43, 43, 44, - 48, 49, 49, 50, - 50, 51, 51, 52, - 56, 57, 57, 58, - 58, 59, 59, 60); - - __typeof__(r_.u8) abd1_mask = HEDLEY_REINTERPRET_CAST(__typeof__(abd1_mask), a1 < b1); - __typeof__(r_.u8) abd1 = (((b1 - a1) & abd1_mask) | ((a1 - b1) & ~abd1_mask)); - - r_.u16 = - __builtin_convertvector(__builtin_shufflevector(abd1, abd1, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62), __typeof__(r_.u16)) + - __builtin_convertvector(__builtin_shufflevector(abd1, abd1, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63), __typeof__(r_.u16)); - - uint8_t a2 SIMDE_VECTOR(64) = - SIMDE_SHUFFLE_VECTOR_( - 8, 64, a_.u8, a_.u8, - 2, 3, 2, 3, - 6, 7, 6, 7, - 10, 11, 10, 11, - 14, 15, 14, 15, - 18, 19, 18, 19, - 22, 23, 22, 23, - 26, 27, 26, 27, - 30, 31, 30, 31, - 34, 35, 34, 35, - 38, 39, 38, 39, - 42, 43, 42, 43, - 46, 47, 46, 47, - 50, 51, 50, 51, - 54, 55, 54, 55, - 58, 59, 58, 59, - 62, 63, 62, 63); - uint8_t b2 SIMDE_VECTOR(64) = - SIMDE_SHUFFLE_VECTOR_( - 8, 64, b_.u8, b_.u8, - 2, 3, 3, 4, - 4, 5, 5, 6, - 10, 11, 11, 12, - 12, 13, 13, 14, - 18, 19, 19, 20, - 20, 21, 21, 22, - 26, 27, 27, 28, - 28, 29, 29, 30, - 34, 35, 35, 36, - 36, 37, 37, 38, - 42, 43, 43, 44, - 44, 45, 45, 46, - 50, 51, 51, 52, - 52, 53, 53, 54, - 58, 59, 59, 60, - 60, 61, 61, 62); - - __typeof__(r_.u8) abd2_mask = HEDLEY_REINTERPRET_CAST(__typeof__(abd2_mask), a2 < b2); - __typeof__(r_.u8) abd2 = (((b2 - a2) & abd2_mask) | ((a2 - b2) & ~abd2_mask)); - - r_.u16 += - __builtin_convertvector(__builtin_shufflevector(abd2, abd2, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62), __typeof__(r_.u16)) + - __builtin_convertvector(__builtin_shufflevector(abd2, abd2, 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63), __typeof__(r_.u16)); - #else - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = 0; - for (size_t j = 0 ; j < 4 ; j++) { - uint16_t A = HEDLEY_STATIC_CAST(uint16_t, a_.u8[(((i << 1) & 12) | ((i & 8) << 1) | ((i & 16) << 1)) + j]); - uint16_t B = HEDLEY_STATIC_CAST(uint16_t, b_.u8[((i & 3) | ((i << 1) & 8) | ((i & 8) << 1) | ((i & 16) << 1)) + j]); - r_.u16[i] += (A < B) ? 
(B - A) : (A - B); - } - } - #endif - - return simde__m512i_from_private(r_); - } - #define simde_mm512_dbsad_epu8(a, b, imm8) simde_mm512_dbsad_epu8_internal_((a), simde_mm512_castps_si512(simde_mm512_shuffle_ps(simde_mm512_castsi512_ps(b), simde_mm512_castsi512_ps(b), imm8))) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_dbsad_epu8 - #define _mm512_dbsad_epu8(a, b, imm8) simde_mm512_dbsad_epu8(a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512BW_NATIVE) - #define simde_mm512_mask_dbsad_epu8(src, k, a, b, imm8) _mm512_mask_dbsad_epu8((src), (k), (a), (b), (imm8)) -#else - #define simde_mm512_mask_dbsad_epu8(src, k, a, b, imm8) simde_mm512_mask_mov_epi16(src, k, simde_mm512_dbsad_epu8(a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_dbsad_epu8 - #define _mm512_mask_dbsad_epu8(src, k, a, b, imm8) simde_mm512_mask_dbsad_epu8(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512BW_NATIVE) - #define simde_mm512_maskz_dbsad_epu8(k, a, b, imm8) _mm512_maskz_dbsad_epu8((k), (a), (b), (imm8)) -#else - #define simde_mm512_maskz_dbsad_epu8(k, a, b, imm8) simde_mm512_maskz_mov_epi16(k, simde_mm512_dbsad_epu8(a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_dbsad_epu8 - #define _mm512_maskz_dbsad_epu8(k, a, b, imm8) simde_mm512_maskz_dbsad_epu8(k, a, b, imm8) -#endif - - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_DBSAD_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/div.h b/ffi-deps/simde/simde/x86/avx512/div.h deleted file mode 100644 index 5e6349a..0000000 --- a/ffi-deps/simde/simde/x86/avx512/div.h +++ /dev/null @@ -1,162 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_DIV_H) -#define SIMDE_X86_AVX512_DIV_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_div_ps (simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_div_ps(a, b); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_div_ps(a_.m256[i], b_.m256[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 / b_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_div_ps(a_.m256[i], b_.m256[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_div_ps - #define _mm512_div_ps(a, b) simde_mm512_div_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_div_ps(simde__m512 src, simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_div_ps(src, k, a, b); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_div_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_div_ps - #define _mm512_mask_div_ps(src, k, a, b) simde_mm512_mask_div_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_div_ps(simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_div_ps(k, a, b); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_div_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_div_ps - #define _mm512_maskz_div_ps(k, a, b) simde_mm512_maskz_div_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_div_pd (simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_div_pd(a, b); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_div_pd(a_.m256d[i], b_.m256d[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 / b_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_div_pd(a_.m256d[i], b_.m256d[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_div_pd - #define _mm512_div_pd(a, b) simde_mm512_div_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_div_pd(simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_div_pd(src, k, a, b); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_div_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_div_pd - #define _mm512_mask_div_pd(src, k, a, b) simde_mm512_mask_div_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d 
-simde_mm512_maskz_div_pd(simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_div_pd(k, a, b); - #else - return simde_mm512_maskz_mov_pd(k, simde_mm512_div_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_div_pd - #define _mm512_maskz_div_pd(k, a, b) simde_mm512_maskz_div_pd(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_DIV_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/dpbf16.h b/ffi-deps/simde/simde/x86/avx512/dpbf16.h deleted file mode 100644 index 81e2aea..0000000 --- a/ffi-deps/simde/simde/x86/avx512/dpbf16.h +++ /dev/null @@ -1,281 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_DPBF16_H) -#define SIMDE_X86_AVX512_DPBF16_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_dpbf16_ps (simde__m128 src, simde__m128bh a, simde__m128bh b) { - #if defined(SIMDE_X86_AVX512BF16_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_dpbf16_ps(src, a, b); - #else - simde__m128_private - src_ = simde__m128_to_private(src); - simde__m128bh_private - a_ = simde__m128bh_to_private(a), - b_ = simde__m128bh_to_private(b); - - #if ! ( defined(SIMDE_ARCH_X86) && defined(HEDLEY_GCC_VERSION) ) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_SHUFFLE_VECTOR_) - uint32_t x1 SIMDE_VECTOR(32); - uint32_t x2 SIMDE_VECTOR(32); - simde__m128_private - r1_[2], - r2_[2]; - - a_.u16 = - SIMDE_SHUFFLE_VECTOR_( - 16, 16, - a_.u16, a_.u16, - 0, 2, 4, 6, - 1, 3, 5, 7 - ); - b_.u16 = - SIMDE_SHUFFLE_VECTOR_( - 16, 16, - b_.u16, b_.u16, - 0, 2, 4, 6, - 1, 3, 5, 7 - ); - - SIMDE_CONVERT_VECTOR_(x1, a_.u16); - SIMDE_CONVERT_VECTOR_(x2, b_.u16); - - x1 <<= 16; - x2 <<= 16; - - simde_memcpy(&r1_, &x1, sizeof(x1)); - simde_memcpy(&r2_, &x2, sizeof(x2)); - - src_.f32 += - HEDLEY_REINTERPRET_CAST(__typeof__(a_.f32), r1_[0].u32) * HEDLEY_REINTERPRET_CAST(__typeof__(a_.f32), r2_[0].u32) + - HEDLEY_REINTERPRET_CAST(__typeof__(a_.f32), r1_[1].u32) * HEDLEY_REINTERPRET_CAST(__typeof__(a_.f32), r2_[1].u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { - src_.f32[i / 2] += (simde_uint32_as_float32(HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) << 16) * simde_uint32_as_float32(HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) << 16)); - } - #endif - - return simde__m128_from_private(src_); - #endif -} -#if defined(SIMDE_X86_AVX512BF16_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_dpbf16_ps - #define _mm_dpbf16_ps(src, a, b) simde_mm_dpbf16_ps(src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mask_dpbf16_ps (simde__m128 src, simde__mmask8 k, simde__m128bh a, simde__m128bh b) { - #if defined(SIMDE_X86_AVX512BF16_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_dpbf16_ps(src, k, a, b); - #else - return simde_mm_mask_mov_ps(src, k, simde_mm_dpbf16_ps(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BF16_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_dpbf16_ps - #define _mm_mask_dpbf16_ps(src, k, a, b) simde_mm_mask_dpbf16_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_maskz_dpbf16_ps (simde__mmask8 k, simde__m128 src, simde__m128bh a, simde__m128bh b) { - #if defined(SIMDE_X86_AVX512BF16_NATIVE) && 
defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_dpbf16_ps(k, src, a, b); - #else - return simde_mm_maskz_mov_ps(k, simde_mm_dpbf16_ps(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BF16_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_dpbf16_ps - #define _mm_maskz_dpbf16_ps(k, src, a, b) simde_mm_maskz_dpbf16_ps(k, src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_dpbf16_ps (simde__m256 src, simde__m256bh a, simde__m256bh b) { - #if defined(SIMDE_X86_AVX512BF16_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_dpbf16_ps(src, a, b); - #else - simde__m256_private - src_ = simde__m256_to_private(src); - simde__m256bh_private - a_ = simde__m256bh_to_private(a), - b_ = simde__m256bh_to_private(b); - - #if ! ( defined(SIMDE_ARCH_X86) && defined(HEDLEY_GCC_VERSION) ) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_SHUFFLE_VECTOR_) - uint32_t x1 SIMDE_VECTOR(64); - uint32_t x2 SIMDE_VECTOR(64); - simde__m256_private - r1_[2], - r2_[2]; - - a_.u16 = - SIMDE_SHUFFLE_VECTOR_( - 16, 32, - a_.u16, a_.u16, - 0, 2, 4, 6, 8, 10, 12, 14, - 1, 3, 5, 7, 9, 11, 13, 15 - ); - b_.u16 = - SIMDE_SHUFFLE_VECTOR_( - 16, 32, - b_.u16, b_.u16, - 0, 2, 4, 6, 8, 10, 12, 14, - 1, 3, 5, 7, 9, 11, 13, 15 - ); - - SIMDE_CONVERT_VECTOR_(x1, a_.u16); - SIMDE_CONVERT_VECTOR_(x2, b_.u16); - - x1 <<= 16; - x2 <<= 16; - - simde_memcpy(&r1_, &x1, sizeof(x1)); - simde_memcpy(&r2_, &x2, sizeof(x2)); - - src_.f32 += - HEDLEY_REINTERPRET_CAST(__typeof__(a_.f32), r1_[0].u32) * HEDLEY_REINTERPRET_CAST(__typeof__(a_.f32), r2_[0].u32) + - HEDLEY_REINTERPRET_CAST(__typeof__(a_.f32), r1_[1].u32) * HEDLEY_REINTERPRET_CAST(__typeof__(a_.f32), r2_[1].u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { - src_.f32[i / 2] += (simde_uint32_as_float32(HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) << 16) * simde_uint32_as_float32(HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) << 16)); - } - #endif - - return simde__m256_from_private(src_); - #endif -} -#if defined(SIMDE_X86_AVX512BF16_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_dpbf16_ps - #define _mm256_dpbf16_ps(src, a, b) simde_mm256_dpbf16_ps(src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_mask_dpbf16_ps (simde__m256 src, simde__mmask8 k, simde__m256bh a, simde__m256bh b) { - #if defined(SIMDE_X86_AVX512BF16_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_dpbf16_ps(src, k, a, b); - #else - return simde_mm256_mask_mov_ps(src, k, simde_mm256_dpbf16_ps(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BF16_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_dpbf16_ps - #define _mm256_mask_dpbf16_ps(src, k, a, b) simde_mm256_mask_dpbf16_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_maskz_dpbf16_ps (simde__mmask8 k, simde__m256 src, simde__m256bh a, simde__m256bh b) { - #if defined(SIMDE_X86_AVX512BF16_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_dpbf16_ps(k, src, a, b); - #else - return simde_mm256_maskz_mov_ps(k, simde_mm256_dpbf16_ps(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BF16_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_dpbf16_ps - #define _mm256_maskz_dpbf16_ps(k, src, a, b) simde_mm256_maskz_dpbf16_ps(k, src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES 
-simde__m512 -simde_mm512_dpbf16_ps (simde__m512 src, simde__m512bh a, simde__m512bh b) { - #if defined(SIMDE_X86_AVX512BF16_NATIVE) - return _mm512_dpbf16_ps(src, a, b); - #else - simde__m512_private - src_ = simde__m512_to_private(src); - simde__m512bh_private - a_ = simde__m512bh_to_private(a), - b_ = simde__m512bh_to_private(b); - - #if ! ( defined(SIMDE_ARCH_X86) && defined(HEDLEY_GCC_VERSION) ) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_SHUFFLE_VECTOR_) - uint32_t x1 SIMDE_VECTOR(128); - uint32_t x2 SIMDE_VECTOR(128); - simde__m512_private - r1_[2], - r2_[2]; - - a_.u16 = - SIMDE_SHUFFLE_VECTOR_( - 16, 64, - a_.u16, a_.u16, - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 - ); - b_.u16 = - SIMDE_SHUFFLE_VECTOR_( - 16, 64, - b_.u16, b_.u16, - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 - ); - - SIMDE_CONVERT_VECTOR_(x1, a_.u16); - SIMDE_CONVERT_VECTOR_(x2, b_.u16); - - x1 <<= 16; - x2 <<= 16; - - simde_memcpy(&r1_, &x1, sizeof(x1)); - simde_memcpy(&r2_, &x2, sizeof(x2)); - - src_.f32 += - HEDLEY_REINTERPRET_CAST(__typeof__(a_.f32), r1_[0].u32) * HEDLEY_REINTERPRET_CAST(__typeof__(a_.f32), r2_[0].u32) + - HEDLEY_REINTERPRET_CAST(__typeof__(a_.f32), r1_[1].u32) * HEDLEY_REINTERPRET_CAST(__typeof__(a_.f32), r2_[1].u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.u16[0])) ; i++) { - src_.f32[i / 2] += (simde_uint32_as_float32(HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) << 16) * simde_uint32_as_float32(HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) << 16)); - } - #endif - - return simde__m512_from_private(src_); - #endif -} -#if defined(SIMDE_X86_AVX512BF16_ENABLE_NATIVE_ALIASES) - #undef _mm512_dpbf16_ps - #define _mm512_dpbf16_ps(src, a, b) simde_mm512_dpbf16_ps(src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_dpbf16_ps (simde__m512 src, simde__mmask16 k, simde__m512bh a, simde__m512bh b) { - #if defined(SIMDE_X86_AVX512BF16_NATIVE) - return _mm512_mask_dpbf16_ps(src, k, a, b); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_dpbf16_ps(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BF16_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_dpbf16_ps - #define _mm512_mask_dpbf16_ps(src, k, a, b) simde_mm512_mask_dpbf16_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_dpbf16_ps (simde__mmask16 k, simde__m512 src, simde__m512bh a, simde__m512bh b) { - #if defined(SIMDE_X86_AVX512BF16_NATIVE) - return _mm512_maskz_dpbf16_ps(k, src, a, b); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_dpbf16_ps(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BF16_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_dpbf16_ps - #define _mm512_maskz_dpbf16_ps(k, src, a, b) simde_mm512_maskz_dpbf16_ps(k, src, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_DPBF16_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/dpbusd.h b/ffi-deps/simde/simde/x86/avx512/dpbusd.h deleted file mode 100644 index c45f3ca..0000000 --- a/ffi-deps/simde/simde/x86/avx512/dpbusd.h +++ /dev/null @@ -1,292 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_DPBUSD_H) -#define SIMDE_X86_AVX512_DPBUSD_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_dpbusd_epi32(simde__m128i 
src, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm_dpbusd_epi32(src, a, b); - #else - simde__m128i_private - src_ = simde__m128i_to_private(src), - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - #if defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_CONVERT_VECTOR_) - uint32_t x1_ SIMDE_VECTOR(64); - int32_t x2_ SIMDE_VECTOR(64); - simde__m128i_private - r1_[4], - r2_[4]; - - a_.u8 = - SIMDE_SHUFFLE_VECTOR_( - 8, 16, - a_.u8, a_.u8, - 0, 4, 8, 12, - 1, 5, 9, 13, - 2, 6, 10, 14, - 3, 7, 11, 15 - ); - b_.i8 = - SIMDE_SHUFFLE_VECTOR_( - 8, 16, - b_.i8, b_.i8, - 0, 4, 8, 12, - 1, 5, 9, 13, - 2, 6, 10, 14, - 3, 7, 11, 15 - ); - - SIMDE_CONVERT_VECTOR_(x1_, a_.u8); - SIMDE_CONVERT_VECTOR_(x2_, b_.i8); - - simde_memcpy(&r1_, &x1_, sizeof(x1_)); - simde_memcpy(&r2_, &x2_, sizeof(x2_)); - - src_.i32 += - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[0].u32) * r2_[0].i32) + - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[1].u32) * r2_[1].i32) + - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[2].u32) * r2_[2].i32) + - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[3].u32) * r2_[3].i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { - src_.i32[i / 4] += HEDLEY_STATIC_CAST(uint16_t, a_.u8[i]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[i]); - } - #endif - - return simde__m128i_from_private(src_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm_dpbusd_epi32 - #define _mm_dpbusd_epi32(src, a, b) simde_mm_dpbusd_epi32(src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_dpbusd_epi32(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm_mask_dpbusd_epi32(src, k, a, b); - #else - return simde_mm_mask_mov_epi32(src, k, simde_mm_dpbusd_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_dpbusd_epi32 - #define _mm_mask_dpbusd_epi32(src, k, a, b) simde_mm_mask_dpbusd_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_dpbusd_epi32(simde__mmask8 k, simde__m128i src, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm_maskz_dpbusd_epi32(k, src, a, b); - #else - return simde_mm_maskz_mov_epi32(k, simde_mm_dpbusd_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_dpbusd_epi32 - #define _mm_maskz_dpbusd_epi32(k, src, a, b) simde_mm_maskz_dpbusd_epi32(k, src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_dpbusd_epi32(simde__m256i src, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm256_dpbusd_epi32(src, a, b); - #else - simde__m256i_private - src_ = simde__m256i_to_private(src), - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - src_.m128i[0] = simde_mm_dpbusd_epi32(src_.m128i[0], a_.m128i[0], b_.m128i[0]); - src_.m128i[1] = simde_mm_dpbusd_epi32(src_.m128i[1], a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_CONVERT_VECTOR_) - 
uint32_t x1_ SIMDE_VECTOR(128); - int32_t x2_ SIMDE_VECTOR(128); - simde__m256i_private - r1_[4], - r2_[4]; - - a_.u8 = - SIMDE_SHUFFLE_VECTOR_( - 8, 32, - a_.u8, a_.u8, - 0, 4, 8, 12, 16, 20, 24, 28, - 1, 5, 9, 13, 17, 21, 25, 29, - 2, 6, 10, 14, 18, 22, 26, 30, - 3, 7, 11, 15, 19, 23, 27, 31 - ); - b_.i8 = - SIMDE_SHUFFLE_VECTOR_( - 8, 32, - b_.i8, b_.i8, - 0, 4, 8, 12, 16, 20, 24, 28, - 1, 5, 9, 13, 17, 21, 25, 29, - 2, 6, 10, 14, 18, 22, 26, 30, - 3, 7, 11, 15, 19, 23, 27, 31 - ); - - SIMDE_CONVERT_VECTOR_(x1_, a_.u8); - SIMDE_CONVERT_VECTOR_(x2_, b_.i8); - - simde_memcpy(&r1_, &x1_, sizeof(x1_)); - simde_memcpy(&r2_, &x2_, sizeof(x2_)); - - src_.i32 += - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[0].u32) * r2_[0].i32) + - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[1].u32) * r2_[1].i32) + - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[2].u32) * r2_[2].i32) + - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[3].u32) * r2_[3].i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { - src_.i32[i / 4] += HEDLEY_STATIC_CAST(uint16_t, a_.u8[i]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[i]); - } - #endif - - return simde__m256i_from_private(src_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm256_dpbusd_epi32 - #define _mm256_dpbusd_epi32(src, a, b) simde_mm256_dpbusd_epi32(src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_dpbusd_epi32(simde__m256i src, simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm256_mask_dpbusd_epi32(src, k, a, b); - #else - return simde_mm256_mask_mov_epi32(src, k, simde_mm256_dpbusd_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_dpbusd_epi32 - #define _mm256_mask_dpbusd_epi32(src, k, a, b) simde_mm256_mask_dpbusd_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_dpbusd_epi32(simde__mmask8 k, simde__m256i src, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm256_maskz_dpbusd_epi32(k, src, a, b); - #else - return simde_mm256_maskz_mov_epi32(k, simde_mm256_dpbusd_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_dpbusd_epi32 - #define _mm256_maskz_dpbusd_epi32(k, src, a, b) simde_mm256_maskz_dpbusd_epi32(k, src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_dpbusd_epi32(simde__m512i src, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm512_dpbusd_epi32(src, a, b); - #else - simde__m512i_private - src_ = simde__m512i_to_private(src), - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - src_.m256i[0] = simde_mm256_dpbusd_epi32(src_.m256i[0], a_.m256i[0], b_.m256i[0]); - src_.m256i[1] = simde_mm256_dpbusd_epi32(src_.m256i[1], a_.m256i[1], b_.m256i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_CONVERT_VECTOR_) - uint32_t x1_ SIMDE_VECTOR(256); - int32_t x2_ SIMDE_VECTOR(256); - simde__m512i_private - r1_[4], - r2_[4]; - - a_.u8 = - SIMDE_SHUFFLE_VECTOR_( - 8, 64, - a_.u8, a_.u8, - 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 
56, 60, - 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, - 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, - 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 - ); - b_.i8 = - SIMDE_SHUFFLE_VECTOR_( - 8, 64, - b_.i8, b_.i8, - 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, - 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, - 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, - 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 - ); - - SIMDE_CONVERT_VECTOR_(x1_, a_.u8); - SIMDE_CONVERT_VECTOR_(x2_, b_.i8); - - simde_memcpy(&r1_, &x1_, sizeof(x1_)); - simde_memcpy(&r2_, &x2_, sizeof(x2_)); - - src_.i32 += - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[0].u32) * r2_[0].i32) + - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[1].u32) * r2_[1].i32) + - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[2].u32) * r2_[2].i32) + - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[3].u32) * r2_[3].i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { - src_.i32[i / 4] += HEDLEY_STATIC_CAST(uint16_t, a_.u8[i]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[i]); - } - #endif - - return simde__m512i_from_private(src_); - #endif -} -#if defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm512_dpbusd_epi32 - #define _mm512_dpbusd_epi32(src, a, b) simde_mm512_dpbusd_epi32(src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_dpbusd_epi32(simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm512_mask_dpbusd_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_dpbusd_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_dpbusd_epi32 - #define _mm512_mask_dpbusd_epi32(src, k, a, b) simde_mm512_mask_dpbusd_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_dpbusd_epi32(simde__mmask16 k, simde__m512i src, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm512_maskz_dpbusd_epi32(k, src, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_dpbusd_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_dpbusd_epi32 - #define _mm512_maskz_dpbusd_epi32(k, src, a, b) simde_mm512_maskz_dpbusd_epi32(k, src, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_DPBUSD_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/dpbusds.h b/ffi-deps/simde/simde/x86/avx512/dpbusds.h deleted file mode 100644 index 0168fed..0000000 --- a/ffi-deps/simde/simde/x86/avx512/dpbusds.h +++ /dev/null @@ -1,344 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_DPBUSDS_H) -#define SIMDE_X86_AVX512_DPBUSDS_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_dpbusds_epi32(simde__m128i src, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm_dpbusds_epi32(src, a, b); - #else - simde__m128i_private - src_ = simde__m128i_to_private(src), - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - uint32_t x1_ SIMDE_VECTOR(64); - int32_t 
x2_ SIMDE_VECTOR(64); - simde__m128i_private - r1_[4], - r2_[4]; - - a_.u8 = - SIMDE_SHUFFLE_VECTOR_( - 8, 16, - a_.u8, a_.u8, - 0, 4, 8, 12, - 1, 5, 9, 13, - 2, 6, 10, 14, - 3, 7, 11, 15 - ); - b_.i8 = - SIMDE_SHUFFLE_VECTOR_( - 8, 16, - b_.i8, b_.i8, - 0, 4, 8, 12, - 1, 5, 9, 13, - 2, 6, 10, 14, - 3, 7, 11, 15 - ); - - SIMDE_CONVERT_VECTOR_(x1_, a_.u8); - SIMDE_CONVERT_VECTOR_(x2_, b_.i8); - - simde_memcpy(&r1_, &x1_, sizeof(x1_)); - simde_memcpy(&r2_, &x2_, sizeof(x2_)); - - uint32_t au SIMDE_VECTOR(16) = - HEDLEY_REINTERPRET_CAST( - __typeof__(au), - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[0].u32) * r2_[0].i32) + - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[1].u32) * r2_[1].i32) + - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[2].u32) * r2_[2].i32) + - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[3].u32) * r2_[3].i32) - ); - uint32_t bu SIMDE_VECTOR(16) = HEDLEY_REINTERPRET_CAST(__typeof__(bu), src_.i32); - uint32_t ru SIMDE_VECTOR(16) = au + bu; - - au = (au >> 31) + INT32_MAX; - - uint32_t m SIMDE_VECTOR(16) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(src_.i32), (au ^ bu) | ~(bu ^ ru)) < 0); - src_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(src_.i32), (au & ~m) | (ru & m)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0]) / 4) ; i++) { - src_.i32[i] = - simde_math_adds_i32( - src_.i32[i], - HEDLEY_STATIC_CAST(uint16_t, a_.u8[(4 * i) ]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[(4 * i) ]) + - HEDLEY_STATIC_CAST(uint16_t, a_.u8[(4 * i) + 1]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[(4 * i) + 1]) + - HEDLEY_STATIC_CAST(uint16_t, a_.u8[(4 * i) + 2]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[(4 * i) + 2]) + - HEDLEY_STATIC_CAST(uint16_t, a_.u8[(4 * i) + 3]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[(4 * i) + 3]) - ); - } - #endif - - return simde__m128i_from_private(src_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm_dpbusds_epi32 - #define _mm_dpbusds_epi32(src, a, b) simde_mm_dpbusds_epi32(src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_dpbusds_epi32(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm_mask_dpbusds_epi32(src, k, a, b); - #else - return simde_mm_mask_mov_epi32(src, k, simde_mm_dpbusds_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_dpbusds_epi32 - #define _mm_mask_dpbusds_epi32(src, k, a, b) simde_mm_mask_dpbusds_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_dpbusds_epi32(simde__mmask8 k, simde__m128i src, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm_maskz_dpbusds_epi32(k, src, a, b); - #else - return simde_mm_maskz_mov_epi32(k, simde_mm_dpbusds_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_dpbusds_epi32 - #define _mm_maskz_dpbusds_epi32(k, src, a, b) simde_mm_maskz_dpbusds_epi32(k, src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_dpbusds_epi32(simde__m256i src, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm256_dpbusds_epi32(src, a, 
b); - #else - simde__m256i_private - src_ = simde__m256i_to_private(src), - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - src_.m128i[0] = simde_mm_dpbusds_epi32(src_.m128i[0], a_.m128i[0], b_.m128i[0]); - src_.m128i[1] = simde_mm_dpbusds_epi32(src_.m128i[1], a_.m128i[1], b_.m128i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - uint32_t x1_ SIMDE_VECTOR(128); - int32_t x2_ SIMDE_VECTOR(128); - simde__m256i_private - r1_[4], - r2_[4]; - - a_.u8 = - SIMDE_SHUFFLE_VECTOR_( - 8, 32, - a_.u8, a_.u8, - 0, 4, 8, 12, 16, 20, 24, 28, - 1, 5, 9, 13, 17, 21, 25, 29, - 2, 6, 10, 14, 18, 22, 26, 30, - 3, 7, 11, 15, 19, 23, 27, 31 - ); - b_.i8 = - SIMDE_SHUFFLE_VECTOR_( - 8, 32, - b_.i8, b_.i8, - 0, 4, 8, 12, 16, 20, 24, 28, - 1, 5, 9, 13, 17, 21, 25, 29, - 2, 6, 10, 14, 18, 22, 26, 30, - 3, 7, 11, 15, 19, 23, 27, 31 - ); - - SIMDE_CONVERT_VECTOR_(x1_, a_.u8); - SIMDE_CONVERT_VECTOR_(x2_, b_.i8); - - simde_memcpy(&r1_, &x1_, sizeof(x1_)); - simde_memcpy(&r2_, &x2_, sizeof(x2_)); - - uint32_t au SIMDE_VECTOR(32) = - HEDLEY_REINTERPRET_CAST( - __typeof__(au), - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[0].u32) * r2_[0].i32) + - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[1].u32) * r2_[1].i32) + - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[2].u32) * r2_[2].i32) + - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[3].u32) * r2_[3].i32) - ); - uint32_t bu SIMDE_VECTOR(32) = HEDLEY_REINTERPRET_CAST(__typeof__(bu), src_.i32); - uint32_t ru SIMDE_VECTOR(32) = au + bu; - - au = (au >> 31) + INT32_MAX; - - uint32_t m SIMDE_VECTOR(32) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(src_.i32), (au ^ bu) | ~(bu ^ ru)) < 0); - src_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(src_.i32), (au & ~m) | (ru & m)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0]) / 4) ; i++) { - src_.i32[i] = - simde_math_adds_i32( - src_.i32[i], - HEDLEY_STATIC_CAST(uint16_t, a_.u8[(4 * i) ]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[(4 * i) ]) + - HEDLEY_STATIC_CAST(uint16_t, a_.u8[(4 * i) + 1]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[(4 * i) + 1]) + - HEDLEY_STATIC_CAST(uint16_t, a_.u8[(4 * i) + 2]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[(4 * i) + 2]) + - HEDLEY_STATIC_CAST(uint16_t, a_.u8[(4 * i) + 3]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[(4 * i) + 3]) - ); - } - #endif - - return simde__m256i_from_private(src_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm256_dpbusds_epi32 - #define _mm256_dpbusds_epi32(src, a, b) simde_mm256_dpbusds_epi32(src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_dpbusds_epi32(simde__m256i src, simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm256_mask_dpbusds_epi32(src, k, a, b); - #else - return simde_mm256_mask_mov_epi32(src, k, simde_mm256_dpbusds_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_dpbusds_epi32 - #define _mm256_mask_dpbusds_epi32(src, k, a, b) simde_mm256_mask_dpbusds_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_dpbusds_epi32(simde__mmask8 k, simde__m256i src, simde__m256i a, simde__m256i b) { - #if 
defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm256_maskz_dpbusds_epi32(k, src, a, b); - #else - return simde_mm256_maskz_mov_epi32(k, simde_mm256_dpbusds_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_dpbusds_epi32 - #define _mm256_maskz_dpbusds_epi32(k, src, a, b) simde_mm256_maskz_dpbusds_epi32(k, src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_dpbusds_epi32(simde__m512i src, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm512_dpbusds_epi32(src, a, b); - #else - simde__m512i_private - src_ = simde__m512i_to_private(src), - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - src_.m256i[0] = simde_mm256_dpbusds_epi32(src_.m256i[0], a_.m256i[0], b_.m256i[0]); - src_.m256i[1] = simde_mm256_dpbusds_epi32(src_.m256i[1], a_.m256i[1], b_.m256i[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - uint32_t x1_ SIMDE_VECTOR(256); - int32_t x2_ SIMDE_VECTOR(256); - simde__m512i_private - r1_[4], - r2_[4]; - - a_.u8 = - SIMDE_SHUFFLE_VECTOR_( - 8, 64, - a_.u8, a_.u8, - 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, - 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, - 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, - 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 - ); - b_.i8 = - SIMDE_SHUFFLE_VECTOR_( - 8, 64, - b_.i8, b_.i8, - 0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60, - 1, 5, 9, 13, 17, 21, 25, 29, 33, 37, 41, 45, 49, 53, 57, 61, - 2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46, 50, 54, 58, 62, - 3, 7, 11, 15, 19, 23, 27, 31, 35, 39, 43, 47, 51, 55, 59, 63 - ); - - SIMDE_CONVERT_VECTOR_(x1_, a_.u8); - SIMDE_CONVERT_VECTOR_(x2_, b_.i8); - - simde_memcpy(&r1_, &x1_, sizeof(x1_)); - simde_memcpy(&r2_, &x2_, sizeof(x2_)); - - uint32_t au SIMDE_VECTOR(64) = - HEDLEY_REINTERPRET_CAST( - __typeof__(au), - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[0].u32) * r2_[0].i32) + - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[1].u32) * r2_[1].i32) + - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[2].u32) * r2_[2].i32) + - (HEDLEY_REINTERPRET_CAST(__typeof__(a_.i32), r1_[3].u32) * r2_[3].i32) - ); - uint32_t bu SIMDE_VECTOR(64) = HEDLEY_REINTERPRET_CAST(__typeof__(bu), src_.i32); - uint32_t ru SIMDE_VECTOR(64) = au + bu; - - au = (au >> 31) + INT32_MAX; - - uint32_t m SIMDE_VECTOR(64) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(src_.i32), (au ^ bu) | ~(bu ^ ru)) < 0); - src_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(src_.i32), (au & ~m) | (ru & m)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0]) / 4) ; i++) { - src_.i32[i] = - simde_math_adds_i32( - src_.i32[i], - HEDLEY_STATIC_CAST(uint16_t, a_.u8[(4 * i) ]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[(4 * i) ]) + - HEDLEY_STATIC_CAST(uint16_t, a_.u8[(4 * i) + 1]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[(4 * i) + 1]) + - HEDLEY_STATIC_CAST(uint16_t, a_.u8[(4 * i) + 2]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[(4 * i) + 2]) + - HEDLEY_STATIC_CAST(uint16_t, a_.u8[(4 * i) + 3]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[(4 * i) + 3]) - ); - } - #endif - - return simde__m512i_from_private(src_); - #endif -} -#if defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm512_dpbusds_epi32 - 
#define _mm512_dpbusds_epi32(src, a, b) simde_mm512_dpbusds_epi32(src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_dpbusds_epi32(simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm512_mask_dpbusds_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_dpbusds_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_dpbusds_epi32 - #define _mm512_mask_dpbusds_epi32(src, k, a, b) simde_mm512_mask_dpbusds_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_dpbusds_epi32(simde__mmask16 k, simde__m512i src, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm512_maskz_dpbusds_epi32(k, src, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_dpbusds_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_dpbusds_epi32 - #define _mm512_maskz_dpbusds_epi32(k, src, a, b) simde_mm512_maskz_dpbusds_epi32(k, src, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_DPBUSDS_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/dpwssd.h b/ffi-deps/simde/simde/x86/avx512/dpwssd.h deleted file mode 100644 index 33b0ce5..0000000 --- a/ffi-deps/simde/simde/x86/avx512/dpwssd.h +++ /dev/null @@ -1,269 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_DPWSSD_H) -#define SIMDE_X86_AVX512_DPWSSD_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_dpwssd_epi32(simde__m128i src, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm_dpwssd_epi32(src, a, b); - #else - simde__m128i_private - src_ = simde__m128i_to_private(src), - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_CONVERT_VECTOR_) - int32_t x1_ SIMDE_VECTOR(32); - int32_t x2_ SIMDE_VECTOR(32); - simde__m128i_private - r1_[2], - r2_[2]; - - a_.i16 = - SIMDE_SHUFFLE_VECTOR_( - 16, 16, - a_.i16, a_.i16, - 0, 2, 4, 6, - 1, 3, 5, 7 - ); - b_.i16 = - SIMDE_SHUFFLE_VECTOR_( - 16, 16, - b_.i16, b_.i16, - 0, 2, 4, 6, - 1, 3, 5, 7 - ); - - SIMDE_CONVERT_VECTOR_(x1_, a_.i16); - SIMDE_CONVERT_VECTOR_(x2_, b_.i16); - - simde_memcpy(&r1_, &x1_, sizeof(x1_)); - simde_memcpy(&r2_, &x2_, sizeof(x2_)); - - src_.i32 += - (r1_[0].i32 * r2_[0].i32) + - (r1_[1].i32 * r2_[1].i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.i16[0])) ; i++) { - src_.i32[i / 2] += HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i]); - } - #endif - - return simde__m128i_from_private(src_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm_dpwssd_epi32 - #define _mm_dpwssd_epi32(src, a, b) simde_mm_dpwssd_epi32(src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_dpwssd_epi32(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm_mask_dpwssd_epi32(src, k, a, b); - #else - return simde_mm_mask_mov_epi32(src, k, simde_mm_dpwssd_epi32(src, a, b)); - #endif -} -#if 
defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_dpwssd_epi32 - #define _mm_mask_dpwssd_epi32(src, k, a, b) simde_mm_mask_dpwssd_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_dpwssd_epi32(simde__mmask8 k, simde__m128i src, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm_maskz_dpwssd_epi32(k, src, a, b); - #else - return simde_mm_maskz_mov_epi32(k, simde_mm_dpwssd_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_dpwssd_epi32 - #define _mm_maskz_dpwssd_epi32(k, src, a, b) simde_mm_maskz_dpwssd_epi32(k, src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_dpwssd_epi32(simde__m256i src, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm256_dpwssd_epi32(src, a, b); - #else - simde__m256i_private - src_ = simde__m256i_to_private(src), - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_CONVERT_VECTOR_) - int32_t x1_ SIMDE_VECTOR(64); - int32_t x2_ SIMDE_VECTOR(64); - simde__m256i_private - r1_[2], - r2_[2]; - - a_.i16 = - SIMDE_SHUFFLE_VECTOR_( - 16, 32, - a_.i16, a_.i16, - 0, 2, 4, 6, 8, 10, 12, 14, - 1, 3, 5, 7, 9, 11, 13, 15 - ); - b_.i16 = - SIMDE_SHUFFLE_VECTOR_( - 16, 32, - b_.i16, b_.i16, - 0, 2, 4, 6, 8, 10, 12, 14, - 1, 3, 5, 7, 9, 11, 13, 15 - ); - - SIMDE_CONVERT_VECTOR_(x1_, a_.i16); - SIMDE_CONVERT_VECTOR_(x2_, b_.i16); - - simde_memcpy(&r1_, &x1_, sizeof(x1_)); - simde_memcpy(&r2_, &x2_, sizeof(x2_)); - - src_.i32 += - (r1_[0].i32 * r2_[0].i32) + - (r1_[1].i32 * r2_[1].i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.i16[0])) ; i++) { - src_.i32[i / 2] += HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i]); - } - #endif - - return simde__m256i_from_private(src_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm256_dpwssd_epi32 - #define _mm256_dpwssd_epi32(src, a, b) simde_mm256_dpwssd_epi32(src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_dpwssd_epi32(simde__m256i src, simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm256_mask_dpwssd_epi32(src, k, a, b); - #else - return simde_mm256_mask_mov_epi32(src, k, simde_mm256_dpwssd_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_dpwssd_epi32 - #define _mm256_mask_dpwssd_epi32(src, k, a, b) simde_mm256_mask_dpwssd_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_dpwssd_epi32(simde__mmask8 k, simde__m256i src, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm256_maskz_dpwssd_epi32(k, src, a, b); - #else - return simde_mm256_maskz_mov_epi32(k, simde_mm256_dpwssd_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_dpwssd_epi32 - #define _mm256_maskz_dpwssd_epi32(k, 
src, a, b) simde_mm256_maskz_dpwssd_epi32(k, src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_dpwssd_epi32(simde__m512i src, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm512_dpwssd_epi32(src, a, b); - #else - simde__m512i_private - src_ = simde__m512i_to_private(src), - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_CONVERT_VECTOR_) - int32_t x1_ SIMDE_VECTOR(128); - int32_t x2_ SIMDE_VECTOR(128); - simde__m512i_private - r1_[2], - r2_[2]; - - a_.i16 = - SIMDE_SHUFFLE_VECTOR_( - 16, 64, - a_.i16, a_.i16, - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 - ); - b_.i16 = - SIMDE_SHUFFLE_VECTOR_( - 16, 64, - b_.i16, b_.i16, - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 - ); - - SIMDE_CONVERT_VECTOR_(x1_, a_.i16); - SIMDE_CONVERT_VECTOR_(x2_, b_.i16); - - simde_memcpy(&r1_, &x1_, sizeof(x1_)); - simde_memcpy(&r2_, &x2_, sizeof(x2_)); - - src_.i32 += - (r1_[0].i32 * r2_[0].i32) + - (r1_[1].i32 * r2_[1].i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u16) / sizeof(a_.i16[0])) ; i++) { - src_.i32[i / 2] += HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i]); - } - #endif - - return simde__m512i_from_private(src_); - #endif -} -#if defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm512_dpwssd_epi32 - #define _mm512_dpwssd_epi32(src, a, b) simde_mm512_dpwssd_epi32(src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_dpwssd_epi32(simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm512_mask_dpwssd_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_dpwssd_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_dpwssd_epi32 - #define _mm512_mask_dpwssd_epi32(src, k, a, b) simde_mm512_mask_dpwssd_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_dpwssd_epi32(simde__mmask16 k, simde__m512i src, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm512_maskz_dpwssd_epi32(k, src, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_dpwssd_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_dpwssd_epi32 - #define _mm512_maskz_dpwssd_epi32(k, src, a, b) simde_mm512_maskz_dpwssd_epi32(k, src, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_DPWSSD_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/dpwssds.h b/ffi-deps/simde/simde/x86/avx512/dpwssds.h deleted file mode 100644 index ea72091..0000000 --- a/ffi-deps/simde/simde/x86/avx512/dpwssds.h +++ /dev/null @@ -1,299 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_DPWSSDS_H) -#define SIMDE_X86_AVX512_DPWSSDS_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_dpwssds_epi32 (simde__m128i src, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VNNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_dpwssds_epi32(src, a, b); - #else - simde__m128i_private - src_ = simde__m128i_to_private(src), - a_ = 
simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - int32_t x1_ SIMDE_VECTOR(32); - int32_t x2_ SIMDE_VECTOR(32); - simde__m128i_private - r1_[2], - r2_[2]; - - a_.i16 = - SIMDE_SHUFFLE_VECTOR_( - 16, 16, - a_.i16, a_.i16, - 0, 2, 4, 6, - 1, 3, 5, 7 - ); - b_.i16 = - SIMDE_SHUFFLE_VECTOR_( - 16, 16, - b_.i16, b_.i16, - 0, 2, 4, 6, - 1, 3, 5, 7 - ); - - SIMDE_CONVERT_VECTOR_(x1_, a_.i16); - SIMDE_CONVERT_VECTOR_(x2_, b_.i16); - - simde_memcpy(&r1_, &x1_, sizeof(x1_)); - simde_memcpy(&r2_, &x2_, sizeof(x2_)); - - uint32_t au SIMDE_VECTOR(16) = HEDLEY_REINTERPRET_CAST(__typeof__(src_.u32), ((r1_[0].i32 * r2_[0].i32) + (r1_[1].i32 * r2_[1].i32))); - uint32_t bu SIMDE_VECTOR(16) = HEDLEY_REINTERPRET_CAST(__typeof__(src_.u32), src_.i32); - uint32_t ru SIMDE_VECTOR(16) = au + bu; - - au = (au >> 31) + INT32_MAX; - - uint32_t m SIMDE_VECTOR(16) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(src_.i32), (au ^ bu) | ~(bu ^ ru)) < 0); - src_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(src_.i32), (au & ~m) | (ru & m)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i16) / sizeof(a_.i16[0]) / 2) ; i++) { - src_.i32[i] = - simde_math_adds_i32( - src_.i32[i], - HEDLEY_STATIC_CAST(int32_t, a_.i16[(2 * i) ]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[(2 * i) ]) + - HEDLEY_STATIC_CAST(int32_t, a_.i16[(2 * i) + 1]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[(2 * i) + 1]) - ); - } - #endif - - return simde__m128i_from_private(src_); - #endif -} -#if defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_dpwssds_epi32 - #define _mm_dpwssds_epi32(src, a, b) simde_mm_dpwssds_epi32(src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_dpwssds_epi32 (simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VNNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_dpwssds_epi32(src, k, a, b); - #else - return simde_mm_mask_mov_epi32(src, k, simde_mm_dpwssds_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_dpwssds_epi32 - #define _mm_mask_dpwssds_epi32(src, k, a, b) simde_mm_mask_dpwssds_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_dpwssds_epi32 (simde__mmask8 k, simde__m128i src, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VNNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_dpwssds_epi32(k, src, a, b); - #else - return simde_mm_maskz_mov_epi32(k, simde_mm_dpwssds_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_dpwssds_epi32 - #define _mm_maskz_dpwssds_epi32(k, src, a, b) simde_mm_maskz_dpwssds_epi32(k, src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_dpwssds_epi32 (simde__m256i src, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VNNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_dpwssds_epi32(src, a, b); - #else - simde__m256i_private - src_ = simde__m256i_to_private(src), - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - int32_t x1_ 
SIMDE_VECTOR(64); - int32_t x2_ SIMDE_VECTOR(64); - simde__m256i_private - r1_[2], - r2_[2]; - - a_.i16 = - SIMDE_SHUFFLE_VECTOR_( - 16, 32, - a_.i16, a_.i16, - 0, 2, 4, 6, 8, 10, 12, 14, - 1, 3, 5, 7, 9, 11, 13, 15 - ); - b_.i16 = - SIMDE_SHUFFLE_VECTOR_( - 16, 32, - b_.i16, b_.i16, - 0, 2, 4, 6, 8, 10, 12, 14, - 1, 3, 5, 7, 9, 11, 13, 15 - ); - - SIMDE_CONVERT_VECTOR_(x1_, a_.i16); - SIMDE_CONVERT_VECTOR_(x2_, b_.i16); - - simde_memcpy(&r1_, &x1_, sizeof(x1_)); - simde_memcpy(&r2_, &x2_, sizeof(x2_)); - - uint32_t au SIMDE_VECTOR(32) = HEDLEY_REINTERPRET_CAST(__typeof__(src_.u32), ((r1_[0].i32 * r2_[0].i32) + (r1_[1].i32 * r2_[1].i32))); - uint32_t bu SIMDE_VECTOR(32) = HEDLEY_REINTERPRET_CAST(__typeof__(src_.u32), src_.i32); - uint32_t ru SIMDE_VECTOR(32) = au + bu; - - au = (au >> 31) + INT32_MAX; - - uint32_t m SIMDE_VECTOR(32) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(src_.i32), (au ^ bu) | ~(bu ^ ru)) < 0); - src_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(src_.i32), (au & ~m) | (ru & m)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i16) / sizeof(a_.i16[0]) / 2) ; i++) { - src_.i32[i] = - simde_math_adds_i32( - src_.i32[i], - HEDLEY_STATIC_CAST(int32_t, a_.i16[(2 * i) ]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[(2 * i) ]) + - HEDLEY_STATIC_CAST(int32_t, a_.i16[(2 * i) + 1]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[(2 * i) + 1]) - ); - } - #endif - - return simde__m256i_from_private(src_); - #endif -} -#if defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_dpwssds_epi32 - #define _mm256_dpwssds_epi32(src, a, b) simde_mm256_dpwssds_epi32(src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_dpwssds_epi32 (simde__m256i src, simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VNNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_dpwssds_epi32(src, k, a, b); - #else - return simde_mm256_mask_mov_epi32(src, k, simde_mm256_dpwssds_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_dpwssds_epi32 - #define _mm256_mask_dpwssds_epi32(src, k, a, b) simde_mm256_mask_dpwssds_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_dpwssds_epi32 (simde__mmask8 k, simde__m256i src, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VNNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_dpwssds_epi32(k, src, a, b); - #else - return simde_mm256_maskz_mov_epi32(k, simde_mm256_dpwssds_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_dpwssds_epi32 - #define _mm256_maskz_dpwssds_epi32(k, src, a, b) simde_mm256_maskz_dpwssds_epi32(k, src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_dpwssds_epi32 (simde__m512i src, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm512_dpwssds_epi32(src, a, b); - #else - simde__m512i_private - src_ = simde__m512i_to_private(src), - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - int32_t x1_ SIMDE_VECTOR(128); - int32_t x2_ SIMDE_VECTOR(128); - simde__m512i_private - r1_[2], - r2_[2]; - - a_.i16 = - 
SIMDE_SHUFFLE_VECTOR_( - 16, 64, - a_.i16, a_.i16, - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 - ); - b_.i16 = - SIMDE_SHUFFLE_VECTOR_( - 16, 64, - b_.i16, b_.i16, - 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, - 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 - ); - - SIMDE_CONVERT_VECTOR_(x1_, a_.i16); - SIMDE_CONVERT_VECTOR_(x2_, b_.i16); - - simde_memcpy(&r1_, &x1_, sizeof(x1_)); - simde_memcpy(&r2_, &x2_, sizeof(x2_)); - - uint32_t au SIMDE_VECTOR(64) = HEDLEY_REINTERPRET_CAST(__typeof__(src_.u32), ((r1_[0].i32 * r2_[0].i32) + (r1_[1].i32 * r2_[1].i32))); - uint32_t bu SIMDE_VECTOR(64) = HEDLEY_REINTERPRET_CAST(__typeof__(src_.u32), src_.i32); - uint32_t ru SIMDE_VECTOR(64) = au + bu; - - au = (au >> 31) + INT32_MAX; - - uint32_t m SIMDE_VECTOR(64) = HEDLEY_REINTERPRET_CAST(__typeof__(m), HEDLEY_REINTERPRET_CAST(__typeof__(src_.i32), (au ^ bu) | ~(bu ^ ru)) < 0); - src_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(src_.i32), (au & ~m) | (ru & m)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i16) / sizeof(a_.i16[0]) / 2) ; i++) { - src_.i32[i] = - simde_math_adds_i32( - src_.i32[i], - HEDLEY_STATIC_CAST(int32_t, a_.i16[(2 * i) ]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[(2 * i) ]) + - HEDLEY_STATIC_CAST(int32_t, a_.i16[(2 * i) + 1]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[(2 * i) + 1]) - ); - } - #endif - - return simde__m512i_from_private(src_); - #endif -} -#if defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm512_dpwssds_epi32 - #define _mm512_dpwssds_epi32(src, a, b) simde_mm512_dpwssds_epi32(src, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_dpwssds_epi32 (simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm512_mask_dpwssds_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_dpwssds_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_dpwssds_epi32 - #define _mm512_mask_dpwssds_epi32(src, k, a, b) simde_mm512_mask_dpwssds_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_dpwssds_epi32 (simde__mmask16 k, simde__m512i src, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VNNI_NATIVE) - return _mm512_maskz_dpwssds_epi32(k, src, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_dpwssds_epi32(src, a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VNNI_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_dpwssds_epi32 - #define _mm512_maskz_dpwssds_epi32(k, src, a, b) simde_mm512_maskz_dpwssds_epi32(k, src, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_DPWSSDS_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/expand.h b/ffi-deps/simde/simde/x86/avx512/expand.h deleted file mode 100644 index 4afba87..0000000 --- a/ffi-deps/simde/simde/x86/avx512/expand.h +++ /dev/null @@ -1,97 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following 
conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2021 Andrew Rodriguez - * 2021 Evan Nemerson - */ - -#if !defined(SIMDE_X86_AVX512_EXPAND_H) -#define SIMDE_X86_AVX512_EXPAND_H - -#include "types.h" -#include "mov.h" -#include "mov_mask.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_expand_epi32(simde__m256i src, simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_expand_epi32(src, k, a); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - src_ = simde__m256i_to_private(src); - simde__m256i_private r_; - - size_t src_idx = 0; - for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - if (k & (UINT64_C(1) << i)) { - r_.i32[i] = a_.i32[src_idx++]; - } else { - r_.i32[i] = src_.i32[i]; - } - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_expand_epi32 - #define _mm256_mask_expand_epi32(src, k, a) simde_mm256_mask_expand_epi32((src), (k), (a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_expand_epi32(simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_expand_epi32(k, a); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - r_; - - size_t src_idx = 0; - for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - if (k & (UINT64_C(1) << i)) { - r_.i32[i] = a_.i32[src_idx++]; - } else { - r_.i32[i] = INT32_C(0); - } - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_expand_epi32 - #define _mm256_maskz_expand_epi32(k, a) simde_mm256_maskz_expand_epi32((k), (a)) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_EXPAND_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/extract.h b/ffi-deps/simde/simde/x86/avx512/extract.h deleted file mode 100644 index 251715c..0000000 --- a/ffi-deps/simde/simde/x86/avx512/extract.h +++ /dev/null @@ -1,267 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Christopher Moore - */ - -#if !defined(SIMDE_X86_AVX512_EXTRACT_H) -#define SIMDE_X86_AVX512_EXTRACT_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm256_extractf32x4_ps (simde__m256 a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m256_private a_ = simde__m256_to_private(a); - - return a_.m128[imm8 & 1]; -} -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_extractf32x4_ps(a, imm8) _mm256_extractf32x4_ps(a, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_extractf32x4_ps - #define _mm256_extractf32x4_ps(a, imm8) simde_mm256_extractf32x4_ps((a), (imm8)) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm512_extractf32x4_ps (simde__m512 a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m512_private a_ = simde__m512_to_private(a); - - /* GCC 6 generates an ICE */ - #if defined(HEDLEY_GCC_VERSION) && !HEDLEY_GCC_VERSION_CHECK(7,0,0) - return a_.m128[imm8 & 3]; - #else - simde__m128_private r_; - const size_t offset = HEDLEY_STATIC_CAST(size_t, imm8 & 3) * (sizeof(r_.f32) / sizeof(r_.f32[0])); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i + offset]; - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) - #define simde_mm512_extractf32x4_ps(a, imm8) _mm512_extractf32x4_ps(a, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_extractf32x4_ps - #define _mm512_extractf32x4_ps(a, imm8) simde_mm512_extractf32x4_ps((a), (imm8)) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) - #define simde_mm512_mask_extractf32x4_ps(src, k, a, imm8) _mm512_mask_extractf32x4_ps(src, k, a, imm8) -#else - #define simde_mm512_mask_extractf32x4_ps(src, k, a, imm8) simde_mm_mask_mov_ps((src), (k), simde_mm512_extractf32x4_ps((a), (imm8))) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_extractf32x4_ps - #define _mm512_mask_extractf32x4_ps(src, k, a, imm8) simde_mm512_mask_extractf32x4_ps((src), (k), (a), (imm8)) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) - #define simde_mm512_maskz_extractf32x4_ps(k, a, imm8) _mm512_maskz_extractf32x4_ps(k, a, imm8) -#else - #define simde_mm512_maskz_extractf32x4_ps(k, a, imm8) simde_mm_maskz_mov_ps((k), simde_mm512_extractf32x4_ps((a), (imm8))) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_extractf32x4_ps - #define _mm512_maskz_extractf32x4_ps(k, a, imm8) simde_mm512_maskz_extractf32x4_ps((k), (a), (imm8)) -#endif - 
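The extractf32x4 helpers above emulate the AVX-512 lane extract by treating the 512-bit value as four 128-bit lanes and indexing with imm8 & 3, then build the mask/maskz variants on top with simde_mm_mask_mov_ps and simde_mm_maskz_mov_ps. A minimal caller-side sketch follows; the function name, mask value, and include path are hypothetical and assume the ffi-deps/simde tree being removed here is still on the include path, so this is an illustration of the fallback idiom, not code from the deleted sources.

    /* Hypothetical sketch: pull the third 128-bit lane out of a 512-bit vector
     * and keep only its even-indexed floats via a zero-masking move.
     * extract.h includes mov.h (per the header above), so one include suffices. */
    #include "simde/x86/avx512/extract.h"

    static simde__m128
    third_lane_even_only(simde__m512 v) {
      simde__m128 lane = simde_mm512_extractf32x4_ps(v, 2);   /* imm8 & 3 == 2 selects lane 2 */
      /* mask bits 0 and 2 set: elements 1 and 3 are zeroed */
      return simde_mm_maskz_mov_ps(HEDLEY_STATIC_CAST(simde__mmask8, 0x5), lane);
    }

On hardware without AVX-512F/VL the two calls reduce to the portable paths shown in this hunk (a plain lane copy followed by a per-element select), so the sketch behaves identically whether or not the native intrinsics are available.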
-SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm512_extractf32x8_ps (simde__m512 a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m512_private a_ = simde__m512_to_private(a); - - return a_.m256[imm8 & 1]; -} -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_extractf32x8_ps(a, imm8) _mm512_extractf32x8_ps(a, imm8) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_extractf32x8_ps - #define _mm512_extractf32x8_ps(a, imm8) simde_mm512_extractf32x8_ps(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm512_extractf64x4_pd (simde__m512d a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m512d_private a_ = simde__m512d_to_private(a); - - return a_.m256d[imm8 & 1]; -} -#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) - #define simde_mm512_extractf64x4_pd(a, imm8) _mm512_extractf64x4_pd(a, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_extractf64x4_pd - #define _mm512_extractf64x4_pd(a, imm8) simde_mm512_extractf64x4_pd(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) - #define simde_mm512_mask_extractf64x4_pd(src, k, a, imm8) _mm512_mask_extractf64x4_pd(src, k, a, imm8) -#else - #define simde_mm512_mask_extractf64x4_pd(src, k, a, imm8) simde_mm256_mask_mov_pd(src, k, simde_mm512_extractf64x4_pd(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_extractf64x4_pd - #define _mm512_mask_extractf64x4_pd(src, k, a, imm8) simde_mm512_mask_extractf64x4_pd(src, k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) - #define simde_mm512_maskz_extractf64x4_pd(k, a, imm8) _mm512_maskz_extractf64x4_pd(k, a, imm8) -#else - #define simde_mm512_maskz_extractf64x4_pd(k, a, imm8) simde_mm256_maskz_mov_pd(k, simde_mm512_extractf64x4_pd(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_extractf64x4_pd - #define _mm512_maskz_extractf64x4_pd(k, a, imm8) simde_mm512_maskz_extractf64x4_pd(k, a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm512_extracti32x4_epi32 (simde__m512i a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m512i_private a_ = simde__m512i_to_private(a); - - return a_.m128i[imm8 & 3]; -} -#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) - #define simde_mm512_extracti32x4_epi32(a, imm8) _mm512_extracti32x4_epi32(a, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_extracti32x4_epi32 - #define _mm512_extracti32x4_epi32(a, imm8) simde_mm512_extracti32x4_epi32(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) - #define simde_mm512_mask_extracti32x4_epi32(src, k, a, imm8) _mm512_mask_extracti32x4_epi32(src, k, a, imm8) -#else - #define simde_mm512_mask_extracti32x4_epi32(src, k, a, imm8) simde_mm_mask_mov_epi32(src, k, simde_mm512_extracti32x4_epi32(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_extracti32x4_epi32 - #define _mm512_mask_extracti32x4_epi32(src, k, a, imm8) simde_mm512_mask_extracti32x4_epi32(src, k, a, imm8) -#endif - -#if 
defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) - #define simde_mm512_maskz_extracti32x4_epi32(k, a, imm8) _mm512_maskz_extracti32x4_epi32(k, a, imm8) -#else - #define simde_mm512_maskz_extracti32x4_epi32(k, a, imm8) simde_mm_maskz_mov_epi32(k, simde_mm512_extracti32x4_epi32(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_extracti32x4_epi32 - #define _mm512_maskz_extracti32x4_epi32(k, a, imm8) simde_mm512_maskz_extracti32x4_epi32(k, a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm512_extracti32x8_epi32 (simde__m512i a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m512i_private a_ = simde__m512i_to_private(a); - - return a_.m256i[imm8 & 1]; -} -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) - #define simde_mm512_extracti32x8_epi32(a, imm8) _mm512_extracti32x8_epi32(a, imm8) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_extracti32x8_epi32 - #define _mm512_extracti32x8_epi32(a, imm8) simde_mm512_extracti32x8_epi32((a), (imm8)) -#endif - -#if defined(SIMDE_X86_AVX51FDQ_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) - #define simde_mm512_mask_extracti32x8_epi32(src, k, a, imm8) _mm512_mask_extracti32x8_epi32(src, k, a, imm8) -#else - #define simde_mm512_mask_extracti32x8_epi32(src, k, a, imm8) simde_mm256_mask_mov_epi32((src), (k), simde_mm512_extracti32x8_epi32((a), (imm8))) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_extracti32x8_epi32 - #define _mm512_mask_extracti32x8_epi32(src, k, a, imm8) simde_mm512_mask_extracti32x8_epi32((src), (k), (a), (imm8)) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) - #define simde_mm512_maskz_extracti32x8_epi32(k, a, imm8) _mm512_maskz_extracti32x8_epi32(k, a, imm8) -#else - #define simde_mm512_maskz_extracti32x8_epi32(k, a, imm8) simde_mm256_maskz_mov_epi32((k), simde_mm512_extracti32x8_epi32((a), (imm8))) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_extracti32x8_epi32 - #define _mm512_maskz_extracti32x8_epi32(k, a, imm8) simde_mm512_maskz_extracti32x8_epi32((k), (a), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm512_extracti64x4_epi64 (simde__m512i a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m512i_private a_ = simde__m512i_to_private(a); - - return a_.m256i[imm8 & 1]; -} -#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) - #define simde_mm512_extracti64x4_epi64(a, imm8) _mm512_extracti64x4_epi64(a, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_extracti64x4_epi64 - #define _mm512_extracti64x4_epi64(a, imm8) simde_mm512_extracti64x4_epi64((a), (imm8)) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) - #define simde_mm512_mask_extracti64x4_epi64(src, k, a, imm8) _mm512_mask_extracti64x4_epi64(src, k, a, imm8) -#else - #define simde_mm512_mask_extracti64x4_epi64(src, k, a, imm8) simde_mm256_mask_mov_epi64((src), 
(k), simde_mm512_extracti64x4_epi64((a), (imm8))) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_extracti64x4_epi64 - #define _mm512_mask_extracti64x4_epi64(src, k, a, imm8) simde_mm512_mask_extracti64x4_epi64((src), (k), (a), (imm8)) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && !defined(SIMDE_BUG_CLANG_REV_299346) - #define simde_mm512_maskz_extracti64x4_epi64(k, a, imm8) _mm512_maskz_extracti64x4_epi64(k, a, imm8) -#else - #define simde_mm512_maskz_extracti64x4_epi64(k, a, imm8) simde_mm256_maskz_mov_epi64((k), simde_mm512_extracti64x4_epi64((a), (imm8))) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_extracti64x4_epi64 - #define _mm512_maskz_extracti64x4_epi64(k, a, imm8) simde_mm512_maskz_extracti64x4_epi64((k), (a), (imm8)) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_EXTRACT_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/fixupimm.h b/ffi-deps/simde/simde/x86/avx512/fixupimm.h deleted file mode 100644 index 2ea234b..0000000 --- a/ffi-deps/simde/simde/x86/avx512/fixupimm.h +++ /dev/null @@ -1,900 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_FIXUPIMM_H) -#define SIMDE_X86_AVX512_FIXUPIMM_H - -#include "types.h" -#include "flushsubnormal.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fixupimm_ps (simde__m128 a, simde__m128 b, simde__m128i c, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - HEDLEY_STATIC_CAST(void, imm8); - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b), - s_ = simde__m128_to_private(simde_x_mm_flushsubnormal_ps(b)); - simde__m128i_private c_ = simde__m128i_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - int32_t select = 1; - switch (simde_math_fpclassifyf(s_.f32[i])) { - case SIMDE_MATH_FP_NORMAL: - select = (s_.f32[i] < SIMDE_FLOAT32_C(0.0)) ? 6 : (s_.f32[i] == SIMDE_FLOAT32_C(1.0)) ? 3 : 7; - break; - case SIMDE_MATH_FP_ZERO: - select = 2; - break; - case SIMDE_MATH_FP_NAN: - select = 0; - break; - case SIMDE_MATH_FP_INFINITE: - select = ((s_.f32[i] > SIMDE_FLOAT32_C(0.0)) ? 5 : 4); - break; - } - - switch (((c_.i32[i] >> (select << 2)) & 15)) { - case 0: - r_.f32[i] = a_.f32[i]; - break; - case 1: - r_.f32[i] = b_.f32[i]; - break; - case 2: - r_.f32[i] = SIMDE_MATH_NANF; - break; - case 3: - r_.f32[i] = -SIMDE_MATH_NANF; - break; - case 4: - r_.f32[i] = -SIMDE_MATH_INFINITYF; - break; - case 5: - r_.f32[i] = SIMDE_MATH_INFINITYF; - break; - case 6: - r_.f32[i] = s_.f32[i] < SIMDE_FLOAT32_C(0.0) ? 
-SIMDE_MATH_INFINITYF : SIMDE_MATH_INFINITYF; - break; - case 7: - r_.f32[i] = SIMDE_FLOAT32_C(-0.0); - break; - case 8: - r_.f32[i] = SIMDE_FLOAT32_C(0.0); - break; - case 9: - r_.f32[i] = SIMDE_FLOAT32_C(-1.0); - break; - case 10: - r_.f32[i] = SIMDE_FLOAT32_C(1.0); - break; - case 11: - r_.f32[i] = SIMDE_FLOAT32_C(0.5); - break; - case 12: - r_.f32[i] = SIMDE_FLOAT32_C(90.0); - break; - case 13: - r_.f32[i] = SIMDE_MATH_PIF / 2; - break; - case 14: - r_.f32[i] = SIMDE_MATH_FLT_MAX; - break; - case 15: - r_.f32[i] = -SIMDE_MATH_FLT_MAX; - break; - } - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_fixupimm_ps(a, b, c, imm8) _mm_fixupimm_ps(a, b, c, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_fixupimm_ps - #define _mm_fixupimm_ps(a, b, c, imm8) simde_mm_fixupimm_ps(a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_mask_fixupimm_ps(a, k, b, c, imm8) _mm_mask_fixupimm_ps(a, k, b, c, imm8) -#else - #define simde_mm_mask_fixupimm_ps(a, k, b, c, imm8) simde_mm_mask_mov_ps(a, k, simde_mm_fixupimm_ps(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_fixupimm_ps - #define _mm_mask_fixupimm_ps(a, k, b, c, imm8) simde_mm_mask_fixupimm_ps(a, k, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_maskz_fixupimm_ps(k, a, b, c, imm8) _mm_maskz_fixupimm_ps(k, a, b, c, imm8) -#else - #define simde_mm_maskz_fixupimm_ps(k, a, b, c, imm8) simde_mm_maskz_mov_ps(k, simde_mm_fixupimm_ps(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_fixupimm_ps - #define _mm_maskz_fixupimm_ps(k, a, b, c, imm8) simde_mm_maskz_fixupimm_ps(k, a, b, c, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_fixupimm_ps (simde__m256 a, simde__m256 b, simde__m256i c, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - HEDLEY_STATIC_CAST(void, imm8); - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b), - s_ = simde__m256_to_private(simde_x_mm256_flushsubnormal_ps(b)); - simde__m256i_private c_ = simde__m256i_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - int32_t select = 1; - switch (simde_math_fpclassifyf(s_.f32[i])) { - case SIMDE_MATH_FP_NORMAL: - select = (s_.f32[i] < SIMDE_FLOAT32_C(0.0)) ? 6 : (s_.f32[i] == SIMDE_FLOAT32_C(1.0)) ? 3 : 7; - break; - case SIMDE_MATH_FP_ZERO: - select = 2; - break; - case SIMDE_MATH_FP_NAN: - select = 0; - break; - case SIMDE_MATH_FP_INFINITE: - select = ((s_.f32[i] > SIMDE_FLOAT32_C(0.0)) ? 5 : 4); - break; - } - - switch (((c_.i32[i] >> (select << 2)) & 15)) { - case 0: - r_.f32[i] = a_.f32[i]; - break; - case 1: - r_.f32[i] = b_.f32[i]; - break; - case 2: - r_.f32[i] = SIMDE_MATH_NANF; - break; - case 3: - r_.f32[i] = -SIMDE_MATH_NANF; - break; - case 4: - r_.f32[i] = -SIMDE_MATH_INFINITYF; - break; - case 5: - r_.f32[i] = SIMDE_MATH_INFINITYF; - break; - case 6: - r_.f32[i] = s_.f32[i] < SIMDE_FLOAT32_C(0.0) ? 
-SIMDE_MATH_INFINITYF : SIMDE_MATH_INFINITYF; - break; - case 7: - r_.f32[i] = SIMDE_FLOAT32_C(-0.0); - break; - case 8: - r_.f32[i] = SIMDE_FLOAT32_C(0.0); - break; - case 9: - r_.f32[i] = SIMDE_FLOAT32_C(-1.0); - break; - case 10: - r_.f32[i] = SIMDE_FLOAT32_C(1.0); - break; - case 11: - r_.f32[i] = SIMDE_FLOAT32_C(0.5); - break; - case 12: - r_.f32[i] = SIMDE_FLOAT32_C(90.0); - break; - case 13: - r_.f32[i] = SIMDE_MATH_PIF / 2; - break; - case 14: - r_.f32[i] = SIMDE_MATH_FLT_MAX; - break; - case 15: - r_.f32[i] = -SIMDE_MATH_FLT_MAX; - break; - } - } - - return simde__m256_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_fixupimm_ps(a, b, c, imm8) _mm256_fixupimm_ps(a, b, c, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_fixupimm_ps - #define _mm256_fixupimm_ps(a, b, c, imm8) simde_mm256_fixupimm_ps(a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_mask_fixupimm_ps(a, k, b, c, imm8) _mm256_mask_fixupimm_ps(a, k, b, c, imm8) -#else - #define simde_mm256_mask_fixupimm_ps(a, k, b, c, imm8) simde_mm256_mask_mov_ps(a, k, simde_mm256_fixupimm_ps(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_fixupimm_ps - #define _mm256_mask_fixupimm_ps(a, k, b, c, imm8) simde_mm256_mask_fixupimm_ps(a, k, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_maskz_fixupimm_ps(k, a, b, c, imm8) _mm256_maskz_fixupimm_ps(k, a, b, c, imm8) -#else - #define simde_mm256_maskz_fixupimm_ps(k, a, b, c, imm8) simde_mm256_maskz_mov_ps(k, simde_mm256_fixupimm_ps(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_fixupimm_ps - #define _mm256_maskz_fixupimm_ps(k, a, b, c, imm8) simde_mm256_maskz_fixupimm_ps(k, a, b, c, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_fixupimm_ps (simde__m512 a, simde__m512 b, simde__m512i c, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - HEDLEY_STATIC_CAST(void, imm8); - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b), - s_ = simde__m512_to_private(simde_x_mm512_flushsubnormal_ps(b)); - simde__m512i_private c_ = simde__m512i_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - int32_t select = 1; - switch (simde_math_fpclassifyf(s_.f32[i])) { - case SIMDE_MATH_FP_NORMAL: - select = (s_.f32[i] < SIMDE_FLOAT32_C(0.0)) ? 6 : (s_.f32[i] == SIMDE_FLOAT32_C(1.0)) ? 3 : 7; - break; - case SIMDE_MATH_FP_ZERO: - select = 2; - break; - case SIMDE_MATH_FP_NAN: - select = 0; - break; - case SIMDE_MATH_FP_INFINITE: - select = ((s_.f32[i] > SIMDE_FLOAT32_C(0.0)) ? 5 : 4); - break; - } - - switch (((c_.i32[i] >> (select << 2)) & 15)) { - case 0: - r_.f32[i] = a_.f32[i]; - break; - case 1: - r_.f32[i] = b_.f32[i]; - break; - case 2: - r_.f32[i] = SIMDE_MATH_NANF; - break; - case 3: - r_.f32[i] = -SIMDE_MATH_NANF; - break; - case 4: - r_.f32[i] = -SIMDE_MATH_INFINITYF; - break; - case 5: - r_.f32[i] = SIMDE_MATH_INFINITYF; - break; - case 6: - r_.f32[i] = s_.f32[i] < SIMDE_FLOAT32_C(0.0) ? 
-SIMDE_MATH_INFINITYF : SIMDE_MATH_INFINITYF; - break; - case 7: - r_.f32[i] = SIMDE_FLOAT32_C(-0.0); - break; - case 8: - r_.f32[i] = SIMDE_FLOAT32_C(0.0); - break; - case 9: - r_.f32[i] = SIMDE_FLOAT32_C(-1.0); - break; - case 10: - r_.f32[i] = SIMDE_FLOAT32_C(1.0); - break; - case 11: - r_.f32[i] = SIMDE_FLOAT32_C(0.5); - break; - case 12: - r_.f32[i] = SIMDE_FLOAT32_C(90.0); - break; - case 13: - r_.f32[i] = SIMDE_MATH_PIF / 2; - break; - case 14: - r_.f32[i] = SIMDE_MATH_FLT_MAX; - break; - case 15: - r_.f32[i] = -SIMDE_MATH_FLT_MAX; - break; - } - } - - return simde__m512_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_fixupimm_ps(a, b, c, imm8) _mm512_fixupimm_ps(a, b, c, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_fixupimm_ps - #define _mm512_fixupimm_ps(a, b, c, imm8) simde_mm512_fixupimm_ps(a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_fixupimm_ps(a, k, b, c, imm8) _mm512_mask_fixupimm_ps(a, k, b, c, imm8) -#else - #define simde_mm512_mask_fixupimm_ps(a, k, b, c, imm8) simde_mm512_mask_mov_ps(a, k, simde_mm512_fixupimm_ps(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_fixupimm_ps - #define _mm512_mask_fixupimm_ps(a, k, b, c, imm8) simde_mm512_mask_fixupimm_ps(a, k, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_maskz_fixupimm_ps(k, a, b, c, imm8) _mm512_maskz_fixupimm_ps(k, a, b, c, imm8) -#else - #define simde_mm512_maskz_fixupimm_ps(k, a, b, c, imm8) simde_mm512_maskz_mov_ps(k, simde_mm512_fixupimm_ps(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_fixupimm_ps - #define _mm512_maskz_fixupimm_ps(k, a, b, c, imm8) simde_mm512_maskz_fixupimm_ps(k, a, b, c, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fixupimm_ss (simde__m128 a, simde__m128 b, simde__m128i c, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - HEDLEY_STATIC_CAST(void, imm8); - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b), - s_ = simde__m128_to_private(simde_x_mm_flushsubnormal_ps(b)); - simde__m128i_private c_ = simde__m128i_to_private(c); - - int32_t select = 1; - switch (simde_math_fpclassifyf(s_.f32[0])) { - case SIMDE_MATH_FP_NORMAL: - select = (s_.f32[0] < SIMDE_FLOAT32_C(0.0)) ? 6 : (s_.f32[0] == SIMDE_FLOAT32_C(1.0)) ? 3 : 7; - break; - case SIMDE_MATH_FP_ZERO: - select = 2; - break; - case SIMDE_MATH_FP_NAN: - select = 0; - break; - case SIMDE_MATH_FP_INFINITE: - select = ((s_.f32[0] > SIMDE_FLOAT32_C(0.0)) ? 5 : 4); - break; - } - - switch (((c_.i32[0] >> (select << 2)) & 15)) { - case 0: - b_.f32[0] = a_.f32[0]; - break; - case 2: - b_.f32[0] = SIMDE_MATH_NANF; - break; - case 3: - b_.f32[0] = -SIMDE_MATH_NANF; - break; - case 4: - b_.f32[0] = -SIMDE_MATH_INFINITYF; - break; - case 5: - b_.f32[0] = SIMDE_MATH_INFINITYF; - break; - case 6: - b_.f32[0] = s_.f32[0] < SIMDE_FLOAT32_C(0.0) ? 
-SIMDE_MATH_INFINITYF : SIMDE_MATH_INFINITYF; - break; - case 7: - b_.f32[0] = SIMDE_FLOAT32_C(-0.0); - break; - case 8: - b_.f32[0] = SIMDE_FLOAT32_C(0.0); - break; - case 9: - b_.f32[0] = SIMDE_FLOAT32_C(-1.0); - break; - case 10: - b_.f32[0] = SIMDE_FLOAT32_C(1.0); - break; - case 11: - b_.f32[0] = SIMDE_FLOAT32_C(0.5); - break; - case 12: - b_.f32[0] = SIMDE_FLOAT32_C(90.0); - break; - case 13: - b_.f32[0] = SIMDE_MATH_PIF / 2; - break; - case 14: - b_.f32[0] = SIMDE_MATH_FLT_MAX; - break; - case 15: - b_.f32[0] = -SIMDE_MATH_FLT_MAX; - break; - } - - return simde__m128_from_private(b_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_fixupimm_ss(a, b, c, imm8) _mm_fixupimm_ss(a, b, c, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_fixupimm_ss - #define _mm_fixupimm_ss(a, b, c, imm8) simde_mm_fixupimm_ss(a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_mask_fixupimm_ss(a, k, b, c, imm8) _mm_mask_fixupimm_ss(a, k, b, c, imm8) -#else - #define simde_mm_mask_fixupimm_ss(a, k, b, c, imm8) simde_mm_mask_mov_ps(a, ((k) | 14), simde_mm_fixupimm_ss(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_fixupimm_ss - #define _mm_mask_fixupimm_ss(a, k, b, c, imm8) simde_mm_mask_fixupimm_ss(a, k, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_maskz_fixupimm_ss(k, a, b, c, imm8) _mm_maskz_fixupimm_ss(k, a, b, c, imm8) -#else - #define simde_mm_maskz_fixupimm_ss(k, a, b, c, imm8) simde_mm_maskz_mov_ps(((k) | 14), simde_mm_fixupimm_ss(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_fixupimm_ss - #define _mm_maskz_fixupimm_ss(k, a, b, c, imm8) simde_mm_maskz_fixupimm_ss(k, a, b, c, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fixupimm_pd (simde__m128d a, simde__m128d b, simde__m128i c, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - HEDLEY_STATIC_CAST(void, imm8); - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b), - s_ = simde__m128d_to_private(simde_x_mm_flushsubnormal_pd(b)); - simde__m128i_private c_ = simde__m128i_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - int32_t select = 1; - switch (simde_math_fpclassify(s_.f64[i])) { - case SIMDE_MATH_FP_NORMAL: - select = (s_.f64[i] < SIMDE_FLOAT64_C(0.0)) ? 6 : (s_.f64[i] == SIMDE_FLOAT64_C(1.0)) ? 3 : 7; - break; - case SIMDE_MATH_FP_ZERO: - select = 2; - break; - case SIMDE_MATH_FP_NAN: - select = 0; - break; - case SIMDE_MATH_FP_INFINITE: - select = ((s_.f64[i] > SIMDE_FLOAT64_C(0.0)) ? 5 : 4); - break; - } - - switch (((c_.i64[i] >> (select << 2)) & 15)) { - case 0: - r_.f64[i] = a_.f64[i]; - break; - case 1: - r_.f64[i] = b_.f64[i]; - break; - case 2: - r_.f64[i] = SIMDE_MATH_NAN; - break; - case 3: - r_.f64[i] = -SIMDE_MATH_NAN; - break; - case 4: - r_.f64[i] = -SIMDE_MATH_INFINITY; - break; - case 5: - r_.f64[i] = SIMDE_MATH_INFINITY; - break; - case 6: - r_.f64[i] = s_.f64[i] < SIMDE_FLOAT64_C(0.0) ? 
-SIMDE_MATH_INFINITY : SIMDE_MATH_INFINITY; - break; - case 7: - r_.f64[i] = SIMDE_FLOAT64_C(-0.0); - break; - case 8: - r_.f64[i] = SIMDE_FLOAT64_C(0.0); - break; - case 9: - r_.f64[i] = SIMDE_FLOAT64_C(-1.0); - break; - case 10: - r_.f64[i] = SIMDE_FLOAT64_C(1.0); - break; - case 11: - r_.f64[i] = SIMDE_FLOAT64_C(0.5); - break; - case 12: - r_.f64[i] = SIMDE_FLOAT64_C(90.0); - break; - case 13: - r_.f64[i] = SIMDE_MATH_PI / 2; - break; - case 14: - r_.f64[i] = SIMDE_MATH_DBL_MAX; - break; - case 15: - r_.f64[i] = -SIMDE_MATH_DBL_MAX; - break; - } - } - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_fixupimm_pd(a, b, c, imm8) _mm_fixupimm_pd(a, b, c, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_fixupimm_pd - #define _mm_fixupimm_pd(a, b, c, imm8) simde_mm_fixupimm_pd(a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_mask_fixupimm_pd(a, k, b, c, imm8) _mm_mask_fixupimm_pd(a, k, b, c, imm8) -#else - #define simde_mm_mask_fixupimm_pd(a, k, b, c, imm8) simde_mm_mask_mov_pd(a, k, simde_mm_fixupimm_pd(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_fixupimm_pd - #define _mm_mask_fixupimm_pd(a, k, b, c, imm8) simde_mm_mask_fixupimm_pd(a, k, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_maskz_fixupimm_pd(k, a, b, c, imm8) _mm_maskz_fixupimm_pd(k, a, b, c, imm8) -#else - #define simde_mm_maskz_fixupimm_pd(k, a, b, c, imm8) simde_mm_maskz_mov_pd(k, simde_mm_fixupimm_pd(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_fixupimm_pd - #define _mm_maskz_fixupimm_pd(k, a, b, c, imm8) simde_mm_maskz_fixupimm_pd(k, a, b, c, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_fixupimm_pd (simde__m256d a, simde__m256d b, simde__m256i c, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - HEDLEY_STATIC_CAST(void, imm8); - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b), - s_ = simde__m256d_to_private(simde_x_mm256_flushsubnormal_pd(b)); - simde__m256i_private c_ = simde__m256i_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - int32_t select = 1; - switch (simde_math_fpclassify(s_.f64[i])) { - case SIMDE_MATH_FP_NORMAL: - select = (s_.f64[i] < SIMDE_FLOAT64_C(0.0)) ? 6 : (s_.f64[i] == SIMDE_FLOAT64_C(1.0)) ? 3 : 7; - break; - case SIMDE_MATH_FP_ZERO: - select = 2; - break; - case SIMDE_MATH_FP_NAN: - select = 0; - break; - case SIMDE_MATH_FP_INFINITE: - select = ((s_.f64[i] > SIMDE_FLOAT64_C(0.0)) ? 5 : 4); - break; - } - - switch (((c_.i64[i] >> (select << 2)) & 15)) { - case 0: - r_.f64[i] = a_.f64[i]; - break; - case 1: - r_.f64[i] = b_.f64[i]; - break; - case 2: - r_.f64[i] = SIMDE_MATH_NAN; - break; - case 3: - r_.f64[i] = -SIMDE_MATH_NAN; - break; - case 4: - r_.f64[i] = -SIMDE_MATH_INFINITY; - break; - case 5: - r_.f64[i] = SIMDE_MATH_INFINITY; - break; - case 6: - r_.f64[i] = s_.f64[i] < SIMDE_FLOAT64_C(0.0) ? 
-SIMDE_MATH_INFINITY : SIMDE_MATH_INFINITY; - break; - case 7: - r_.f64[i] = SIMDE_FLOAT64_C(-0.0); - break; - case 8: - r_.f64[i] = SIMDE_FLOAT64_C(0.0); - break; - case 9: - r_.f64[i] = SIMDE_FLOAT64_C(-1.0); - break; - case 10: - r_.f64[i] = SIMDE_FLOAT64_C(1.0); - break; - case 11: - r_.f64[i] = SIMDE_FLOAT64_C(0.5); - break; - case 12: - r_.f64[i] = SIMDE_FLOAT64_C(90.0); - break; - case 13: - r_.f64[i] = SIMDE_MATH_PI / 2; - break; - case 14: - r_.f64[i] = SIMDE_MATH_DBL_MAX; - break; - case 15: - r_.f64[i] = -SIMDE_MATH_DBL_MAX; - break; - } - } - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_fixupimm_pd(a, b, c, imm8) _mm256_fixupimm_pd(a, b, c, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_fixupimm_pd - #define _mm256_fixupimm_pd(a, b, c, imm8) simde_mm256_fixupimm_pd(a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_mask_fixupimm_pd(a, k, b, c, imm8) _mm256_mask_fixupimm_pd(a, k, b, c, imm8) -#else - #define simde_mm256_mask_fixupimm_pd(a, k, b, c, imm8) simde_mm256_mask_mov_pd(a, k, simde_mm256_fixupimm_pd(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_fixupimm_pd - #define _mm256_mask_fixupimm_pd(a, k, b, c, imm8) simde_mm256_mask_fixupimm_pd(a, k, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_maskz_fixupimm_pd(k, a, b, c, imm8) _mm256_maskz_fixupimm_pd(k, a, b, c, imm8) -#else - #define simde_mm256_maskz_fixupimm_pd(k, a, b, c, imm8) simde_mm256_maskz_mov_pd(k, simde_mm256_fixupimm_pd(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_fixupimm_pd - #define _mm256_maskz_fixupimm_pd(k, a, b, c, imm8) simde_mm256_maskz_fixupimm_pd(k, a, b, c, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_fixupimm_pd (simde__m512d a, simde__m512d b, simde__m512i c, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - HEDLEY_STATIC_CAST(void, imm8); - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b), - s_ = simde__m512d_to_private(simde_x_mm512_flushsubnormal_pd(b)); - simde__m512i_private c_ = simde__m512i_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - int32_t select = 1; - switch (simde_math_fpclassify(s_.f64[i])) { - case SIMDE_MATH_FP_NORMAL: - select = (s_.f64[i] < SIMDE_FLOAT64_C(0.0)) ? 6 : (s_.f64[i] == SIMDE_FLOAT64_C(1.0)) ? 3 : 7; - break; - case SIMDE_MATH_FP_ZERO: - select = 2; - break; - case SIMDE_MATH_FP_NAN: - select = 0; - break; - case SIMDE_MATH_FP_INFINITE: - select = ((s_.f64[i] > SIMDE_FLOAT64_C(0.0)) ? 5 : 4); - break; - } - - switch (((c_.i64[i] >> (select << 2)) & 15)) { - case 0: - r_.f64[i] = a_.f64[i]; - break; - case 1: - r_.f64[i] = b_.f64[i]; - break; - case 2: - r_.f64[i] = SIMDE_MATH_NAN; - break; - case 3: - r_.f64[i] = -SIMDE_MATH_NAN; - break; - case 4: - r_.f64[i] = -SIMDE_MATH_INFINITY; - break; - case 5: - r_.f64[i] = SIMDE_MATH_INFINITY; - break; - case 6: - r_.f64[i] = s_.f64[i] < SIMDE_FLOAT64_C(0.0) ? 
-SIMDE_MATH_INFINITY : SIMDE_MATH_INFINITY; - break; - case 7: - r_.f64[i] = SIMDE_FLOAT64_C(-0.0); - break; - case 8: - r_.f64[i] = SIMDE_FLOAT64_C(0.0); - break; - case 9: - r_.f64[i] = SIMDE_FLOAT64_C(-1.0); - break; - case 10: - r_.f64[i] = SIMDE_FLOAT64_C(1.0); - break; - case 11: - r_.f64[i] = SIMDE_FLOAT64_C(0.5); - break; - case 12: - r_.f64[i] = SIMDE_FLOAT64_C(90.0); - break; - case 13: - r_.f64[i] = SIMDE_MATH_PI / 2; - break; - case 14: - r_.f64[i] = SIMDE_MATH_DBL_MAX; - break; - case 15: - r_.f64[i] = -SIMDE_MATH_DBL_MAX; - break; - } - } - - return simde__m512d_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_fixupimm_pd(a, b, c, imm8) _mm512_fixupimm_pd(a, b, c, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_fixupimm_pd - #define _mm512_fixupimm_pd(a, b, c, imm8) simde_mm512_fixupimm_pd(a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_fixupimm_pd(a, k, b, c, imm8) _mm512_mask_fixupimm_pd(a, k, b, c, imm8) -#else - #define simde_mm512_mask_fixupimm_pd(a, k, b, c, imm8) simde_mm512_mask_mov_pd(a, k, simde_mm512_fixupimm_pd(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_fixupimm_pd - #define _mm512_mask_fixupimm_pd(a, k, b, c, imm8) simde_mm512_mask_fixupimm_pd(a, k, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_maskz_fixupimm_pd(k, a, b, c, imm8) _mm512_maskz_fixupimm_pd(k, a, b, c, imm8) -#else - #define simde_mm512_maskz_fixupimm_pd(k, a, b, c, imm8) simde_mm512_maskz_mov_pd(k, simde_mm512_fixupimm_pd(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_fixupimm_pd - #define _mm512_maskz_fixupimm_pd(k, a, b, c, imm8) simde_mm512_maskz_fixupimm_pd(k, a, b, c, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fixupimm_sd (simde__m128d a, simde__m128d b, simde__m128i c, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - HEDLEY_STATIC_CAST(void, imm8); - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b), - s_ = simde__m128d_to_private(simde_x_mm_flushsubnormal_pd(b)); - simde__m128i_private c_ = simde__m128i_to_private(c); - - int32_t select = 1; - switch (simde_math_fpclassify(s_.f64[0])) { - case SIMDE_MATH_FP_NORMAL: - select = (s_.f64[0] < SIMDE_FLOAT64_C(0.0)) ? 6 : (s_.f64[0] == SIMDE_FLOAT64_C(1.0)) ? 3 : 7; - break; - case SIMDE_MATH_FP_ZERO: - select = 2; - break; - case SIMDE_MATH_FP_NAN: - select = 0; - break; - case SIMDE_MATH_FP_INFINITE: - select = ((s_.f64[0] > SIMDE_FLOAT64_C(0.0)) ? 5 : 4); - break; - } - - switch (((c_.i64[0] >> (select << 2)) & 15)) { - case 0: - b_.f64[0] = a_.f64[0]; - break; - case 1: - b_.f64[0] = b_.f64[0]; - break; - case 2: - b_.f64[0] = SIMDE_MATH_NAN; - break; - case 3: - b_.f64[0] = -SIMDE_MATH_NAN; - break; - case 4: - b_.f64[0] = -SIMDE_MATH_INFINITY; - break; - case 5: - b_.f64[0] = SIMDE_MATH_INFINITY; - break; - case 6: - b_.f64[0] = s_.f64[0] < SIMDE_FLOAT64_C(0.0) ? 
-SIMDE_MATH_INFINITY : SIMDE_MATH_INFINITY; - break; - case 7: - b_.f64[0] = SIMDE_FLOAT64_C(-0.0); - break; - case 8: - b_.f64[0] = SIMDE_FLOAT64_C(0.0); - break; - case 9: - b_.f64[0] = SIMDE_FLOAT64_C(-1.0); - break; - case 10: - b_.f64[0] = SIMDE_FLOAT64_C(1.0); - break; - case 11: - b_.f64[0] = SIMDE_FLOAT64_C(0.5); - break; - case 12: - b_.f64[0] = SIMDE_FLOAT64_C(90.0); - break; - case 13: - b_.f64[0] = SIMDE_MATH_PI / 2; - break; - case 14: - b_.f64[0] = SIMDE_MATH_DBL_MAX; - break; - case 15: - b_.f64[0] = -SIMDE_MATH_DBL_MAX; - break; - } - - return simde__m128d_from_private(b_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_fixupimm_sd(a, b, c, imm8) _mm_fixupimm_sd(a, b, c, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_fixupimm_sd - #define _mm_fixupimm_sd(a, b, c, imm8) simde_mm_fixupimm_sd(a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_mask_fixupimm_sd(a, k, b, c, imm8) _mm_mask_fixupimm_sd(a, k, b, c, imm8) -#else - #define simde_mm_mask_fixupimm_sd(a, k, b, c, imm8) simde_mm_mask_mov_pd(a, ((k) | 2), simde_mm_fixupimm_sd(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_fixupimm_sd - #define _mm_mask_fixupimm_sd(a, k, b, c, imm8) simde_mm_mask_fixupimm_sd(a, k, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_maskz_fixupimm_sd(k, a, b, c, imm8) _mm_maskz_fixupimm_sd(k, a, b, c, imm8) -#else - #define simde_mm_maskz_fixupimm_sd(k, a, b, c, imm8) simde_mm_maskz_mov_pd(((k) | 2), simde_mm_fixupimm_sd(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_fixupimm_sd - #define _mm_maskz_fixupimm_sd(k, a, b, c, imm8) simde_mm_maskz_fixupimm_sd(k, a, b, c, imm8) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_FIXUPIMM_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/fixupimm_round.h b/ffi-deps/simde/simde/x86/avx512/fixupimm_round.h deleted file mode 100644 index 636b82a..0000000 --- a/ffi-deps/simde/simde/x86/avx512/fixupimm_round.h +++ /dev/null @@ -1,687 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_FIXUPIMM_ROUND_H) -#define SIMDE_X86_AVX512_FIXUPIMM_ROUND_H - -#include "types.h" -#include "fixupimm.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_fixupimm_round_ps(a, b, c, imm8, sae) _mm512_fixupimm_round_ps(a, b, c, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm512_fixupimm_round_ps(a, b, c, imm8, sae) simde_mm512_fixupimm_ps(a, b, c, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm512_fixupimm_round_ps(a, b, c, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512 simde_mm512_fixupimm_round_ps_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm512_fixupimm_round_ps_envp; \ - int simde_mm512_fixupimm_round_ps_x = feholdexcept(&simde_mm512_fixupimm_round_ps_envp); \ - simde_mm512_fixupimm_round_ps_r = simde_mm512_fixupimm_ps(a, b, c, imm8); \ - if (HEDLEY_LIKELY(simde_mm512_fixupimm_round_ps_x == 0)) \ - fesetenv(&simde_mm512_fixupimm_round_ps_envp); \ - } \ - else { \ - simde_mm512_fixupimm_round_ps_r = simde_mm512_fixupimm_ps(a, b, c, imm8); \ - } \ - \ - simde_mm512_fixupimm_round_ps_r; \ - })) - #else - #define simde_mm512_fixupimm_round_ps(a, b, c, imm8, sae) simde_mm512_fixupimm_ps(a, b, c, imm8) - #endif -#else - 
SIMDE_FUNCTION_ATTRIBUTES - simde__m512 - simde_mm512_fixupimm_round_ps (simde__m512 a, simde__m512 b, simde__m512i c, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m512 r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm512_fixupimm_ps(a, b, c, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm512_fixupimm_ps(a, b, c, imm8); - #endif - } - else { - r = simde_mm512_fixupimm_ps(a, b, c, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_fixupimm_round_ps - #define _mm512_fixupimm_round_ps(a, b, c, imm8, sae) simde_mm512_fixupimm_round_ps(a, b, c, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_fixupimm_round_ps(a, k, b, c, imm8, sae) _mm512_mask_fixupimm_round_ps(a, k, b, c, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm512_mask_fixupimm_round_ps(a, k, b, c, imm8, sae) simde_mm512_mask_fixupimm_ps(a, k, b, c, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm512_mask_fixupimm_round_ps(a, k, b, c, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512 simde_mm512_mask_fixupimm_round_ps_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm512_mask_fixupimm_round_ps_envp; \ - int simde_mm512_mask_fixupimm_round_ps_x = feholdexcept(&simde_mm512_mask_fixupimm_round_ps_envp); \ - simde_mm512_mask_fixupimm_round_ps_r = simde_mm512_mask_fixupimm_ps(a, k, b, c, imm8); \ - if (HEDLEY_LIKELY(simde_mm512_mask_fixupimm_round_ps_x == 0)) \ - fesetenv(&simde_mm512_mask_fixupimm_round_ps_envp); \ - } \ - else { \ - simde_mm512_mask_fixupimm_round_ps_r = simde_mm512_mask_fixupimm_ps(a, k, b, c, imm8); \ - } \ - \ - simde_mm512_mask_fixupimm_round_ps_r; \ - })) - #else - #define simde_mm512_mask_fixupimm_round_ps(a, k, b, c, imm8, sae) simde_mm512_mask_fixupimm_ps(a, k, b, c, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512 - simde_mm512_mask_fixupimm_round_ps (simde__m512 a, simde__mmask16 k, simde__m512 b, simde__m512i c, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m512 r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm512_mask_fixupimm_ps(a, k, b, c, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm512_mask_fixupimm_ps(a, k, b, c, imm8); - #endif - } - else { - r = simde_mm512_mask_fixupimm_ps(a, k, b, c, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_fixupimm_round_ps - #define _mm512_mask_fixupimm_round_ps(a, k, b, c, imm8, sae) simde_mm512_mask_fixupimm_round_ps(a, k, b, c, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_maskz_fixupimm_round_ps(k, a, b, c, imm8, sae) _mm512_maskz_fixupimm_round_ps(k, a, b, c, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm512_maskz_fixupimm_round_ps(k, a, b, c, imm8, sae) simde_mm512_maskz_fixupimm_ps(k, a, b, c, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm512_maskz_fixupimm_round_ps(k, a, b, c, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512 simde_mm512_maskz_fixupimm_round_ps_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm512_maskz_fixupimm_round_ps_envp; 
\ - int simde_mm512_maskz_fixupimm_round_ps_x = feholdexcept(&simde_mm512_maskz_fixupimm_round_ps_envp); \ - simde_mm512_maskz_fixupimm_round_ps_r = simde_mm512_maskz_fixupimm_ps(k, a, b, c, imm8); \ - if (HEDLEY_LIKELY(simde_mm512_maskz_fixupimm_round_ps_x == 0)) \ - fesetenv(&simde_mm512_maskz_fixupimm_round_ps_envp); \ - } \ - else { \ - simde_mm512_maskz_fixupimm_round_ps_r = simde_mm512_maskz_fixupimm_ps(k, a, b, c, imm8); \ - } \ - \ - simde_mm512_maskz_fixupimm_round_ps_r; \ - })) - #else - #define simde_mm512_maskz_fixupimm_round_ps(k, a, b, c, imm8, sae) simde_mm512_maskz_fixupimm_ps(k, a, b, c, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512 - simde_mm512_maskz_fixupimm_round_ps (simde__mmask16 k, simde__m512 a, simde__m512 b, simde__m512i c, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m512 r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm512_maskz_fixupimm_ps(k, a, b, c, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm512_maskz_fixupimm_ps(k, a, b, c, imm8); - #endif - } - else { - r = simde_mm512_maskz_fixupimm_ps(k, a, b, c, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_fixupimm_round_ps - #define _mm512_maskz_fixupimm_round_ps(k, a, b, c, imm8, sae) simde_mm512_maskz_fixupimm_round_ps(k, a, b, c, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_fixupimm_round_pd(a, b, c, imm8, sae) _mm512_fixupimm_round_pd(a, b, c, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm512_fixupimm_round_pd(a, b, c, imm8, sae) simde_mm512_fixupimm_pd(a, b, c, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm512_fixupimm_round_pd(a, b, c, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512d simde_mm512_fixupimm_round_pd_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm512_fixupimm_round_pd_envp; \ - int simde_mm512_fixupimm_round_pd_x = feholdexcept(&simde_mm512_fixupimm_round_pd_envp); \ - simde_mm512_fixupimm_round_pd_r = simde_mm512_fixupimm_pd(a, b, c, imm8); \ - if (HEDLEY_LIKELY(simde_mm512_fixupimm_round_pd_x == 0)) \ - fesetenv(&simde_mm512_fixupimm_round_pd_envp); \ - } \ - else { \ - simde_mm512_fixupimm_round_pd_r = simde_mm512_fixupimm_pd(a, b, c, imm8); \ - } \ - \ - simde_mm512_fixupimm_round_pd_r; \ - })) - #else - #define simde_mm512_fixupimm_round_pd(a, b, c, imm8, sae) simde_mm512_fixupimm_pd(a, b, c, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512d - simde_mm512_fixupimm_round_pd (simde__m512d a, simde__m512d b, simde__m512i c, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m512d r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm512_fixupimm_pd(a, b, c, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm512_fixupimm_pd(a, b, c, imm8); - #endif - } - else { - r = simde_mm512_fixupimm_pd(a, b, c, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_fixupimm_round_pd - #define _mm512_fixupimm_round_pd(a, b, c, imm8, sae) simde_mm512_fixupimm_round_pd(a, b, c, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_fixupimm_round_pd(a, k, b, c, imm8, 
sae) _mm512_mask_fixupimm_round_pd(a, k, b, c, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm512_mask_fixupimm_round_pd(a, k, b, c, imm8, sae) simde_mm512_mask_fixupimm_pd(a, k, b, c, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm512_mask_fixupimm_round_pd(a, k, b, c, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512d simde_mm512_mask_fixupimm_round_pd_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm512_mask_fixupimm_round_pd_envp; \ - int simde_mm512_mask_fixupimm_round_pd_x = feholdexcept(&simde_mm512_mask_fixupimm_round_pd_envp); \ - simde_mm512_mask_fixupimm_round_pd_r = simde_mm512_mask_fixupimm_pd(a, k, b, c, imm8); \ - if (HEDLEY_LIKELY(simde_mm512_mask_fixupimm_round_pd_x == 0)) \ - fesetenv(&simde_mm512_mask_fixupimm_round_pd_envp); \ - } \ - else { \ - simde_mm512_mask_fixupimm_round_pd_r = simde_mm512_mask_fixupimm_pd(a, k, b, c, imm8); \ - } \ - \ - simde_mm512_mask_fixupimm_round_pd_r; \ - })) - #else - #define simde_mm512_mask_fixupimm_round_pd(a, k, b, c, imm8, sae) simde_mm512_mask_fixupimm_pd(a, k, b, c, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512d - simde_mm512_mask_fixupimm_round_pd (simde__m512d a, simde__mmask8 k, simde__m512d b, simde__m512i c, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m512d r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm512_mask_fixupimm_pd(a, k, b, c, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm512_mask_fixupimm_pd(a, k, b, c, imm8); - #endif - } - else { - r = simde_mm512_mask_fixupimm_pd(a, k, b, c, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_fixupimm_round_pd - #define _mm512_mask_fixupimm_round_pd(a, k, b, c, imm8, sae) simde_mm512_mask_fixupimm_round_pd(a, k, b, c, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_maskz_fixupimm_round_pd(k, a, b, c, imm8, sae) _mm512_maskz_fixupimm_round_pd(k, a, b, c, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm512_maskz_fixupimm_round_pd(k, a, b, c, imm8, sae) simde_mm512_maskz_fixupimm_pd(k, a, b, c, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm512_maskz_fixupimm_round_pd(k, a, b, c, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512d simde_mm512_maskz_fixupimm_round_pd_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm512_maskz_fixupimm_round_pd_envp; \ - int simde_mm512_maskz_fixupimm_round_pd_x = feholdexcept(&simde_mm512_maskz_fixupimm_round_pd_envp); \ - simde_mm512_maskz_fixupimm_round_pd_r = simde_mm512_maskz_fixupimm_pd(k, a, b, c, imm8); \ - if (HEDLEY_LIKELY(simde_mm512_maskz_fixupimm_round_pd_x == 0)) \ - fesetenv(&simde_mm512_maskz_fixupimm_round_pd_envp); \ - } \ - else { \ - simde_mm512_maskz_fixupimm_round_pd_r = simde_mm512_maskz_fixupimm_pd(k, a, b, c, imm8); \ - } \ - \ - simde_mm512_maskz_fixupimm_round_pd_r; \ - })) - #else - #define simde_mm512_maskz_fixupimm_round_pd(k, a, b, c, imm8, sae) simde_mm512_maskz_fixupimm_pd(k, a, b, c, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512d - simde_mm512_maskz_fixupimm_round_pd (simde__mmask8 k, simde__m512d a, simde__m512d b, simde__m512i c, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m512d r; - 
- if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm512_maskz_fixupimm_pd(k, a, b, c, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm512_maskz_fixupimm_pd(k, a, b, c, imm8); - #endif - } - else { - r = simde_mm512_maskz_fixupimm_pd(k, a, b, c, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_fixupimm_round_pd - #define _mm512_maskz_fixupimm_round_pd(k, a, b, c, imm8, sae) simde_mm512_maskz_fixupimm_round_pd(k, a, b, c, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_fixupimm_round_ss(a, b, c, imm8, sae) _mm_fixupimm_round_ss(a, b, c, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_fixupimm_round_ss(a, b, c, imm8, sae) simde_mm_fixupimm_ss(a, b, c, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm_fixupimm_round_ss(a, b, c, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128 simde_mm_fixupimm_round_ss_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm_fixupimm_round_ss_envp; \ - int simde_mm_fixupimm_round_ss_x = feholdexcept(&simde_mm_fixupimm_round_ss_envp); \ - simde_mm_fixupimm_round_ss_r = simde_mm_fixupimm_ss(a, b, c, imm8); \ - if (HEDLEY_LIKELY(simde_mm_fixupimm_round_ss_x == 0)) \ - fesetenv(&simde_mm_fixupimm_round_ss_envp); \ - } \ - else { \ - simde_mm_fixupimm_round_ss_r = simde_mm_fixupimm_ss(a, b, c, imm8); \ - } \ - \ - simde_mm_fixupimm_round_ss_r; \ - })) - #else - #define simde_mm_fixupimm_round_ss(a, b, c, imm8, sae) simde_mm_fixupimm_ss(a, b, c, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128 - simde_mm_fixupimm_round_ss (simde__m128 a, simde__m128 b, simde__m128i c, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m128 r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm_fixupimm_ss(a, b, c, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm_fixupimm_ss(a, b, c, imm8); - #endif - } - else { - r = simde_mm_fixupimm_ss(a, b, c, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_fixupimm_round_ss - #define _mm_fixupimm_round_ss(a, b, c, imm8, sae) simde_mm_fixupimm_round_ss(a, b, c, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_mask_fixupimm_round_ss(a, k, b, c, imm8, sae) _mm_mask_fixupimm_round_ss(a, k, b, c, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_mask_fixupimm_round_ss(a, k, b, c, imm8, sae) simde_mm_mask_fixupimm_ss(a, k, b, c, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm_mask_fixupimm_round_ss(a, k, b, c, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128 simde_mm_mask_fixupimm_round_ss_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm_mask_fixupimm_round_ss_envp; \ - int simde_mm_mask_fixupimm_round_ss_x = feholdexcept(&simde_mm_mask_fixupimm_round_ss_envp); \ - simde_mm_mask_fixupimm_round_ss_r = simde_mm_mask_fixupimm_ss(a, k, b, c, imm8); \ - if (HEDLEY_LIKELY(simde_mm_mask_fixupimm_round_ss_x == 0)) \ - fesetenv(&simde_mm_mask_fixupimm_round_ss_envp); \ - } \ - else { \ - simde_mm_mask_fixupimm_round_ss_r = simde_mm_mask_fixupimm_ss(a, k, b, c, imm8); \ - } \ - \ - simde_mm_mask_fixupimm_round_ss_r; \ - })) - #else - 
#define simde_mm_mask_fixupimm_round_ss(a, k, b, c, imm8, sae) simde_mm_mask_fixupimm_ss(a, k, b, c, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128 - simde_mm_mask_fixupimm_round_ss (simde__m128 a, simde__mmask8 k, simde__m128 b, simde__m128i c, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m128 r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm_mask_fixupimm_ss(a, k, b, c, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm_mask_fixupimm_ss(a, k, b, c, imm8); - #endif - } - else { - r = simde_mm_mask_fixupimm_ss(a, k, b, c, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_fixupimm_round_ss - #define _mm_mask_fixupimm_round_ss(a, k, b, c, imm8, sae) simde_mm_mask_fixupimm_round_ss(a, k, b, c, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_maskz_fixupimm_round_ss(k, a, b, c, imm8, sae) _mm_maskz_fixupimm_round_ss(k, a, b, c, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_maskz_fixupimm_round_ss(k, a, b, c, imm8, sae) simde_mm_maskz_fixupimm_ss(k, a, b, c, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm_maskz_fixupimm_round_ss(k, a, b, c, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128 simde_mm_maskz_fixupimm_round_ss_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm_maskz_fixupimm_round_ss_envp; \ - int simde_mm_maskz_fixupimm_round_ss_x = feholdexcept(&simde_mm_maskz_fixupimm_round_ss_envp); \ - simde_mm_maskz_fixupimm_round_ss_r = simde_mm_maskz_fixupimm_ss(k, a, b, c, imm8); \ - if (HEDLEY_LIKELY(simde_mm_maskz_fixupimm_round_ss_x == 0)) \ - fesetenv(&simde_mm_maskz_fixupimm_round_ss_envp); \ - } \ - else { \ - simde_mm_maskz_fixupimm_round_ss_r = simde_mm_maskz_fixupimm_ss(k, a, b, c, imm8); \ - } \ - \ - simde_mm_maskz_fixupimm_round_ss_r; \ - })) - #else - #define simde_mm_maskz_fixupimm_round_ss(k, a, b, c, imm8, sae) simde_mm_maskz_fixupimm_ss(k, a, b, c, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128 - simde_mm_maskz_fixupimm_round_ss (simde__mmask8 k, simde__m128 a, simde__m128 b, simde__m128i c, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m128 r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm_maskz_fixupimm_ss(k, a, b, c, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm_maskz_fixupimm_ss(k, a, b, c, imm8); - #endif - } - else { - r = simde_mm_maskz_fixupimm_ss(k, a, b, c, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_fixupimm_round_ss - #define _mm_maskz_fixupimm_round_ss(k, a, b, c, imm8, sae) simde_mm_maskz_fixupimm_round_ss(k, a, b, c, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_fixupimm_round_sd(a, b, c, imm8, sae) _mm_fixupimm_round_sd(a, b, c, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_fixupimm_round_sd(a, b, c, imm8, sae) simde_mm_fixupimm_sd(a, b, c, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm_fixupimm_round_sd(a, b, c, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128d simde_mm_fixupimm_round_sd_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - 
fenv_t simde_mm_fixupimm_round_sd_envp; \ - int simde_mm_fixupimm_round_sd_x = feholdexcept(&simde_mm_fixupimm_round_sd_envp); \ - simde_mm_fixupimm_round_sd_r = simde_mm_fixupimm_sd(a, b, c, imm8); \ - if (HEDLEY_LIKELY(simde_mm_fixupimm_round_sd_x == 0)) \ - fesetenv(&simde_mm_fixupimm_round_sd_envp); \ - } \ - else { \ - simde_mm_fixupimm_round_sd_r = simde_mm_fixupimm_sd(a, b, c, imm8); \ - } \ - \ - simde_mm_fixupimm_round_sd_r; \ - })) - #else - #define simde_mm_fixupimm_round_sd(a, b, c, imm8, sae) simde_mm_fixupimm_sd(a, b, c, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128d - simde_mm_fixupimm_round_sd (simde__m128d a, simde__m128d b, simde__m128i c, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m128d r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm_fixupimm_sd(a, b, c, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm_fixupimm_sd(a, b, c, imm8); - #endif - } - else { - r = simde_mm_fixupimm_sd(a, b, c, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_fixupimm_round_sd - #define _mm_fixupimm_round_sd(a, b, c, imm8, sae) simde_mm_fixupimm_round_sd(a, b, c, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_mask_fixupimm_round_sd(a, k, b, c, imm8, sae) _mm_mask_fixupimm_round_sd(a, k, b, c, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_mask_fixupimm_round_sd(a, k, b, c, imm8, sae) simde_mm_mask_fixupimm_sd(a, k, b, c, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm_mask_fixupimm_round_sd(a, k, b, c, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128d simde_mm_mask_fixupimm_round_sd_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm_mask_fixupimm_round_sd_envp; \ - int simde_mm_mask_fixupimm_round_sd_x = feholdexcept(&simde_mm_mask_fixupimm_round_sd_envp); \ - simde_mm_mask_fixupimm_round_sd_r = simde_mm_mask_fixupimm_sd(a, k, b, c, imm8); \ - if (HEDLEY_LIKELY(simde_mm_mask_fixupimm_round_sd_x == 0)) \ - fesetenv(&simde_mm_mask_fixupimm_round_sd_envp); \ - } \ - else { \ - simde_mm_mask_fixupimm_round_sd_r = simde_mm_mask_fixupimm_sd(a, k, b, c, imm8); \ - } \ - \ - simde_mm_mask_fixupimm_round_sd_r; \ - })) - #else - #define simde_mm_mask_fixupimm_round_sd(a, k, b, c, imm8, sae) simde_mm_mask_fixupimm_sd(a, k, b, c, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128d - simde_mm_mask_fixupimm_round_sd (simde__m128d a, simde__mmask8 k, simde__m128d b, simde__m128i c, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m128d r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm_mask_fixupimm_sd(a, k, b, c, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm_mask_fixupimm_sd(a, k, b, c, imm8); - #endif - } - else { - r = simde_mm_mask_fixupimm_sd(a, k, b, c, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_fixupimm_round_sd - #define _mm_mask_fixupimm_round_sd(a, k, b, c, imm8, sae) simde_mm_mask_fixupimm_round_sd(a, k, b, c, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_maskz_fixupimm_round_sd(k, a, b, c, imm8, sae) _mm_maskz_fixupimm_round_sd(k, a, b, c, imm8, sae) 
-#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_maskz_fixupimm_round_sd(k, a, b, c, imm8, sae) simde_mm_maskz_fixupimm_sd(k, a, b, c, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm_maskz_fixupimm_round_sd(k, a, b, c, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128d simde_mm_maskz_fixupimm_round_sd_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm_maskz_fixupimm_round_sd_envp; \ - int simde_mm_maskz_fixupimm_round_sd_x = feholdexcept(&simde_mm_maskz_fixupimm_round_sd_envp); \ - simde_mm_maskz_fixupimm_round_sd_r = simde_mm_maskz_fixupimm_sd(k, a, b, c, imm8); \ - if (HEDLEY_LIKELY(simde_mm_maskz_fixupimm_round_sd_x == 0)) \ - fesetenv(&simde_mm_maskz_fixupimm_round_sd_envp); \ - } \ - else { \ - simde_mm_maskz_fixupimm_round_sd_r = simde_mm_maskz_fixupimm_sd(k, a, b, c, imm8); \ - } \ - \ - simde_mm_maskz_fixupimm_round_sd_r; \ - })) - #else - #define simde_mm_maskz_fixupimm_round_sd(k, a, b, c, imm8, sae) simde_mm_maskz_fixupimm_sd(k, a, b, c, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128d - simde_mm_maskz_fixupimm_round_sd (simde__mmask8 k, simde__m128d a, simde__m128d b, simde__m128i c, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m128d r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm_maskz_fixupimm_sd(k, a, b, c, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm_maskz_fixupimm_sd(k, a, b, c, imm8); - #endif - } - else { - r = simde_mm_maskz_fixupimm_sd(k, a, b, c, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_fixupimm_round_sd - #define _mm_maskz_fixupimm_round_sd(k, a, b, c, imm8, sae) simde_mm_maskz_fixupimm_round_sd(k, a, b, c, imm8, sae) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_FIXUPIMM_ROUND_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/flushsubnormal.h b/ffi-deps/simde/simde/x86/avx512/flushsubnormal.h deleted file mode 100644 index 6830e7c..0000000 --- a/ffi-deps/simde/simde/x86/avx512/flushsubnormal.h +++ /dev/null @@ -1,91 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_FLUSHSUBNORMAL_H) -#define SIMDE_X86_AVX512_FLUSHSUBNORMAL_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_flushsubnormal_ps (simde__m128 a) { - simde__m128_private a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - a_.f32[i] = simde_math_issubnormalf(a_.f32[i]) ? 0 : a_.f32[i]; - } - - return simde__m128_from_private(a_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_x_mm256_flushsubnormal_ps (simde__m256 a) { - simde__m256_private a_ = simde__m256_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - a_.f32[i] = simde_math_issubnormalf(a_.f32[i]) ? 0 : a_.f32[i]; - } - - return simde__m256_from_private(a_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_x_mm512_flushsubnormal_ps (simde__m512 a) { - simde__m512_private a_ = simde__m512_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - a_.f32[i] = simde_math_issubnormalf(a_.f32[i]) ? 
0 : a_.f32[i]; - } - - return simde__m512_from_private(a_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_flushsubnormal_pd (simde__m128d a) { - simde__m128d_private a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - a_.f64[i] = simde_math_issubnormal(a_.f64[i]) ? 0 : a_.f64[i]; - } - - return simde__m128d_from_private(a_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_x_mm256_flushsubnormal_pd (simde__m256d a) { - simde__m256d_private a_ = simde__m256d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - a_.f64[i] = simde_math_issubnormal(a_.f64[i]) ? 0 : a_.f64[i]; - } - - return simde__m256d_from_private(a_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_x_mm512_flushsubnormal_pd (simde__m512d a) { - simde__m512d_private a_ = simde__m512d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - a_.f64[i] = simde_math_issubnormal(a_.f64[i]) ? 0 : a_.f64[i]; - } - - return simde__m512d_from_private(a_); -} - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_FLUSHSUBNORMAL_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/fmadd.h b/ffi-deps/simde/simde/x86/avx512/fmadd.h deleted file mode 100644 index 0a89b4c..0000000 --- a/ffi-deps/simde/simde/x86/avx512/fmadd.h +++ /dev/null @@ -1,136 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - */ - -#if !defined(SIMDE_X86_AVX512_FMADD_H) -#define SIMDE_X86_AVX512_FMADD_H - -#include "types.h" -#include "mov.h" -#include "../fma.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_fmadd_ps (simde__m512 a, simde__m512 b, simde__m512 c) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_fmadd_ps(a, b, c); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b), - c_ = simde__m512_to_private(c); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_fmadd_ps(a_.m256[i], b_.m256[i], c_.m256[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = (a_.f32 * b_.f32) + c_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (a_.f32[i] * b_.f32[i]) + c_.f32[i]; - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_fmadd_ps - #define _mm512_fmadd_ps(a, b, c) simde_mm512_fmadd_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_fmadd_ps(simde__m512 a, simde__mmask16 k, simde__m512 b, simde__m512 c) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_fmadd_ps(a, k, b, c); - #else - return simde_mm512_mask_mov_ps(a, k, simde_mm512_fmadd_ps(a, b, c)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_fmadd_ps - #define _mm512_mask_fmadd_ps(a, k, b, c) simde_mm512_mask_fmadd_ps(a, k, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_fmadd_ps(simde__mmask16 k, simde__m512 a, simde__m512 b, simde__m512 c) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_fmadd_ps(k, a, b, c); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_fmadd_ps(a, b, c)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_fmadd_ps - #define _mm512_maskz_fmadd_ps(k, a, b, c) simde_mm512_maskz_fmadd_ps(k, a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_fmadd_pd (simde__m512d a, simde__m512d b, simde__m512d c) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_fmadd_pd(a, b, c); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b), - c_ = simde__m512d_to_private(c); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_fmadd_pd(a_.m256d[i], b_.m256d[i], c_.m256d[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = (a_.f64 * b_.f64) + c_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (a_.f64[i] * b_.f64[i]) + c_.f64[i]; - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_fmadd_pd - #define _mm512_fmadd_pd(a, b, c) simde_mm512_fmadd_pd(a, b, c) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_FMADD_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/fmsub.h b/ffi-deps/simde/simde/x86/avx512/fmsub.h deleted file mode 100644 index 4f52d40..0000000 --- a/ffi-deps/simde/simde/x86/avx512/fmsub.h +++ /dev/null @@ -1,276 +0,0 @@ -/* SPDX-License-Identifier: MIT 
- * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 kitegi - */ - -#if !defined(SIMDE_X86_AVX512_FMSUB_H) -#define SIMDE_X86_AVX512_FMSUB_H - -#include "types.h" -#include "mov.h" -#include "../fma.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_mask3_fmsub_pd (simde__m256d a, simde__m256d b, simde__m256d c, simde__mmask8 k) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask3_fmsub_pd(a, b, c, k); - #else - return simde_mm256_mask_mov_pd(c, k, simde_mm256_fmsub_pd(a, b, c)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask3_fmsub_pd - #define _mm256_mask3_fmsub_pd(a, b, c, k) simde_mm256_mask3_fmsub_pd(a, b, c, k) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_mask_fmsub_pd (simde__m256d a, simde__mmask8 k, simde__m256d b, simde__m256d c) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_fmsub_pd(a, k, b, c); - #else - return simde_mm256_mask_mov_pd(a, k, simde_mm256_fmsub_pd(a, b, c)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_fmsub_pd - #define _mm256_mask_fmsub_pd(a, k, b, c) simde_mm256_mask_fmsub_pd(a, k, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_maskz_fmsub_pd (simde__mmask8 k, simde__m256d a, simde__m256d b, simde__m256d c) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_fmsub_pd(k, a, b, c); - #else - return simde_mm256_maskz_mov_pd(k, simde_mm256_fmsub_pd(a, b, c)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_fmsub_pd - #define _mm256_maskz_fmsub_pd(k, a, b, c) simde_mm256_maskz_fmsub_pd(k, a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mask3_fmsub_pd (simde__m128d a, simde__m128d b, simde__m128d c, simde__mmask8 k) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask3_fmsub_pd(a, b, c, k); - #else - return simde_mm_mask_mov_pd(c, k, simde_mm_fmsub_pd(a, b, c)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask3_fmsub_pd - #define _mm_mask3_fmsub_pd(a, b, c, k) simde_mm_mask3_fmsub_pd(a, b, c, k) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mask_fmsub_pd (simde__m128d a, simde__mmask8 k, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_fmsub_pd(a, k, b, c); - #else - 
return simde_mm_mask_mov_pd(a, k, simde_mm_fmsub_pd(a, b, c)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_fmsub_pd - #define _mm_mask_fmsub_pd(a, k, b, c) simde_mm_mask_fmsub_pd(a, k, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_maskz_fmsub_pd (simde__mmask8 k, simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_fmsub_pd(k, a, b, c); - #else - return simde_mm_maskz_mov_pd(k, simde_mm_fmsub_pd(a, b, c)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_fmsub_pd - #define _mm_maskz_fmsub_pd(k, a, b, c) simde_mm_maskz_fmsub_pd(k, a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_mask3_fmsub_ps (simde__m256 a, simde__m256 b, simde__m256 c, simde__mmask8 k) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask3_fmsub_ps(a, b, c, k); - #else - return simde_mm256_mask_mov_ps(c, k, simde_mm256_fmsub_ps(a, b, c)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask3_fmsub_ps - #define _mm256_mask3_fmsub_ps(a, b, c, k) simde_mm256_mask3_fmsub_ps(a, b, c, k) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_mask_fmsub_ps (simde__m256 a, simde__mmask8 k, simde__m256 b, simde__m256 c) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_fmsub_ps(a, k, b, c); - #else - return simde_mm256_mask_mov_ps(a, k, simde_mm256_fmsub_ps(a, b, c)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_fmsub_ps - #define _mm256_mask_fmsub_ps(a, k, b, c) simde_mm256_mask_fmsub_ps(a, k, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_maskz_fmsub_ps (simde__mmask8 k, simde__m256 a, simde__m256 b, simde__m256 c) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_fmsub_ps(k, a, b, c); - #else - return simde_mm256_maskz_mov_ps(k, simde_mm256_fmsub_ps(a, b, c)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_fmsub_ps - #define _mm256_maskz_fmsub_ps(k, a, b, c) simde_mm256_maskz_fmsub_ps(k, a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mask3_fmsub_ps (simde__m128 a, simde__m128 b, simde__m128 c, simde__mmask8 k) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask3_fmsub_ps(a, b, c, k); - #else - return simde_mm_mask_mov_ps(c, k, simde_mm_fmsub_ps(a, b, c)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask3_fmsub_ps - #define _mm_mask3_fmsub_ps(a, b, c, k) simde_mm_mask3_fmsub_ps(a, b, c, k) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mask_fmsub_ps (simde__m128 a, simde__mmask8 k, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_fmsub_ps(a, k, b, c); - #else - return simde_mm_mask_mov_ps(a, k, simde_mm_fmsub_ps(a, b, c)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_fmsub_ps - #define _mm_mask_fmsub_ps(a, k, b, c) simde_mm_mask_fmsub_ps(a, k, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_maskz_fmsub_ps (simde__mmask8 k, simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_fmsub_ps(k, a, b, c); - #else - return simde_mm_maskz_mov_ps(k, simde_mm_fmsub_ps(a, b, c)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_fmsub_ps - #define _mm_maskz_fmsub_ps(k, a, b, c) simde_mm_maskz_fmsub_ps(k, a, b, c) 
-#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_fmsub_ps (simde__m512 a, simde__m512 b, simde__m512 c) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_fmsub_ps(a, b, c); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b), - c_ = simde__m512_to_private(c); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_fmsub_ps(a_.m256[i], b_.m256[i], c_.m256[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = (a_.f32 * b_.f32) - c_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (a_.f32[i] * b_.f32[i]) - c_.f32[i]; - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_fmsub_ps - #define _mm512_fmsub_ps(a, b, c) simde_mm512_fmsub_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_fmsub_pd (simde__m512d a, simde__m512d b, simde__m512d c) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_fmsub_pd(a, b, c); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b), - c_ = simde__m512d_to_private(c); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_fmsub_pd(a_.m256d[i], b_.m256d[i], c_.m256d[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = (a_.f64 * b_.f64) - c_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (a_.f64[i] * b_.f64[i]) - c_.f64[i]; - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_fmsub_pd - #define _mm512_fmsub_pd(a, b, c) simde_mm512_fmsub_pd(a, b, c) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_FMSUB_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/fnmadd.h b/ffi-deps/simde/simde/x86/avx512/fnmadd.h deleted file mode 100644 index 6779dbd..0000000 --- a/ffi-deps/simde/simde/x86/avx512/fnmadd.h +++ /dev/null @@ -1,108 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 kitegi - */ - -#if !defined(SIMDE_X86_AVX512_FNMADD_H) -#define SIMDE_X86_AVX512_FNMADD_H - -#include "types.h" -#include "mov.h" -#include "../fma.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_fnmadd_ps (simde__m512 a, simde__m512 b, simde__m512 c) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_fnmadd_ps(a, b, c); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b), - c_ = simde__m512_to_private(c); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_fnmadd_ps(a_.m256[i], b_.m256[i], c_.m256[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = -(a_.f32 * b_.f32) + c_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = -(a_.f32[i] * b_.f32[i]) + c_.f32[i]; - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_fnmadd_ps - #define _mm512_fnmadd_ps(a, b, c) simde_mm512_fnmadd_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_fnmadd_pd (simde__m512d a, simde__m512d b, simde__m512d c) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_fnmadd_pd(a, b, c); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b), - c_ = simde__m512d_to_private(c); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_fnmadd_pd(a_.m256d[i], b_.m256d[i], c_.m256d[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = -(a_.f64 * b_.f64) + c_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = -(a_.f64[i] * b_.f64[i]) + c_.f64[i]; - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_fnmadd_pd - #define _mm512_fnmadd_pd(a, b, c) simde_mm512_fnmadd_pd(a, b, c) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_FNMADD_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/fnmsub.h b/ffi-deps/simde/simde/x86/avx512/fnmsub.h deleted file mode 100644 index 8d969de..0000000 --- a/ffi-deps/simde/simde/x86/avx512/fnmsub.h +++ /dev/null @@ -1,108 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 kitegi - */ - -#if !defined(SIMDE_X86_AVX512_FNMSUB_H) -#define SIMDE_X86_AVX512_FNMSUB_H - -#include "types.h" -#include "mov.h" -#include "../fma.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_fnmsub_ps (simde__m512 a, simde__m512 b, simde__m512 c) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_fnmsub_ps(a, b, c); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b), - c_ = simde__m512_to_private(c); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_fnmsub_ps(a_.m256[i], b_.m256[i], c_.m256[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = -(a_.f32 * b_.f32) - c_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = -(a_.f32[i] * b_.f32[i]) - c_.f32[i]; - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_fnmsub_ps - #define _mm512_fnmsub_ps(a, b, c) simde_mm512_fnmsub_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_fnmsub_pd (simde__m512d a, simde__m512d b, simde__m512d c) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_fnmsub_pd(a, b, c); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b), - c_ = simde__m512d_to_private(c); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_fnmsub_pd(a_.m256d[i], b_.m256d[i], c_.m256d[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = -(a_.f64 * b_.f64) - c_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = -(a_.f64[i] * b_.f64[i]) - c_.f64[i]; - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_fnmsub_pd - #define _mm512_fnmsub_pd(a, b, c) simde_mm512_fnmsub_pd(a, b, c) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_FNMSUB_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/fpclass.h b/ffi-deps/simde/simde/x86/avx512/fpclass.h deleted file mode 100644 index 1765570..0000000 --- a/ffi-deps/simde/simde/x86/avx512/fpclass.h +++ /dev/null @@ -1,99 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2023 Michael R. Crusoe - */ - -#if !defined(SIMDE_X86_AVX512_FPCLASS_H) -#define SIMDE_X86_AVX512_FPCLASS_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_fpclass_ps_mask(simde__m256 a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 0x88) { - simde__mmask8 r = 0; - simde__m256_private a_ = simde__m256_to_private(a); - - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r |= simde_math_fpclassf(a_.f32[i], imm8) ? (UINT8_C(1) << i) : 0; - } - return r; -} -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) -# define simde_mm256_fpclass_ps_mask(a, imm8) _mm256_fpclass_ps_mask((a), (imm8)) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) -# undef _mm256_fpclass_ps_mask -# define _mm256_fpclass_ps_mask(a, imm8) simde_mm256_fpclass_ps_mask((a), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_fpclass_ph_mask(simde__m512h a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 0x88) { - simde__mmask32 r = 0; - simde__m512h_private a_ = simde__m512h_to_private(a); - - for (size_t i = 0 ; i < (sizeof(a_.f16) / sizeof(a_.f16[0])) ; i++) { - r |= simde_fpclasshf(a_.f16[i], imm8) ? (UINT8_C(1) << i) : 0; - } - return r; -} -#if defined(SIMDE_X86_AVX512FP16_NATIVE) -# define simde_mm512_fpclass_ph_mask(a, imm8) _mm512_fpclass_ph_mask((a), (imm8)) -#endif -#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) -# undef _mm512_fpclass_ph_mask -# define _mm512_fpclass_ph_mask(a, imm8) simde_mm512_fpclass_ph_mask((a), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_fpclass_pd_mask(simde__m512d a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 0x88) { - simde__mmask8 r = 0; - simde__m512d_private a_ = simde__m512d_to_private(a); - - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - r |= simde_math_fpclass(a_.f64[i], imm8) ? 
(UINT8_C(1) << i) : 0; - } - return r; -} -#if defined(SIMDE_X86_AVX512DQ_NATIVE) -# define simde_mm512_fpclass_pd_mask(a, imm8) _mm512_fpclass_pd_mask((a), (imm8)) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) -# undef _mm512_fpclass_pd_mask -# define _mm512_fpclass_pd_mask(a, imm8) simde_mm512_fpclass_pd_mask((a), (imm8)) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_FPCLASS_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/gather.h b/ffi-deps/simde/simde/x86/avx512/gather.h deleted file mode 100644 index 8dec2ee..0000000 --- a/ffi-deps/simde/simde/x86/avx512/gather.h +++ /dev/null @@ -1,312 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2023 Michael R. 
Crusoe - */ - -#if !defined(SIMDE_X86_AVX512_GATHER_H) -#define SIMDE_X86_AVX512_GATHER_H - -#include "types.h" -#include "../avx2.h" -#include "extract.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_i32gather_ps(simde__m512i vindex, const void* base_addr, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m512i_private vindex_ = simde__m512i_to_private(vindex); - simde__m512_private r_ = simde__m512_to_private(simde_mm512_setzero_ps()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i32) / sizeof(vindex_.i32[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i32[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float32 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f32[i] = dst; - } - - return simde__m512_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(10,0,0)) - #define simde_mm512_i32gather_ps(vindex, base_addr, scale) _mm512_i32gather_ps((vindex), (base_addr), (scale)) -#elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_i32gather_ps(vindex, base_addr, scale) SIMDE_STATEMENT_EXPR_(({\ - simde__m512_private simde_mm512_i32gather_ps_r_; \ - simde__m512i_private simde_mm512_i32gather_ps_vindex_ = simde__m512i_to_private((vindex)); \ - simde_mm512_i32gather_ps_r_.m256[0] = _mm256_i32gather_ps( \ - HEDLEY_STATIC_CAST(float const*, (base_addr)), simde_mm512_i32gather_ps_vindex_.m256i[0], (scale)); \ - simde_mm512_i32gather_ps_r_.m256[1] = _mm256_i32gather_ps( \ - HEDLEY_STATIC_CAST(float const*, (base_addr)), simde_mm512_i32gather_ps_vindex_.m256i[1], (scale)); \ - simde__m512_from_private(simde_mm512_i32gather_ps_r_); \ - })) -#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_i32gather_ps(vindex, base_addr, scale) \ - simde_x_mm512_set_m256( \ - _mm256_i32gather_ps(HEDLEY_STATIC_CAST(float const*, (base_addr)), \ - simde_mm512_extracti32x8_epi32((vindex), 1), (scale)), \ - _mm256_i32gather_ps(HEDLEY_STATIC_CAST(float const*, (base_addr)), \ - simde_mm512_extracti32x8_epi32((vindex), 0), (scale)) ) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_i32gather_ps - #define _mm512_i32gather_ps(vindex, base_addr, scale) simde_mm512_i32gather_ps((vindex), (base_addr), (scale)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm512_i64gather_epi32(simde__m512i vindex, const void* base_addr, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m512i_private vindex_; - simde__m256i_private r_; - vindex_ = simde__m512i_to_private(vindex); - r_ = simde__m256i_to_private(simde_mm256_setzero_si256()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int32_t dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.i32[i] = dst; - } - - return simde__m256i_from_private(r_); -} -#if 
defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_i64gather_epi32(vindex, base_addr, scale) _mm512_i64gather_epi32((vindex), (base_addr), (scale)) -#elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_i64gather_epi32(vindex, base_addr, scale) SIMDE_STATEMENT_EXPR_(({\ - simde__m256i_private simde_mm512_i64gather_epi32_r_; \ - simde__m512i_private simde_mm512_i64gather_epi32_vindex_ = simde__m512i_to_private((vindex)); \ - simde_mm512_i64gather_epi32_r_.m128i[0] = _mm256_i64gather_epi32( \ - HEDLEY_STATIC_CAST(int const*, (base_addr)), simde_mm512_i64gather_epi32_vindex_.m256i[0], (scale)); \ - simde_mm512_i64gather_epi32_r_.m128i[1] = _mm256_i64gather_epi32( \ - HEDLEY_STATIC_CAST(int const*, (base_addr)), simde_mm512_i64gather_epi32_vindex_.m256i[1], (scale)); \ - simde__m256i_from_private(simde_mm512_i64gather_epi32_r_); \ - })) -#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_i64gather_epi32(vindex, base_addr, scale) \ - _mm256_insertf128_si256( \ - _mm256_castsi128_si256( \ - _mm256_i64gather_epi32(HEDLEY_STATIC_CAST(int const*, (base_addr)), \ - simde_mm512_extracti64x4_epi64((vindex), 0), (scale))), \ - _mm256_i64gather_epi32(HEDLEY_STATIC_CAST(int const*, (base_addr)), \ - simde_mm512_extracti64x4_epi64((vindex), 1), (scale)), \ - 1) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_i64gather_epi32 - #define _mm512_i64gather_epi32(vindex, base_addr, scale) simde_mm512_i64gather_epi32((vindex), (base_addr), (scale)) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_i64gather_epi32(src, k, vindex, base_addr, scale) _mm512_mask_i64gather_epi32((src), (k), (vindex), (base_addr), (scale)) -#else - #define simde_mm512_mask_i64gather_epi32(src, k, vindex, base_addr, scale) simde_mm256_mask_mov_epi32(src, k, simde_mm512_i64gather_epi32((vindex), (base_addr), (scale))) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_i64gather_epi32 - #define _mm512_mask_i64gather_epi32(src, k, vindex, base_addr, scale) simde_mm512_mask_i64gather_epi32((src), (k), (vindex), (base_addr), (scale)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_i64gather_epi64(simde__m512i vindex, const void* base_addr, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m512i_private - vindex_ = simde__m512i_to_private(vindex), - r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - int64_t dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.i64[i] = dst; - } - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_i64gather_epi64(vindex, base_addr, scale) _mm512_i64gather_epi64((vindex), (base_addr), (scale)) -#elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_i64gather_epi64(vindex, base_addr, scale) SIMDE_STATEMENT_EXPR_(({\ - simde__m512i_private simde_mm512_i64gather_epi64_r_, \ - simde_mm512_i64gather_epi64_vindex_ = simde__m512i_to_private((vindex)); \ - simde_mm512_i64gather_epi64_r_.m256i[0] = 
_mm256_i64gather_epi64( \ - HEDLEY_STATIC_CAST(long long const*, (base_addr)), simde_mm512_i64gather_epi64_vindex_.m256i[0], (scale)); \ - simde_mm512_i64gather_epi64_r_.m256i[1] = _mm256_i64gather_epi64( \ - HEDLEY_STATIC_CAST(long long const*, (base_addr)), simde_mm512_i64gather_epi64_vindex_.m256i[1], (scale)); \ - simde__m512i_from_private(simde_mm512_i64gather_epi64_r_); \ - })) -#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_i64gather_epi64(vindex, base_addr, scale) \ - simde_x_mm512_set_m256i( \ - _mm256_i64gather_epi64(HEDLEY_STATIC_CAST(long long const*, (base_addr)), \ - simde_mm512_extracti32x8_epi32((vindex), 1), (scale)), \ - _mm256_i64gather_epi64(HEDLEY_STATIC_CAST(long long const*, (base_addr)), \ - simde_mm512_extracti32x8_epi32((vindex), 0), (scale)) ) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_i64gather_epi64 - #define _mm512_i64gather_epi64(vindex, base_addr, scale) simde_mm512_i64gather_epi64(vindex, (base_addr), (scale)) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_i64gather_epi64(src, k, vindex, base_addr, scale) _mm512_mask_i64gather_epi64((src), (k), (vindex), (base_addr), (scale)) -#else - #define simde_mm512_mask_i64gather_epi64(src, k, vindex, base_addr, scale) simde_mm512_mask_mov_epi64((src), (k), simde_mm512_i64gather_epi64((vindex), (base_addr), (scale))) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_i64gather_epi64 - #define _mm512_mask_i64gather_epi64(src, k, vindex, base_addr, scale) simde_mm512_mask_i64gather_epi64((src), (k), (vindex), (base_addr), (scale)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_i64gather_pd(simde__m512i vindex, const void* base_addr, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m512i_private vindex_; - simde__m512d_private r_; - vindex_ = simde__m512i_to_private(vindex); - r_ = simde__m512d_to_private(simde_mm512_setzero_pd()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float64 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f64[i] = dst; - } - - return simde__m512d_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_i64gather_pd(vindex, base_addr, scale) _mm512_i64gather_pd((vindex), (base_addr), (scale)) -#elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_i64gather_pd(vindex, base_addr, scale) SIMDE_STATEMENT_EXPR_(({\ - simde__m512d_private simde_mm512_i64gather_pd_r_; \ - simde__m512i_private simde_mm512_i64gather_pd_vindex_ = simde__m512i_to_private((vindex)); \ - simde_mm512_i64gather_pd_r_.m256d[0] = _mm256_i64gather_pd( \ - HEDLEY_STATIC_CAST(double const*, (base_addr)), simde_mm512_i64gather_pd_vindex_.m256i[0], (scale)); \ - simde_mm512_i64gather_pd_r_.m256d[1] = _mm256_i64gather_pd( \ - HEDLEY_STATIC_CAST(double const*, (base_addr)), simde_mm512_i64gather_pd_vindex_.m256i[1], (scale)); \ - simde__m512d_from_private(simde_mm512_i64gather_pd_r_); \ - })) -#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_i64gather_pd(vindex, base_addr, scale) 
\ - simde_x_mm512_set_m256d( \ - _mm256_i64gather_pd(HEDLEY_STATIC_CAST(double const*, (base_addr)), \ - simde_mm512_extracti64x4_epi64((vindex), 1), (scale)), \ - _mm256_i64gather_pd(HEDLEY_STATIC_CAST(double const*, (base_addr)), \ - simde_mm512_extracti64x4_epi64((vindex), 0), (scale)) ) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_i64gather_pd - #define _mm512_i64gather_pd(vindex, base_addr, scale) simde_mm512_i64gather_pd((vindex), (base_addr), (scale)) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_i64gather_pd(src, k, vindex, base_addr, scale) _mm512_mask_i64gather_pd((src), (k), (vindex), (base_addr), (scale)) -#else - #define simde_mm512_mask_i64gather_pd(src, k, vindex, base_addr, scale) simde_mm512_mask_mov_pd((src), (k), simde_mm512_i64gather_pd((vindex), (base_addr), (scale))) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_i64gather_pd - #define _mm512_mask_i64gather_pd(src, k, vindex, base_addr, scale) simde_mm512_mask_i64gather_pd((src), (k), (vindex), (base_addr), (scale)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm512_i64gather_ps(simde__m512i vindex, const void* base_addr, const int32_t scale) - SIMDE_REQUIRE_CONSTANT(scale) - HEDLEY_REQUIRE_MSG((scale && scale <= 8 && !(scale & (scale - 1))), "`scale' must be a power of two less than or equal to 8") { - simde__m512i_private vindex_; - simde__m256_private r_; - vindex_ = simde__m512i_to_private(vindex); - r_ = simde__m256_to_private(simde_mm256_setzero_ps()); - const uint8_t* addr = HEDLEY_REINTERPRET_CAST(const uint8_t*, base_addr); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(vindex_.i64) / sizeof(vindex_.i64[0])) ; i++) { - const uint8_t* src = addr + (HEDLEY_STATIC_CAST(size_t , vindex_.i64[i]) * HEDLEY_STATIC_CAST(size_t , scale)); - simde_float32 dst; - simde_memcpy(&dst, src, sizeof(dst)); - r_.f32[i] = dst; - } - - return simde__m256_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_i64gather_ps(vindex, base_addr, scale) _mm512_i64gather_ps((vindex), (base_addr), (scale)) -#elif defined(SIMDE_X86_AVX2_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_i64gather_ps(vindex, base_addr, scale) SIMDE_STATEMENT_EXPR_(({\ - simde__m256_private simde_mm512_i64gather_ps_r_; \ - simde__m512i_private simde_mm512_i64gather_ps_vindex_ = simde__m512i_to_private((vindex)); \ - simde_mm512_i64gather_ps_r_.m128[0] = _mm256_i64gather_ps( \ - HEDLEY_STATIC_CAST(float const*, (base_addr)), simde_mm512_i64gather_ps_vindex_.m256i[0], (scale)); \ - simde_mm512_i64gather_ps_r_.m128[1] = _mm256_i64gather_ps( \ - HEDLEY_STATIC_CAST(float const*, (base_addr)), simde_mm512_i64gather_ps_vindex_.m256i[1], (scale)); \ - simde__m256_from_private(simde_mm512_i64gather_ps_r_); \ - })) -#elif defined(SIMDE_X86_AVX2_NATIVE) && !defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_i64gather_ps(vindex, base_addr, scale) \ - _mm256_insertf128_ps( \ - _mm256_castps128_ps256( \ - _mm256_i64gather_ps(HEDLEY_STATIC_CAST(float const*, (base_addr)), \ - simde_mm512_extracti64x4_epi64((vindex), 0), (scale))), \ - _mm256_i64gather_ps(HEDLEY_STATIC_CAST(float const*, (base_addr)), \ - simde_mm512_extracti64x4_epi64((vindex), 1), (scale)), \ - 1) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_i64gather_ps - #define _mm512_i64gather_ps(vindex, base_addr, scale) simde_mm512_i64gather_ps((vindex), (base_addr), (scale)) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - 
#define simde_mm512_mask_i64gather_ps(src, k, vindex, base_addr, scale) _mm512_mask_i64gather_ps((src), (k), (vindex), (base_addr), (scale)) -#else - #define simde_mm512_mask_i64gather_ps(src, k, vindex, base_addr, scale) simde_mm256_mask_mov_ps((src), (k), simde_mm512_i64gather_ps((vindex), (base_addr), (scale))) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_i64gather_ps - #define _mm512_mask_i64gather_ps(src, k, vindex, base_addr, scale) simde_mm512_mask_i64gather_ps((src), (k), (vindex), (base_addr), (scale)) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_GATHER_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/insert.h b/ffi-deps/simde/simde/x86/avx512/insert.h deleted file mode 100644 index 67120d3..0000000 --- a/ffi-deps/simde/simde/x86/avx512/insert.h +++ /dev/null @@ -1,490 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Christopher Moore - */ - -#if !defined(SIMDE_X86_AVX512_INSERT_H) -#define SIMDE_X86_AVX512_INSERT_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_insertf32x4 (simde__m512 a, simde__m128 b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - simde__m512 r; - switch(imm8) { - case 0: r = _mm512_insertf32x4(a, b, 0); break; - case 1: r = _mm512_insertf32x4(a, b, 1); break; - case 2: r = _mm512_insertf32x4(a, b, 2); break; - case 3: r = _mm512_insertf32x4(a, b, 3); break; - default: HEDLEY_UNREACHABLE(); r = simde_mm512_setzero_ps(); break; - } - return r; - #else - simde__m512_private a_ = simde__m512_to_private(a); - - a_.m128[imm8 & 3] = b; - - return simde__m512_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_insertf32x4 - #define _mm512_insertf32x4(a, b, imm8) simde_mm512_insertf32x4(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_insertf32x4 (simde__m512 src, simde__mmask16 k, simde__m512 a, simde__m128 b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m512 r; - - #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) - SIMDE_CONSTIFY_4_(_mm512_mask_insertf32x4, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_ps ()), imm8, src, k, a, b); - return r; - #else - SIMDE_CONSTIFY_4_(simde_mm512_insertf32x4, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_ps ()), imm8, a, b); - return simde_mm512_mask_mov_ps(src, k, r); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_insertf32x4 - #define _mm512_mask_insertf32x4(src, k, a, b, imm8) simde_mm512_mask_insertf32x4(src, k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_insertf32x4 (simde__mmask16 k, simde__m512 a, simde__m128 b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m512 r; - - #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) - SIMDE_CONSTIFY_4_(_mm512_maskz_insertf32x4, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_ps ()), imm8, k, a, b); - return r; - #else - SIMDE_CONSTIFY_4_(simde_mm512_insertf32x4, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_ps ()), imm8, a, b); - return simde_mm512_maskz_mov_ps(k, r); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_insertf32x4 - #define _mm512_maskz_insertf32x4(k, a, b, imm8) simde_mm512_maskz_insertf32x4(k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_insertf64x4 (simde__m512d a, simde__m256d b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m512d_private a_ = simde__m512d_to_private(a); - - a_.m256d[imm8 & 1] = b; - - return simde__m512d_from_private(a_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_insertf64x4(a, b, imm8) _mm512_insertf64x4(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_insertf64x4 - #define _mm512_insertf64x4(a, b, imm8) simde_mm512_insertf64x4(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_insertf64x4 (simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m256d b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m512d r; - - #if defined(SIMDE_X86_AVX512F_NATIVE) - 
SIMDE_CONSTIFY_2_(_mm512_mask_insertf64x4, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_pd ()), imm8, src, k, a, b); - return r; - #else - SIMDE_CONSTIFY_2_(simde_mm512_insertf64x4, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_pd ()), imm8, a, b); - return simde_mm512_mask_mov_pd(src, k, r); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_insertf64x4 - #define _mm512_mask_insertf64x4(src, k, a, b, imm8) simde_mm512_mask_insertf64x4(src, k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_insertf64x4 (simde__mmask8 k, simde__m512d a, simde__m256d b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m512d r; - - #if defined(SIMDE_X86_AVX512F_NATIVE) - SIMDE_CONSTIFY_2_(_mm512_maskz_insertf64x4, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_pd ()), imm8, k, a, b); - return r; - #else - SIMDE_CONSTIFY_2_(simde_mm512_insertf64x4, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_pd ()), imm8, a, b); - return simde_mm512_maskz_mov_pd(k, r); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_insertf64x4 - #define _mm512_maskz_insertf64x4(k, a, b, imm8) simde_mm512_maskz_insertf64x4(k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_inserti32x4 (simde__m512i a, simde__m128i b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m512i_private a_ = simde__m512i_to_private(a); - - a_.m128i[imm8 & 3] = b; - - return simde__m512i_from_private(a_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_inserti32x4(a, b, imm8) _mm512_inserti32x4(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_inserti32x4 - #define _mm512_inserti32x4(a, b, imm8) simde_mm512_inserti32x4(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_inserti32x4 (simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m128i b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m512i r; - - #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) - SIMDE_CONSTIFY_4_(_mm512_mask_inserti32x4, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_si512 ()), imm8, src, k, a, b); - return r; - #else - SIMDE_CONSTIFY_4_(simde_mm512_inserti32x4, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_si512 ()), imm8, a, b); - return simde_mm512_mask_mov_epi32(src, k, r); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_inserti32x4 - #define _mm512_mask_inserti32x4(src, k, a, b, imm8) simde_mm512_mask_inserti32x4(src, k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_inserti32x4 (simde__mmask16 k, simde__m512i a, simde__m128i b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m512i r; - - #if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,0,0)) - SIMDE_CONSTIFY_4_(_mm512_maskz_inserti32x4, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_si512 ()), imm8, k, a, b); - return r; - #else - SIMDE_CONSTIFY_4_(simde_mm512_inserti32x4, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_si512 ()), imm8, a, b); - return simde_mm512_maskz_mov_epi32(k, r); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_inserti32x4 - #define _mm512_maskz_inserti32x4(k, a, b, imm8) simde_mm512_maskz_inserti32x4(k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_inserti64x4 (simde__m512i a, 
simde__m256i b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m512i_private a_ = simde__m512i_to_private(a); - - a_.m256i[imm8 & 1] = b; - - return simde__m512i_from_private(a_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_inserti64x4(a, b, imm8) _mm512_inserti64x4(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_inserti64x4 - #define _mm512_inserti64x4(a, b, imm8) simde_mm512_inserti64x4(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_inserti64x4 (simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m256i b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 2) { - simde__m512i r; - - #if defined(SIMDE_X86_AVX512F_NATIVE) - SIMDE_CONSTIFY_2_(_mm512_mask_inserti64x4, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_si512 ()), imm8, src, k, a, b); - return r; - #else - SIMDE_CONSTIFY_2_(simde_mm512_inserti64x4, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_si512 ()), imm8, a, b); - return simde_mm512_mask_mov_epi64(src, k, r); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_inserti64x4 - #define _mm512_mask_inserti64x4(src, k, a, b, imm8) simde_mm512_mask_inserti64x4(src, k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_inserti64x4 (simde__mmask8 k, simde__m512i a, simde__m256i b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 2) { - simde__m512i r; - - #if defined(SIMDE_X86_AVX512F_NATIVE) - SIMDE_CONSTIFY_2_(_mm512_maskz_inserti64x4, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_si512 ()), imm8, k, a, b); - return r; - #else - SIMDE_CONSTIFY_2_(simde_mm512_inserti64x4, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_si512 ()), imm8, a, b); - return simde_mm512_maskz_mov_epi64(k, r); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_inserti64x4 - #define _mm512_maskz_inserti64x4(k, a, b, imm8) simde_mm512_maskz_inserti64x4(k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_insertf32x8 (simde__m512 a, simde__m256 b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m512_private a_ = simde__m512_to_private(a); - - a_.m256[imm8 & 1] = b; - - return simde__m512_from_private(a_); -} -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_insertf32x8(a, b, imm8) _mm512_insertf32x8(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_insertf32x8 - #define _mm512_insertf32x8(a, b, imm8) simde_mm512_insertf32x8(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_insertf32x8(simde__m512 src, simde__mmask16 k, simde__m512 a, simde__m256 b, const int imm8) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - simde__m512 r; - SIMDE_CONSTIFY_2_(_mm512_mask_insertf32x8, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_ps ()), imm8, src, k, a, b); - return r; - #else - simde__m512 r; - SIMDE_CONSTIFY_2_(simde_mm512_insertf32x8, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_ps ()), imm8, a, b); - return simde_mm512_mask_mov_ps(src, k, r); - #endif - } -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_insertf32x8 - #define _mm512_mask_insertf32x8(src, k, a, b, imm8) simde_mm512_mask_insertf32x8(src, k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_insertf32x8(simde__mmask16 k, simde__m512 a, simde__m256 b, const int imm8) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - simde__m512 r; - 
SIMDE_CONSTIFY_2_(_mm512_maskz_insertf32x8, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_ps ()), imm8, k, a, b); - return r; - #else - simde__m512 r; - SIMDE_CONSTIFY_2_(simde_mm512_insertf32x8, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_ps ()), imm8, a, b); - return simde_mm512_maskz_mov_ps(k, r); - #endif - } -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_insertf32x8 - #define _mm512_maskz_insertf32x8(k, a, b, imm8) simde_mm512_maskz_insertf32x8(k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_insertf64x2 (simde__m512d a, simde__m128d b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m512d_private a_ = simde__m512d_to_private(a); - - a_.m128d[imm8 & 3] = b; - - return simde__m512d_from_private(a_); -} -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_insertf64x2(a, b, imm8) _mm512_insertf64x2(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_insertf64x2 - #define _mm512_insertf64x2(a, b, imm8) simde_mm512_insertf64x2(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_insertf64x2(simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m128d b, const int imm8) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - simde__m512d r; - SIMDE_CONSTIFY_4_(_mm512_mask_insertf64x2, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_pd ()), imm8, src, k, a, b); - return r; - #else - simde__m512d r; - SIMDE_CONSTIFY_4_(simde_mm512_insertf64x2, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_pd ()), imm8, a, b); - return simde_mm512_mask_mov_pd(src, k, r); - #endif - } -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_insertf64x2 - #define _mm512_mask_insertf64x2(src, k, a, b, imm8) simde_mm512_mask_insertf64x2(src, k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_insertf64x2(simde__mmask8 k, simde__m512d a, simde__m128d b, const int imm8) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - simde__m512d r; - SIMDE_CONSTIFY_4_(_mm512_maskz_insertf64x2, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_pd ()), imm8, k, a, b); - return r; - #else - simde__m512d r; - SIMDE_CONSTIFY_4_(simde_mm512_insertf64x2, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_pd ()), imm8, a, b); - return simde_mm512_maskz_mov_pd(k, r); - #endif - } -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_insertf64x2 - #define _mm512_maskz_insertf64x2(k, a, b, imm8) simde_mm512_maskz_insertf64x2(k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_inserti32x8 (simde__m512i a, simde__m256i b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m512i_private a_ = simde__m512i_to_private(a); - - a_.m256i[imm8 & 1] = b; - - return simde__m512i_from_private(a_); -} -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_inserti32x8(a, b, imm8) _mm512_inserti32x8(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_inserti32x8 - #define _mm512_inserti32x8(a, b, imm8) simde_mm512_inserti32x8(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_inserti32x8(simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m256i b, const int imm8) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - simde__m512i r; - SIMDE_CONSTIFY_2_(_mm512_mask_inserti32x8, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_epi32 ()), imm8, src, k, a, b); - return r; - #else - simde__m512i r; - SIMDE_CONSTIFY_2_(simde_mm512_inserti32x8, r, 
(HEDLEY_UNREACHABLE(), simde_mm512_setzero_epi32 ()), imm8, a, b); - return simde_mm512_mask_mov_epi32(src, k, r); - #endif - } -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_inserti32x8 - #define _mm512_mask_inserti32x8(src, k, a, b, imm8) simde_mm512_mask_inserti32x8(src, k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_inserti32x8(simde__mmask16 k, simde__m512i a, simde__m256i b, const int imm8) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - simde__m512i r; - SIMDE_CONSTIFY_2_(_mm512_maskz_inserti32x8, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_epi32 ()), imm8, k, a, b); - return r; - #else - simde__m512i r; - SIMDE_CONSTIFY_2_(simde_mm512_inserti32x8, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_epi32 ()), imm8, a, b); - return simde_mm512_maskz_mov_epi32(k, r); - #endif - } -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_inserti32x8 - #define _mm512_maskz_inserti32x8(k, a, b, imm8) simde_mm512_maskz_inserti32x8(k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_inserti64x2 (simde__m512i a, simde__m128i b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m512i_private a_ = simde__m512i_to_private(a); - - a_.m128i[imm8 & 3] = b; - - return simde__m512i_from_private(a_); -} -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_inserti64x2(a, b, imm8) _mm512_inserti64x2(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_inserti64x2 - #define _mm512_inserti64x2(a, b, imm8) simde_mm512_inserti64x2(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_inserti64x2(simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m128i b, const int imm8) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - simde__m512i r; - SIMDE_CONSTIFY_4_(_mm512_mask_inserti64x2, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_si512 ()), imm8, src, k, a, b); - return r; - #else - simde__m512i r; - SIMDE_CONSTIFY_4_(simde_mm512_inserti64x2, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_si512 ()), imm8, a, b); - return simde_mm512_mask_mov_epi64(src, k, r); - #endif - } -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_inserti64x2 - #define _mm512_mask_inserti64x2(src, k, a, b, imm8) simde_mm512_mask_inserti64x2(src, k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_inserti64x2(simde__mmask8 k, simde__m512i a, simde__m128i b, const int imm8) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - simde__m512i r; - SIMDE_CONSTIFY_4_(_mm512_maskz_inserti64x2, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_si512 ()), imm8, k, a, b); - return r; - #else - simde__m512i r; - SIMDE_CONSTIFY_4_(simde_mm512_inserti64x2, r, (HEDLEY_UNREACHABLE(), simde_mm512_setzero_si512 ()), imm8, a, b); - return simde_mm512_maskz_mov_epi64(k, r); - #endif - } -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_inserti64x2 - #define _mm512_maskz_inserti64x2(k, a, b, imm8) simde_mm512_maskz_inserti64x2(k, a, b, imm8) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_INSERT_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/kand.h b/ffi-deps/simde/simde/x86/avx512/kand.h deleted file mode 100644 index 7864100..0000000 --- a/ffi-deps/simde/simde/x86/avx512/kand.h +++ /dev/null @@ -1,53 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software 
and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2023 Michael R. Crusoe - */ - -#if !defined(SIMDE_X86_AVX512_KAND_H) -#define SIMDE_X86_AVX512_KAND_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_kand (simde__mmask16 a, simde__mmask16 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_kand(a, b); - #else - return a & b; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_kand - #define _mm512_kand(a, b) simde_mm512_kand((a), (b)) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_KAND_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/knot.h b/ffi-deps/simde/simde/x86/avx512/knot.h deleted file mode 100644 index 3b4696e..0000000 --- a/ffi-deps/simde/simde/x86/avx512/knot.h +++ /dev/null @@ -1,106 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2023 Michael R. 
Crusoe - */ - -#if !defined(SIMDE_X86_AVX512_KNOT_H) -#define SIMDE_X86_AVX512_KNOT_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_knot_mask8 (simde__mmask8 a) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) \ - && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - return _knot_mask8(a); - #else - return ~a; - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _knot_mask8 - #define _knot_mask8(a) simde_knot_mask8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_knot_mask16 (simde__mmask16 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) \ - && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - return _knot_mask16(a); - #else - return ~a; - #endif -} -#define simde_mm512_knot(a) simde_knot_mask16(a) -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _knot_mask16 - #undef _mm512_knot - #define _knot_mask16(a) simde_knot_mask16(a) - #define _mm512_knot(a) simde_knot_mask16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_knot_mask32 (simde__mmask32 a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) \ - && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - return _knot_mask32(a); - #else - return ~a; - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _knot_mask32 - #define _knot_mask32(a) simde_knot_mask32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_knot_mask64 (simde__mmask64 a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) \ - && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - return _knot_mask64(a); - #else - return ~a; - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _knot_mask64 - #define _knot_mask64(a) simde_knot_mask64(a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_KNOT_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/kshift.h b/ffi-deps/simde/simde/x86/avx512/kshift.h deleted file mode 100644 index 4dfe855..0000000 --- a/ffi-deps/simde/simde/x86/avx512/kshift.h +++ /dev/null @@ -1,152 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Christopher Moore - */ - -#if !defined(SIMDE_X86_AVX512_KSHIFT_H) -#define SIMDE_X86_AVX512_KSHIFT_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_kshiftli_mask16 (simde__mmask16 a, unsigned int count) - SIMDE_REQUIRE_CONSTANT_RANGE(count, 0, 255) { - return HEDLEY_STATIC_CAST(simde__mmask16, (count <= 15) ? (a << count) : 0); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && (!defined(SIMDE_DETECT_CLANG_VERSION) && SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) - #define simde_kshiftli_mask16(a, count) _kshiftli_mask16(a, count) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _kshiftli_mask16 - #define _kshiftli_mask16(a, count) simde_kshiftli_mask16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_kshiftli_mask32 (simde__mmask32 a, unsigned int count) - SIMDE_REQUIRE_CONSTANT_RANGE(count, 0, 255) { - return (count <= 31) ? (a << count) : 0; -} -#if defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && (!defined(SIMDE_DETECT_CLANG_VERSION) && SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) - #define simde_kshiftli_mask32(a, count) _kshiftli_mask32(a, count) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _kshiftli_mask32 - #define _kshiftli_mask32(a, count) simde_kshiftli_mask32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_kshiftli_mask64 (simde__mmask64 a, unsigned int count) - SIMDE_REQUIRE_CONSTANT_RANGE(count, 0, 255) { - return (count <= 63) ? (a << count) : 0; -} -#if defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && (!defined(SIMDE_DETECT_CLANG_VERSION) && SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) - #define simde_kshiftli_mask64(a, count) _kshiftli_mask64(a, count) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _kshiftli_mask64 - #define _kshiftli_mask64(a, count) simde_kshiftli_mask64(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_kshiftli_mask8 (simde__mmask8 a, unsigned int count) - SIMDE_REQUIRE_CONSTANT_RANGE(count, 0, 255) { - return HEDLEY_STATIC_CAST(simde__mmask8, (count <= 7) ? (a << count) : 0); -} -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && (!defined(SIMDE_DETECT_CLANG_VERSION) && SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) - #define simde_kshiftli_mask8(a, count) _kshiftli_mask8(a, count) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _kshiftli_mask8 - #define _kshiftli_mask8(a, count) simde_kshiftli_mask8(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_kshiftri_mask16 (simde__mmask16 a, unsigned int count) - SIMDE_REQUIRE_CONSTANT_RANGE(count, 0, 255) { - return HEDLEY_STATIC_CAST(simde__mmask16, (count <= 15) ? 
(a >> count) : 0); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && (!defined(SIMDE_DETECT_CLANG_VERSION) && SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) - #define simde_kshiftri_mask16(a, count) _kshiftri_mask16(a, count) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _kshiftri_mask16 - #define _kshiftri_mask16(a, count) simde_kshiftri_mask16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_kshiftri_mask32 (simde__mmask32 a, unsigned int count) - SIMDE_REQUIRE_CONSTANT_RANGE(count, 0, 255) { - return (count <= 31) ? (a >> count) : 0; -} -#if defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && (!defined(SIMDE_DETECT_CLANG_VERSION) && SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) - #define simde_kshiftri_mask32(a, count) _kshiftri_mask32(a, count) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _kshiftri_mask32 - #define _kshiftri_mask32(a, count) simde_kshiftri_mask32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_kshiftri_mask64 (simde__mmask64 a, unsigned int count) - SIMDE_REQUIRE_CONSTANT_RANGE(count, 0, 255) { - return (count <= 63) ? (a >> count) : 0; -} -#if defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && (!defined(SIMDE_DETECT_CLANG_VERSION) && SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) - #define simde_kshiftri_mask64(a, count) _kshiftri_mask64(a, count) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _kshiftri_mask64 - #define _kshiftri_mask64(a, count) simde_kshiftri_mask64(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_kshiftri_mask8 (simde__mmask8 a, unsigned int count) - SIMDE_REQUIRE_CONSTANT_RANGE(count, 0, 255) { - return HEDLEY_STATIC_CAST(simde__mmask8, (count <= 7) ? (a >> count) : 0); -} -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(7,0,0)) && (!defined(SIMDE_DETECT_CLANG_VERSION) && SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) - #define simde_kshiftri_mask8(a, count) _kshiftri_mask8(a, count) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _kshiftri_mask8 - #define _kshiftri_mask8(a, count) simde_kshiftri_mask8(a, count) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_KSHIFT_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/kxor.h b/ffi-deps/simde/simde/x86/avx512/kxor.h deleted file mode 100644 index 45f5d04..0000000 --- a/ffi-deps/simde/simde/x86/avx512/kxor.h +++ /dev/null @@ -1,107 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2023 Michael R. Crusoe - */ - -#if !defined(SIMDE_X86_AVX512_KXOR_H) -#define SIMDE_X86_AVX512_KXOR_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_kxor_mask8 (simde__mmask8 a, simde__mmask8 b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) \ - && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - return _kxor_mask8(a, b); - #else - return a^b; - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _kxor_mask8 - #define _kxor_mask8(a, b) simde_kxor_mask8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_kxor_mask16 (simde__mmask16 a, simde__mmask16 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) \ - && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - return _kxor_mask16(a, b); - #else - return a^b; - #endif -} -#define simde_mm512_kxor(a, b) simde_kxor_mask16(a, b) -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _kxor_mask16 - #undef _mm512_kxor - #define _kxor_mask16(a, b) simde_kxor_mask16(a, b) - #define _mm512_kxor(a, b) simde_kxor_mask16(a, b) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_kxor_mask32 (simde__mmask32 a, simde__mmask32 b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) \ - && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - return _kxor_mask32(a, b); - #else - return a^b; - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _kxor_mask32 - #define _kxor_mask32(a, b) simde_kxor_mask32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_kxor_mask64 (simde__mmask64 a, simde__mmask64 b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) \ - && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - return _kxor_mask64(a, b); - #else - return a^b; - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _kxor_mask64 - #define _kxor_mask64(a, b) simde_kxor_mask64(a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_KXOR_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/load.h b/ffi-deps/simde/simde/x86/avx512/load.h deleted file mode 100644 index 6a4af93..0000000 --- a/ffi-deps/simde/simde/x86/avx512/load.h +++ /dev/null @@ -1,115 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - */ - -#if !defined(SIMDE_X86_AVX512_LOAD_H) -#define SIMDE_X86_AVX512_LOAD_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_load_pd (void const * mem_addr) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_load_pd(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512d)); - #else - simde__m512d r; - simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512d), sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_load_pd - #define _mm512_load_pd(a) simde_mm512_load_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_load_ps (void const * mem_addr) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_load_ps(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512)); - #else - simde__m512 r; - simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512), sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_load_ps - #define _mm512_load_ps(a) simde_mm512_load_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512h -simde_mm512_load_ph (void const * mem_addr) { - #if defined(SIMDE_X86_AVX512FP16_NATIVE) - return _mm512_load_ph(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512h)); - #else - simde__m512h r; - simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512h), sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) - #undef _mm512_load_ph - #define _mm512_load_ph(a) simde_mm512_load_ph(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_load_si512 (void const * mem_addr) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_load_si512(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512i)); - #else - simde__m512i r; - simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512i), sizeof(r)); - return r; - #endif -} -#define simde_mm512_load_epi8(mem_addr) simde_mm512_load_si512(mem_addr) -#define simde_mm512_load_epi16(mem_addr) simde_mm512_load_si512(mem_addr) -#define simde_mm512_load_epi32(mem_addr) simde_mm512_load_si512(mem_addr) -#define simde_mm512_load_epi64(mem_addr) simde_mm512_load_si512(mem_addr) -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_load_epi8 - #undef _mm512_load_epi16 - #undef _mm512_load_epi32 - #undef _mm512_load_epi64 - #undef _mm512_load_si512 - #define _mm512_load_si512(a) simde_mm512_load_si512(a) - #define _mm512_load_epi8(a) simde_mm512_load_si512(a) - #define _mm512_load_epi16(a) simde_mm512_load_si512(a) - #define _mm512_load_epi32(a) simde_mm512_load_si512(a) - #define _mm512_load_epi64(a) simde_mm512_load_si512(a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_LOAD_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/loadu.h b/ffi-deps/simde/simde/x86/avx512/loadu.h deleted file mode 100644 index 4a31966..0000000 --- a/ffi-deps/simde/simde/x86/avx512/loadu.h +++ /dev/null @@ 
-1,297 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - */ - -#if !defined(SIMDE_X86_AVX512_LOADU_H) -#define SIMDE_X86_AVX512_LOADU_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_loadu_ps (void const * mem_addr) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - #if defined(SIMDE_BUG_CLANG_REV_298042) - return _mm512_loadu_ps(SIMDE_ALIGN_CAST(const float *, mem_addr)); - #else - return _mm512_loadu_ps(mem_addr); - #endif - #else - simde__m512 r; - simde_memcpy(&r, mem_addr, sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_loadu_ps - #define _mm512_loadu_ps(a) simde_mm512_loadu_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_loadu_pd (void const * mem_addr) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - #if defined(SIMDE_BUG_CLANG_REV_298042) - return _mm512_loadu_pd(SIMDE_ALIGN_CAST(const double *, mem_addr)); - #else - return _mm512_loadu_pd(mem_addr); - #endif - #else - simde__m512d r; - simde_memcpy(&r, mem_addr, sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_loadu_pd - #define _mm512_loadu_pd(a) simde_mm512_loadu_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512h -simde_mm512_loadu_ph (void const * mem_addr) { - #if defined(SIMDE_X86_AVX512FP16_NATIVE) - return _mm512_loadu_ph(mem_addr); - #else - simde__m512h r; - simde_memcpy(&r, mem_addr, sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) - #undef _mm512_loadu_ph - #define _mm512_loadu_ph(a) simde_mm512_loadu_ph(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_loadu_si512 (void const * mem_addr) { - simde__m512i r; - #if HEDLEY_GNUC_HAS_ATTRIBUTE(may_alias,3,3,0) - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_PACKED_ - struct simde_mm512_loadu_si512_s { - __typeof__(r) v; - } __attribute__((__packed__, __may_alias__)); - r = HEDLEY_REINTERPRET_CAST(const struct simde_mm512_loadu_si512_s *, mem_addr)->v; - HEDLEY_DIAGNOSTIC_POP - #else - simde_memcpy(&r, mem_addr, sizeof(r)); - #endif - - return r; -} -#if defined(SIMDE_X86_AVX512F_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(10,0,0)) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) - #define 
simde_mm512_loadu_si512(mem_addr) _mm512_loadu_si512(mem_addr) - #define simde_mm512_loadu_epi32(mem_addr) _mm512_loadu_epi32(mem_addr) - #define simde_mm512_loadu_epi64(mem_addr) _mm512_loadu_epi64(mem_addr) -#else - #define simde_mm512_loadu_epi32(mem_addr) simde_mm512_loadu_si512(mem_addr) - #define simde_mm512_loadu_epi64(mem_addr) simde_mm512_loadu_si512(mem_addr) -#endif -#if defined(SIMDE_X86_AVX512BW_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(11,0,0)) && (!defined(__clang__) || SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0)) - #define simde_mm512_loadu_epi8(mem_addr) _mm512_loadu_epi8(mem_addr) - #define simde_mm512_loadu_epi16(mem_addr) _mm512_loadu_epi16(mem_addr) -#else - #define simde_mm512_loadu_epi8(mem_addr) simde_mm512_loadu_si512(mem_addr) - #define simde_mm512_loadu_epi16(mem_addr) simde_mm512_loadu_si512(mem_addr) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_loadu_epi8 - #undef _mm512_loadu_epi16 - #define _mm512_loadu_epi8(a) simde_mm512_loadu_epi8(a) - #define _mm512_loadu_epi16(a) simde_mm512_loadu_epi16(a) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_loadu_epi32 - #undef _mm512_loadu_epi64 - #undef _mm512_loadu_si512 - #define _mm512_loadu_si512(a) simde_mm512_loadu_si512(a) - #define _mm512_loadu_epi32(a) simde_mm512_loadu_epi32(a) - #define _mm512_loadu_epi64(a) simde_mm512_loadu_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_loadu_epi16 (simde__mmask16 k, void const * mem_addr) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_loadu_epi16(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); - #else - return simde_mm256_maskz_mov_epi16(k, simde_mm256_loadu_epi16(mem_addr)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_loadu_epi16 - #define _mm256_maskz_loadu_epi16(k, mem_addr) simde_mm256_maskz_loadu_epi16(k, mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_maskz_loadu_ps (simde__mmask8 k, void const * mem_addr) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_loadu_ps(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); - #else - return simde_mm256_maskz_mov_ps(k, simde_mm256_loadu_ps(HEDLEY_REINTERPRET_CAST(const float*, mem_addr))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_loadu_ps - #define _mm256_maskz_loadu_ps(k, mem_addr) simde_mm256_maskz_loadu_ps(k, mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_loadu_epi16 (simde__m512i src, simde__mmask32 k, void const * mem_addr) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_loadu_epi16(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); - #else - return simde_mm512_mask_mov_epi16(src, k, simde_mm512_loadu_epi16(mem_addr)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_loadu_epi16 - #define _mm512_mask_loadu_epi16(src, k, mem_addr) simde_mm512_mask_loadu_epi16(src, k, mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_loadu_epi16 (simde__mmask32 k, void const * mem_addr) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_loadu_epi16(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); - #else - return simde_mm512_maskz_mov_epi16(k, 
simde_mm512_loadu_epi16(mem_addr)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_loadu_epi16 - #define _mm512_maskz_loadu_epi16(k, mem_addr) simde_mm512_maskz_loadu_epi16(k, mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_loadu_epi32 (simde__m512i src, simde__mmask16 k, void const * mem_addr) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_loadu_epi32(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_loadu_epi32(mem_addr)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_loadu_epi32 - #define _mm512_mask_loadu_epi32(src, k, mem_addr) simde_mm512_mask_loadu_epi32(src, k, mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_loadu_epi64 (simde__m512i src, simde__mmask8 k, void const * mem_addr) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_loadu_epi64(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_loadu_epi64(mem_addr)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_loadu_epi64 - #define _mm512_mask_loadu_epi64(src, k, mem_addr) simde_mm512_mask_loadu_epi64(src, k, mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_loadu_epi64 (simde__mmask8 k, void const * mem_addr) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_loadu_epi64(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_loadu_epi64(mem_addr)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_loadu_epi64 - #define _mm512_maskz_loadu_epi64(k, mem_addr) simde_mm512_maskz_loadu_epi64((k), (mem_addr)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_loadu_pd (simde__m512d src, simde__mmask8 k, void const * mem_addr) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_loadu_pd(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_loadu_pd(mem_addr)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_loadu_pd - #define _mm512_mask_loadu_pd(src, k, mem_addr) simde_mm512_mask_loadu_pd(src, k, mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_loadu_ps (simde__m512 src, simde__mmask16 k, void const * mem_addr) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_loadu_ps(src, k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_loadu_ps(mem_addr)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_loadu_ps - #define _mm512_mask_loadu_ps(src, k, mem_addr) simde_mm512_mask_loadu_ps(src, k, mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_loadu_ps (simde__mmask16 k, void const * mem_addr) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_loadu_ps(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_loadu_ps(mem_addr)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_loadu_ps - #define _mm512_maskz_loadu_ps(k, mem_addr) simde_mm512_maskz_loadu_ps(k, mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_loadu_pd (simde__mmask8 k, void 
const * mem_addr) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_loadu_pd(k, HEDLEY_REINTERPRET_CAST(void const*, mem_addr)); - #else - return simde_mm512_maskz_mov_pd(k, simde_mm512_loadu_pd(mem_addr)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_loadu_pd - #define _mm512_maskz_loadu_pd(k, mem_addr) simde_mm512_maskz_loadu_pd(k, mem_addr) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_LOADU_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/lzcnt.h b/ffi-deps/simde/simde/x86/avx512/lzcnt.h deleted file mode 100644 index 41a0eec..0000000 --- a/ffi-deps/simde/simde/x86/avx512/lzcnt.h +++ /dev/null @@ -1,220 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - */ - -#if !defined(SIMDE_X86_AVX512_LZCNT_H) -#define SIMDE_X86_AVX512_LZCNT_H - -#include "types.h" -#include "mov.h" -#if HEDLEY_MSVC_VERSION_CHECK(14,0,0) -#include <intrin.h> -#pragma intrinsic(_BitScanReverse) - #if defined(_M_AMD64) || defined(_M_ARM64) - #pragma intrinsic(_BitScanReverse64) - #endif -#endif - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -#if \ - ( HEDLEY_HAS_BUILTIN(__builtin_clz) || \ - HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ - HEDLEY_ARM_VERSION_CHECK(4,1,0) ) && \ - defined(__INT_MAX__) && defined(__LONG_MAX__) && defined(__LONG_LONG_MAX__) && \ - defined(__INT32_MAX__) && defined(__INT64_MAX__) - #if __INT_MAX__ == __INT32_MAX__ - #define simde_x_clz32(v) __builtin_clz(HEDLEY_STATIC_CAST(unsigned int, (v))) - #elif __LONG_MAX__ == __INT32_MAX__ - #define simde_x_clz32(v) __builtin_clzl(HEDLEY_STATIC_CAST(unsigned long, (v))) - #elif __LONG_LONG_MAX__ == __INT32_MAX__ - #define simde_x_clz32(v) __builtin_clzll(HEDLEY_STATIC_CAST(unsigned long long, (v))) - #endif - - #if __INT_MAX__ == __INT64_MAX__ - #define simde_x_clz64(v) __builtin_clz(HEDLEY_STATIC_CAST(unsigned int, (v))) - #elif __LONG_MAX__ == __INT64_MAX__ - #define simde_x_clz64(v) __builtin_clzl(HEDLEY_STATIC_CAST(unsigned long, (v))) - #elif __LONG_LONG_MAX__ == __INT64_MAX__ - #define simde_x_clz64(v) __builtin_clzll(HEDLEY_STATIC_CAST(unsigned long long, (v))) - #endif -#elif HEDLEY_MSVC_VERSION_CHECK(14,0,0) - static int simde_x_clz32(uint32_t x) { - unsigned long r; - _BitScanReverse(&r, x); - return 31 - HEDLEY_STATIC_CAST(int, r); - } - #define simde_x_clz32 simde_x_clz32 - - static int simde_x_clz64(uint64_t x) { - unsigned long r; - - #if defined(_M_AMD64) || defined(_M_ARM64) - _BitScanReverse64(&r, x); - return 63 - HEDLEY_STATIC_CAST(int, r); - #else - uint32_t high = HEDLEY_STATIC_CAST(uint32_t, x >> 32); - if (high != 0) - return _BitScanReverse(&r, HEDLEY_STATIC_CAST(unsigned long, high)); - else - return _BitScanReverse(&r, HEDLEY_STATIC_CAST(unsigned long, x & ~UINT32_C(0))) + 32; - #endif - } - #define simde_x_clz64 simde_x_clz64 -#endif - -#if !defined(simde_x_clz32) || !defined(simde_x_clz64) - static uint8_t simde_x_avx512cd_lz_lookup(const uint8_t value) { - static const uint8_t lut[256] = { - 7, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - }; - return lut[value]; - }; - - #if !defined(simde_x_clz32) - static int simde_x_clz32(uint32_t x) { - size_t s = sizeof(x) * 8; - uint32_t r; - - while ((s -= 8) != 0) { - r = x >> s; - if (r != 0) - return simde_x_avx512cd_lz_lookup(HEDLEY_STATIC_CAST(uint8_t, r)) + - (((sizeof(x) - 1) * 8) - s); - } - - if (x == 0) - return (int) ((sizeof(x) * 8) - 1); - else - return simde_x_avx512cd_lz_lookup(HEDLEY_STATIC_CAST(uint8_t, x)) + -
((sizeof(x) - 1) * 8); - } - #endif - - #if !defined(simde_x_clz64) - static int simde_x_clz64(uint64_t x) { - size_t s = sizeof(x) * 8; - uint64_t r; - - while ((s -= 8) != 0) { - r = x >> s; - if (r != 0) - return simde_x_avx512cd_lz_lookup(HEDLEY_STATIC_CAST(uint8_t, r)) + - (((sizeof(x) - 1) * 8) - s); - } - - if (x == 0) - return (int) ((sizeof(x) * 8) - 1); - else - return simde_x_avx512cd_lz_lookup(HEDLEY_STATIC_CAST(uint8_t, x)) + - ((sizeof(x) - 1) * 8); - } - #endif -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_lzcnt_epi32(simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm_lzcnt_epi32(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - /* https://stackoverflow.com/a/58827596/501126 */ - a = _mm_andnot_si128(_mm_srli_epi32(a, 8), a); - a = _mm_castps_si128(_mm_cvtepi32_ps(a)); - a = _mm_srli_epi32(a, 23); - a = _mm_subs_epu16(_mm_set1_epi32(158), a); - a = _mm_min_epi16(a, _mm_set1_epi32(32)); - return a; - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u32 = vec_cntlz(a_.altivec_u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0; i < (sizeof(r_.i32) / sizeof(r_.i32[0])); i++) { - r_.i32[i] = (HEDLEY_UNLIKELY(a_.i32[i] == 0) ? HEDLEY_STATIC_CAST(int32_t, sizeof(int32_t) * CHAR_BIT) : HEDLEY_STATIC_CAST(int32_t, simde_x_clz32(HEDLEY_STATIC_CAST(uint32_t, a_.i32[i])))); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512CD_ENABLE_NATIVE_ALIASES) - #undef _mm_lzcnt_epi32 - #define _mm_lzcnt_epi32(a) simde_mm_lzcnt_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_lzcnt_epi32(simde__m128i src, simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm_mask_lzcnt_epi32(src, k, a); - #else - return simde_mm_mask_mov_epi32(src, k, simde_mm_lzcnt_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_lzcnt_epi32 - #define _mm_mask_lzcnt_epi32(src, k, a) simde_mm_mask_lzcnt_epi32(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_lzcnt_epi32(simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512CD_NATIVE) - return _mm_maskz_lzcnt_epi32(k, a); - #else - return simde_mm_maskz_mov_epi32(k, simde_mm_lzcnt_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_lzcnt_epi32 - #define _mm_maskz_lzcnt_epi32(k, a) simde_mm_maskz_lzcnt_epi32(k, a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_LZCNT_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/madd.h b/ffi-deps/simde/simde/x86/avx512/madd.h deleted file mode 100644 index 547d71c..0000000 --- a/ffi-deps/simde/simde/x86/avx512/madd.h +++ /dev/null @@ -1,157 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above 
copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Ashleigh Newman-Jones - */ - -#if !defined(SIMDE_X86_AVX512_MADD_H) -#define SIMDE_X86_AVX512_MADD_H - -#include "types.h" -#include "mov.h" -#include "../avx2.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_madd_epi16 (simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_madd_epi16(src, k, a, b); - #else - return simde_mm_mask_mov_epi32(src, k, simde_mm_madd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_madd_epi16 - #define _mm_mask_madd_epi16(src, k, a, b) simde_mm_mask_madd_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_madd_epi16 (simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_madd_epi16(k, a, b); - #else - return simde_mm_maskz_mov_epi32(k, simde_mm_madd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_madd_epi16 - #define _mm_maskz_madd_epi16(k, a, b) simde_mm_maskz_madd_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_madd_epi16 (simde__m256i src, simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_madd_epi16(src, k, a, b); - #else - return simde_mm256_mask_mov_epi32(src, k, simde_mm256_madd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_madd_epi16 - #define _mm256_mask_madd_epi16(src, k, a, b) simde_mm256_mask_madd_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_madd_epi16 (simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_madd_epi16(k, a, b); - #else - return simde_mm256_maskz_mov_epi32(k, simde_mm256_madd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_madd_epi16 - #define _mm256_maskz_madd_epi16(k, a, b) simde_mm256_maskz_madd_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_madd_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_madd_epi16(a, b); - #else - simde__m512i_private r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if 0 && SIMDE_NATURAL_VECTOR_SIZE_LE(256) || 
defined(SIMDE_BUG_CLANG_BAD_MADD) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_madd_epi16(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) { - r_.i32[i / 2] = - (HEDLEY_STATIC_CAST(int32_t, a_.i16[ i ]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[ i ])) + - (HEDLEY_STATIC_CAST(int32_t, a_.i16[i + 1]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i + 1])); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_madd_epi16 - #define _mm512_madd_epi16(a, b) simde_mm512_madd_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_madd_epi16 (simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_madd_epi16(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_madd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_madd_epi16 - #define _mm512_mask_madd_epi16(src, k, a, b) simde_mm512_mask_madd_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_madd_epi16 (simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_madd_epi16(k, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_madd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_madd_epi16 - #define _mm512_maskz_madd_epi16(k, a, b) simde_mm512_maskz_madd_epi16(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_MADD_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/maddubs.h b/ffi-deps/simde/simde/x86/avx512/maddubs.h deleted file mode 100644 index 43b5594..0000000 --- a/ffi-deps/simde/simde/x86/avx512/maddubs.h +++ /dev/null @@ -1,159 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Ashleigh Newman-Jones - */ - -#if !defined(SIMDE_X86_AVX512_MADDUBS_H) -#define SIMDE_X86_AVX512_MADDUBS_H - -#include "types.h" -#include "mov.h" -#include "../avx2.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_maddubs_epi16 (simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_maddubs_epi16(src, k, a, b); - #else - return simde_mm_mask_mov_epi16(src, k, simde_mm_maddubs_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_maddubs_epi16 - #define _mm_mask_maddubs_epi16(src, k, a, b) simde_mm_mask_maddubs_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_maddubs_epi16 (simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE ) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_maddubs_epi16(k, a, b); - #else - return simde_mm_maskz_mov_epi16(k, simde_mm_maddubs_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_maddubs_epi16 - #define _mm_maskz_maddubs_epi16(k, a, b) simde_mm_maskz_maddubs_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_maddubs_epi16 (simde__m256i src, simde__mmask16 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_maddubs_epi16(src, k, a, b); - #else - return simde_mm256_mask_mov_epi16(src, k, simde_mm256_maddubs_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_maddubs_epi16 - #define _mm256_mask_maddubs_epi16(src, k, a, b) simde_mm256_mask_maddubs_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_maddubs_epi16 (simde__mmask16 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_maddubs_epi16(k, a, b); - #else - return simde_mm256_maskz_mov_epi16(k, simde_mm256_maddubs_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_maddubs_epi16 - #define _mm256_maskz_maddubs_epi16(k, a, b) simde_mm256_maskz_maddubs_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maddubs_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maddubs_epi16(a, b); - #else - simde__m512i_private r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) || defined(SIMDE_BUG_CLANG_BAD_MADD) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_maddubs_epi16(a_.m256i[i], b_.m256i[i]); - } - #else - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - const int idx = HEDLEY_STATIC_CAST(int, i) << 1; - int32_t ts = - (HEDLEY_STATIC_CAST(int16_t, a_.u8[ idx ]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[ idx ])) + - (HEDLEY_STATIC_CAST(int16_t, a_.u8[idx + 1]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[idx + 1])); - r_.i16[i] = (ts > INT16_MIN) ? 
((ts < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ts) : INT16_MAX) : INT16_MIN; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maddubs_epi16 - #define _mm512_maddubs_epi16(a, b) simde_mm512_maddubs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_maddubs_epi16 (simde__m512i src, simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_maddubs_epi16(src, k, a, b); - #else - return simde_mm512_mask_mov_epi16(src, k, simde_mm512_maddubs_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_maddubs_epi16 - #define _mm512_mask_maddubs_epi16(src, k, a, b) simde_mm512_mask_maddubs_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_maddubs_epi16 (simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_maddubs_epi16(k, a, b); - #else - return simde_mm512_maskz_mov_epi16(k, simde_mm512_maddubs_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_maddubs_epi16 - #define _mm512_maskz_maddubs_epi16(k, a, b) simde_mm512_maskz_maddubs_epi16(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_MADDUBS_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/max.h b/ffi-deps/simde/simde/x86/avx512/max.h deleted file mode 100644 index 29ef0b3..0000000 --- a/ffi-deps/simde/simde/x86/avx512/max.h +++ /dev/null @@ -1,611 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_MAX_H) -#define SIMDE_X86_AVX512_MAX_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_max_epi8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_max_epi8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? 
a_.i8[i] : b_.i8[i]; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) -# define _mm512_max_epi8(a, b) simde_mm512_max_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_max_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_max_epi8(src, k, a, b); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_max_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_max_epi8 - #define _mm512_mask_max_epi8(src, k, a, b) simde_mm512_mask_max_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_max_epi8 (simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_max_epi8(k, a, b); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_max_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_max_epi8 - #define _mm512_maskz_max_epi8(k, a, b) simde_mm512_maskz_max_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_max_epu8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_max_epu8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(a_.m256i) / sizeof(a_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_max_epu8(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_max_epu8 - #define _mm512_max_epu8(a, b) simde_mm512_max_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_max_epu8 (simde__m512i src, simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_max_epu8(src, k, a, b); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_max_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_max_epu8 - #define _mm512_mask_max_epu8(src, k, a, b) simde_mm512_mask_max_epu8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_max_epu8 (simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_max_epu8(k, a, b); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_max_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_max_epu8 - #define _mm512_maskz_max_epu8(k, a, b) simde_mm512_maskz_max_epu8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_max_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_max_epi16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? 
a_.i16[i] : b_.i16[i]; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) -# define _mm512_max_epi16(a, b) simde_mm512_max_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_max_epi16 (simde__m512i src, simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_max_epi16(src, k, a, b); - #else - return simde_mm512_mask_mov_epi16(src, k, simde_mm512_max_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_max_epi16 - #define _mm512_mask_max_epi16(src, k, a, b) simde_mm512_mask_max_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_max_epi16 (simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_max_epi16(k, a, b); - #else - return simde_mm512_maskz_mov_epi16(k, simde_mm512_max_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_max_epi16 - #define _mm512_maskz_max_epi16(k, a, b) simde_mm512_maskz_max_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_max_epu16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_max_epu16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(a_.m256i) / sizeof(a_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_max_epu16(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] > b_.u16[i]) ? 
a_.u16[i] : b_.u16[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_max_epu16 - #define _mm512_max_epu16(a, b) simde_mm512_max_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_max_epu16 (simde__m512i src, simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_max_epu16(src, k, a, b); - #else - return simde_mm512_mask_mov_epi16(src, k, simde_mm512_max_epu16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_max_epu16 - #define _mm512_mask_max_epu16(src, k, a, b) simde_mm512_mask_max_epu16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_max_epu16 (simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_max_epu16(k, a, b); - #else - return simde_mm512_maskz_mov_epi16(k, simde_mm512_max_epu16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_max_epu16 - #define _mm512_maskz_max_epu16(k, a, b) simde_mm512_maskz_max_epu16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_max_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_max_epi32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_X86_AVX2_NATIVE) - r_.m256i[0] = simde_mm256_max_epi32(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_max_epi32(a_.m256i[1], b_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] > b_.i32[i] ? 
a_.i32[i] : b_.i32[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_max_epi32 - #define _mm512_max_epi32(a, b) simde_mm512_max_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_max_epi32(simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_max_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_max_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_max_epi32 - #define _mm512_mask_max_epi32(src, k, a, b) simde_mm512_mask_max_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_max_epi32(simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_max_epi32(k, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_max_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_max_epi32 - #define _mm512_maskz_max_epi32(k, a, b) simde_mm512_maskz_max_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_max_epu32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_max_epu32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_X86_AVX2_NATIVE) - r_.m256i[0] = simde_mm256_max_epu32(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_max_epu32(a_.m256i[1], b_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] > b_.u32[i]) ? a_.u32[i] : b_.u32[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_max_epu32 - #define _mm512_max_epu32(a, b) simde_mm512_max_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_max_epu32(simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_max_epu32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_max_epu32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_max_epu32 - #define _mm512_mask_max_epu32(src, k, a, b) simde_mm512_mask_max_epu32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_max_epu32(simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_max_epu32(k, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_max_epu32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_max_epu32 - #define _mm512_maskz_max_epu32(k, a, b) simde_mm512_maskz_max_epu32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_max_epi64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_max_epi64(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] > b_.i64[i] ? 
a_.i64[i] : b_.i64[i]; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_max_epi64 - #define _mm512_max_epi64(a, b) simde_mm512_max_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_max_epi64(simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_max_epi64(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_max_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_max_epi64 - #define _mm512_mask_max_epi64(src, k, a, b) simde_mm512_mask_max_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_max_epi64(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_max_epi64(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_max_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_max_epi64 - #define _mm512_maskz_max_epi64(k, a, b) simde_mm512_maskz_max_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_max_epu64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_max_epu64(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] > b_.u64[i]) ? a_.u64[i] : b_.u64[i]; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_max_epu64 - #define _mm512_max_epu64(a, b) simde_mm512_max_epu64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_max_epu64(simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_max_epu64(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_max_epu64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_max_epu64 - #define _mm512_mask_max_epu64(src, k, a, b) simde_mm512_mask_max_epu64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_max_epu64(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_max_epu64(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_max_epu64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_max_epu64 - #define _mm512_maskz_max_epu64(k, a, b) simde_mm512_maskz_max_epu64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_max_ps (simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_max_ps(a, b); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b); - - #if defined(SIMDE_X86_AVX2_NATIVE) - r_.m256[0] = simde_mm256_max_ps(a_.m256[0], b_.m256[0]); - r_.m256[1] = simde_mm256_max_ps(a_.m256[1], b_.m256[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i] > b_.f32[i] ? 
a_.f32[i] : b_.f32[i]; - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_max_ps - #define _mm512_max_ps(a, b) simde_mm512_max_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_max_ps(simde__m512 src, simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_max_ps(src, k, a, b); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_max_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_max_ps - #define _mm512_mask_max_ps(src, k, a, b) simde_mm512_mask_max_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_max_ps(simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_max_ps(k, a, b); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_max_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_max_ps - #define _mm512_maskz_max_ps(k, a, b) simde_mm512_maskz_max_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_max_pd (simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_max_pd(a, b); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[i] > b_.f64[i] ? a_.f64[i] : b_.f64[i]; - } - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_max_pd - #define _mm512_max_pd(a, b) simde_mm512_max_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512h -simde_mm512_max_ph (simde__m512h a, simde__m512h b) { - #if defined(SIMDE_X86_AVX512FP16_NATIVE) - return _mm512_max_ph(a, b); - #else - simde__m512h_private - r_, - a_ = simde__m512h_to_private(a), - b_ = simde__m512h_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { - r_.f16[i] = simde_float16_to_float32(a_.f16[i]) > simde_float16_to_float32(b_.f16[i]) ? 
a_.f16[i] : b_.f16[i]; - } - - return simde__m512h_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) - #undef _mm512_max_ph - #define _mm512_max_ph(a, b) simde_mm512_max_ph(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_max_pd(simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_max_pd(src, k, a, b); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_max_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_max_pd - #define _mm512_mask_max_pd(src, k, a, b) simde_mm512_mask_max_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_max_pd(simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_max_pd(k, a, b); - #else - return simde_mm512_maskz_mov_pd(k, simde_mm512_max_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_max_pd - #define _mm512_maskz_max_pd(k, a, b) simde_mm512_maskz_max_pd(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_MAX_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/min.h b/ffi-deps/simde/simde/x86/avx512/min.h deleted file mode 100644 index 2e1dd84..0000000 --- a/ffi-deps/simde/simde/x86/avx512/min.h +++ /dev/null @@ -1,611 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_MIN_H) -#define SIMDE_X86_AVX512_MIN_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_min_epi8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_min_epi8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? 
a_.i8[i] : b_.i8[i]; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) -# define _mm512_min_epi8(a, b) simde_mm512_min_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_min_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_min_epi8(src, k, a, b); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_min_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_min_epi8 - #define _mm512_mask_min_epi8(src, k, a, b) simde_mm512_mask_min_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_min_epi8 (simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_min_epi8(k, a, b); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_min_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_min_epi8 - #define _mm512_maskz_min_epi8(k, a, b) simde_mm512_maskz_min_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_min_epu8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_min_epu8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(a_.m256i) / sizeof(a_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_min_epu8(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_min_epu8 - #define _mm512_min_epu8(a, b) simde_mm512_min_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_min_epu8 (simde__m512i src, simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_min_epu8(src, k, a, b); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_min_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_min_epu8 - #define _mm512_mask_min_epu8(src, k, a, b) simde_mm512_mask_min_epu8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_min_epu8 (simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_min_epu8(k, a, b); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_min_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_min_epu8 - #define _mm512_maskz_min_epu8(k, a, b) simde_mm512_maskz_min_epu8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_min_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_min_epi16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? 
a_.i16[i] : b_.i16[i]; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) -# define _mm512_min_epi16(a, b) simde_mm512_min_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_min_epi16 (simde__m512i src, simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_min_epi16(src, k, a, b); - #else - return simde_mm512_mask_mov_epi16(src, k, simde_mm512_min_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_min_epi16 - #define _mm512_mask_min_epi16(src, k, a, b) simde_mm512_mask_min_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_min_epi16 (simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_min_epi16(k, a, b); - #else - return simde_mm512_maskz_mov_epi16(k, simde_mm512_min_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_min_epi16 - #define _mm512_maskz_min_epi16(k, a, b) simde_mm512_maskz_min_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_min_epu16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_min_epu16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(a_.m256i) / sizeof(a_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_min_epu16(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] < b_.u16[i]) ? 
a_.u16[i] : b_.u16[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_min_epu16 - #define _mm512_min_epu16(a, b) simde_mm512_min_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_min_epu16 (simde__m512i src, simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_min_epu16(src, k, a, b); - #else - return simde_mm512_mask_mov_epi16(src, k, simde_mm512_min_epu16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_min_epu16 - #define _mm512_mask_min_epu16(src, k, a, b) simde_mm512_mask_min_epu16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_min_epu16 (simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_min_epu16(k, a, b); - #else - return simde_mm512_maskz_mov_epi16(k, simde_mm512_min_epu16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_min_epu16 - #define _mm512_maskz_min_epu16(k, a, b) simde_mm512_maskz_min_epu16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_min_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_min_epi32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_X86_AVX2_NATIVE) - r_.m256i[0] = simde_mm256_min_epi32(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_min_epi32(a_.m256i[1], b_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] < b_.i32[i] ? 
a_.i32[i] : b_.i32[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_min_epi32 - #define _mm512_min_epi32(a, b) simde_mm512_min_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_min_epi32(simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_min_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_min_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_min_epi32 - #define _mm512_mask_min_epi32(src, k, a, b) simde_mm512_mask_min_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i - simde_mm512_maskz_min_epi32(simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_min_epi32(k, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_min_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_min_epi32 - #define _mm512_maskz_min_epi32(k, a, b) simde_mm512_maskz_min_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_min_epu32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_min_epu32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_X86_AVX2_NATIVE) - r_.m256i[0] = simde_mm256_min_epu32(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_min_epu32(a_.m256i[1], b_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] < b_.u32[i]) ? a_.u32[i] : b_.u32[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_min_epu32 - #define _mm512_min_epu32(a, b) simde_mm512_min_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_min_epu32(simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_min_epu32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_min_epu32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_min_epu32 - #define _mm512_mask_min_epu32(src, k, a, b) simde_mm512_mask_min_epu32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_min_epu32(simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_min_epu32(k, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_min_epu32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_min_epu32 - #define _mm512_maskz_min_epu32(k, a, b) simde_mm512_maskz_min_epu32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_min_epi64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_min_epi64(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] < b_.i64[i] ? 
a_.i64[i] : b_.i64[i]; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_min_epi64 - #define _mm512_min_epi64(a, b) simde_mm512_min_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_min_epi64(simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_min_epi64(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_min_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_min_epi64 - #define _mm512_mask_min_epi64(src, k, a, b) simde_mm512_mask_min_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_min_epi64(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_min_epi64(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_min_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_min_epi64 - #define _mm512_maskz_min_epi64(k, a, b) simde_mm512_maskz_min_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_min_epu64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_min_epu64(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] < b_.u64[i]) ? a_.u64[i] : b_.u64[i]; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_min_epu64 - #define _mm512_min_epu64(a, b) simde_mm512_min_epu64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_min_epu64(simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_min_epu64(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_min_epu64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_min_epu64 - #define _mm512_mask_min_epu64(src, k, a, b) simde_mm512_mask_min_epu64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_min_epu64(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_min_epu64(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_min_epu64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_min_epu64 - #define _mm512_maskz_min_epu64(k, a, b) simde_mm512_maskz_min_epu64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_min_ps (simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_min_ps(a, b); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b); - - #if defined(SIMDE_X86_AVX2_NATIVE) - r_.m256[0] = simde_mm256_min_ps(a_.m256[0], b_.m256[0]); - r_.m256[1] = simde_mm256_min_ps(a_.m256[1], b_.m256[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i] < b_.f32[i] ? 
a_.f32[i] : b_.f32[i]; - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_min_ps - #define _mm512_min_ps(a, b) simde_mm512_min_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_min_ps(simde__m512 src, simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_min_ps(src, k, a, b); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_min_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_min_ps - #define _mm512_mask_min_ps(src, k, a, b) simde_mm512_mask_min_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_min_ps(simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_min_ps(k, a, b); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_min_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_min_ps - #define _mm512_maskz_min_ps(k, a, b) simde_mm512_maskz_min_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_min_pd (simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_min_pd(a, b); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[i] < b_.f64[i] ? a_.f64[i] : b_.f64[i]; - } - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_min_pd - #define _mm512_min_pd(a, b) simde_mm512_min_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_min_pd(simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_min_pd(src, k, a, b); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_min_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_min_pd - #define _mm512_mask_min_pd(src, k, a, b) simde_mm512_mask_min_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_min_pd(simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_min_pd(k, a, b); - #else - return simde_mm512_maskz_mov_pd(k, simde_mm512_min_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_min_pd - #define _mm512_maskz_min_pd(k, a, b) simde_mm512_maskz_min_pd(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512h -simde_mm512_min_ph (simde__m512h a, simde__m512h b) { - #if defined(SIMDE_X86_AVX512FP16_NATIVE) - return _mm512_min_ph(a, b); - #else - simde__m512h_private - r_, - a_ = simde__m512h_to_private(a), - b_ = simde__m512h_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { - r_.f16[i] = simde_float16_to_float32(a_.f16[i]) < simde_float16_to_float32(b_.f16[i]) ? 
a_.f16[i] : b_.f16[i]; - } - - return simde__m512h_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) - #undef _mm512_min_ph - #define _mm512_min_ph(a, b) simde_mm512_min_ph(a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_MIN_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/mov.h b/ffi-deps/simde/simde/x86/avx512/mov.h deleted file mode 100644 index cee9dbb..0000000 --- a/ffi-deps/simde/simde/x86/avx512/mov.h +++ /dev/null @@ -1,865 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Christopher Moore - */ - -#if !defined(SIMDE_X86_AVX512_MOV_H) -#define SIMDE_X86_AVX512_MOV_H - -#include "types.h" -#include "cast.h" -#include "set.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_mov_epi8 (simde__m128i src, simde__mmask16 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_mov_epi8(src, k, a); - #else - simde__m128i_private - src_ = simde__m128i_to_private(src), - a_ = simde__m128i_to_private(a), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = ((k >> i) & 1) ? a_.i8[i] : src_.i8[i]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_mov_epi8 - #define _mm_mask_mov_epi8(src, k, a) simde_mm_mask_mov_epi8(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_mov_epi16 (simde__m128i src, simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_mov_epi16(src, k, a); - #else - simde__m128i_private - src_ = simde__m128i_to_private(src), - a_ = simde__m128i_to_private(a), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = ((k >> i) & 1) ? 
a_.i16[i] : src_.i16[i]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_mov_epi16 - #define _mm_mask_mov_epi16(src, k, a) simde_mm_mask_mov_epi16(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_mov_epi32 (simde__m128i src, simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_mov_epi32(src, k, a); - #else - simde__m128i_private - src_ = simde__m128i_to_private(src), - a_ = simde__m128i_to_private(a), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ((k >> i) & 1) ? a_.i32[i] : src_.i32[i]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_mov_epi32 - #define _mm_mask_mov_epi32(src, k, a) simde_mm_mask_mov_epi32(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_mov_epi64 (simde__m128i src, simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_mov_epi64(src, k, a); - #else - simde__m128i_private - src_ = simde__m128i_to_private(src), - a_ = simde__m128i_to_private(a), - r_; - - /* N.B. CM: No fallbacks as there are only two elements */ - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = ((k >> i) & 1) ? a_.i64[i] : src_.i64[i]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_mov_epi64 - #define _mm_mask_mov_epi64(src, k, a) simde_mm_mask_mov_epi64(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mask_mov_pd(simde__m128d src, simde__mmask8 k, simde__m128d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_mov_pd(src, k, a); - #else - return simde_mm_castsi128_pd(simde_mm_mask_mov_epi64(simde_mm_castpd_si128(src), k, simde_mm_castpd_si128(a))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_mov_pd - #define _mm_mask_mov_pd(src, k, a) simde_mm_mask_mov_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mask_mov_ps (simde__m128 src, simde__mmask8 k, simde__m128 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_mov_ps(src, k, a); - #else - return simde_mm_castsi128_ps(simde_mm_mask_mov_epi32(simde_mm_castps_si128(src), k, simde_mm_castps_si128(a))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_mov_ps - #define _mm_mask_mov_ps(src, k, a) simde_mm_mask_mov_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_mov_epi8 (simde__m256i src, simde__mmask32 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_mov_epi8(src, k, a); - #else - simde__m256i_private - r_, - src_ = simde__m256i_to_private(src), - a_ = simde__m256i_to_private(a); - - #if defined(SIMDE_X86_SSSE3_NATIVE) - r_.m128i[0] = simde_mm_mask_mov_epi8(src_.m128i[0], HEDLEY_STATIC_CAST(simde__mmask16, k ), 
a_.m128i[0]); - r_.m128i[1] = simde_mm_mask_mov_epi8(src_.m128i[1], HEDLEY_STATIC_CAST(simde__mmask16, k >> 16), a_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = ((k >> i) & 1) ? a_.i8[i] : src_.i8[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_mov_epi8 - #define _mm256_mask_mov_epi8(src, k, a) simde_mm256_mask_mov_epi8(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_mov_epi16 (simde__m256i src, simde__mmask16 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_mov_epi16(src, k, a); - #else - simde__m256i_private - src_ = simde__m256i_to_private(src), - a_ = simde__m256i_to_private(a), - r_; - - #if defined(SIMDE_X86_SSE2_NATIVE) - r_.m128i[0] = simde_mm_mask_mov_epi16(src_.m128i[0], HEDLEY_STATIC_CAST(simde__mmask8, k ), a_.m128i[0]); - r_.m128i[1] = simde_mm_mask_mov_epi16(src_.m128i[1], HEDLEY_STATIC_CAST(simde__mmask8, k >> 8), a_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = ((k >> i) & 1) ? a_.i16[i] : src_.i16[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_mov_epi16 - #define _mm256_mask_mov_epi16(src, k, a) simde_mm256_mask_mov_epi16(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_mov_epi32 (simde__m256i src, simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_mov_epi32(src, k, a); - #else - simde__m256i_private - src_ = simde__m256i_to_private(src), - a_ = simde__m256i_to_private(a), - r_; - - #if defined(SIMDE_X86_SSE2_NATIVE) - r_.m128i[0] = simde_mm_mask_mov_epi32(src_.m128i[0], k , a_.m128i[0]); - r_.m128i[1] = simde_mm_mask_mov_epi32(src_.m128i[1], k >> 4, a_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ((k >> i) & 1) ? a_.i32[i] : src_.i32[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_mov_epi32 - #define _mm256_mask_mov_epi32(src, k, a) simde_mm256_mask_mov_epi32(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_mov_epi64 (simde__m256i src, simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_mov_epi64(src, k, a); - #else - simde__m256i_private - src_ = simde__m256i_to_private(src), - a_ = simde__m256i_to_private(a), - r_; - - /* N.B. CM: This fallback may not be faster as there are only four elements */ - #if defined(SIMDE_X86_SSE2_NATIVE) - r_.m128i[0] = simde_mm_mask_mov_epi64(src_.m128i[0], k , a_.m128i[0]); - r_.m128i[1] = simde_mm_mask_mov_epi64(src_.m128i[1], k >> 2, a_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = ((k >> i) & 1) ? 
a_.i64[i] : src_.i64[i]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_mov_epi64 - #define _mm256_mask_mov_epi64(src, k, a) simde_mm256_mask_mov_epi64(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_mask_mov_pd (simde__m256d src, simde__mmask8 k, simde__m256d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_mov_pd(src, k, a); - #else - return simde_mm256_castsi256_pd(simde_mm256_mask_mov_epi64(simde_mm256_castpd_si256(src), k, simde_mm256_castpd_si256(a))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_mov_pd - #define _mm256_mask_mov_pd(src, k, a) simde_mm256_mask_mov_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_mask_mov_ps (simde__m256 src, simde__mmask8 k, simde__m256 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_mov_ps(src, k, a); - #else - return simde_mm256_castsi256_ps(simde_mm256_mask_mov_epi32(simde_mm256_castps_si256(src), k, simde_mm256_castps_si256(a))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_mov_ps - #define _mm256_mask_mov_ps(src, k, a) simde_mm256_mask_mov_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_mov_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_mov_epi8(src, k, a); - #else - simde__m512i_private - src_ = simde__m512i_to_private(src), - a_ = simde__m512i_to_private(a), - r_; - - #if defined(SIMDE_X86_SSSE3_NATIVE) - r_.m256i[0] = simde_mm256_mask_mov_epi8(src_.m256i[0], HEDLEY_STATIC_CAST(simde__mmask32, k ), a_.m256i[0]); - r_.m256i[1] = simde_mm256_mask_mov_epi8(src_.m256i[1], HEDLEY_STATIC_CAST(simde__mmask32, k >> 32), a_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = ((k >> i) & 1) ? a_.i8[i] : src_.i8[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_mov_epi8 - #define _mm512_mask_mov_epi8(src, k, a) simde_mm512_mask_mov_epi8(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_mov_epi16 (simde__m512i src, simde__mmask32 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_mov_epi16(src, k, a); - #else - simde__m512i_private - src_ = simde__m512i_to_private(src), - a_ = simde__m512i_to_private(a), - r_; - - #if defined(SIMDE_X86_SSE2_NATIVE) - r_.m256i[0] = simde_mm256_mask_mov_epi16(src_.m256i[0], HEDLEY_STATIC_CAST(simde__mmask16, k ), a_.m256i[0]); - r_.m256i[1] = simde_mm256_mask_mov_epi16(src_.m256i[1], HEDLEY_STATIC_CAST(simde__mmask16, k >> 16), a_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = ((k >> i) & 1) ? 
a_.i16[i] : src_.i16[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_mov_epi16 - #define _mm512_mask_mov_epi16(src, k, a) simde_mm512_mask_mov_epi16(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_mov_epi32 (simde__m512i src, simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_mov_epi32(src, k, a); - #else - simde__m512i_private - src_ = simde__m512i_to_private(src), - a_ = simde__m512i_to_private(a), - r_; - - #if defined(SIMDE_X86_SSE2_NATIVE) - r_.m256i[0] = simde_mm256_mask_mov_epi32(src_.m256i[0], HEDLEY_STATIC_CAST(simde__mmask8, k ), a_.m256i[0]); - r_.m256i[1] = simde_mm256_mask_mov_epi32(src_.m256i[1], HEDLEY_STATIC_CAST(simde__mmask8, k >> 8), a_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ((k >> i) & 1) ? a_.i32[i] : src_.i32[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_mov_epi32 - #define _mm512_mask_mov_epi32(src, k, a) simde_mm512_mask_mov_epi32(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_mov_epi64 (simde__m512i src, simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_mov_epi64(src, k, a); - #else - simde__m512i_private - src_ = simde__m512i_to_private(src), - a_ = simde__m512i_to_private(a), - r_; - - /* N.B. CM: Without AVX2 this fallback may not be faster as there are only eight elements */ - #if defined(SIMDE_X86_SSE2_NATIVE) - r_.m256i[0] = simde_mm256_mask_mov_epi64(src_.m256i[0], k , a_.m256i[0]); - r_.m256i[1] = simde_mm256_mask_mov_epi64(src_.m256i[1], k >> 4, a_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = ((k >> i) & 1) ? 
a_.i64[i] : src_.i64[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_mov_epi64 - #define _mm512_mask_mov_epi64(src, k, a) simde_mm512_mask_mov_epi64(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_mov_pd (simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_mov_pd(src, k, a); - #else - return simde_mm512_castsi512_pd(simde_mm512_mask_mov_epi64(simde_mm512_castpd_si512(src), k, simde_mm512_castpd_si512(a))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_mov_pd - #define _mm512_mask_mov_pd(src, k, a) simde_mm512_mask_mov_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_mov_ps (simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_mov_ps(src, k, a); - #else - return simde_mm512_castsi512_ps(simde_mm512_mask_mov_epi32(simde_mm512_castps_si512(src), k, simde_mm512_castps_si512(a))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_mov_ps - #define _mm512_mask_mov_ps(src, k, a) simde_mm512_mask_mov_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512h -simde_x_mm512_mask_mov_ph (simde__m512h src, simde__mmask32 k, simde__m512h a) { - return simde_mm512_castsi512_ph(simde_mm512_mask_mov_epi16(simde_mm512_castph_si512(src), k, simde_mm512_castph_si512(a))); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_mov_epi8 (simde__mmask16 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_mov_epi8(k, a); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = ((k >> i) & 1) ? a_.i8[i] : INT8_C(0); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_mov_epi8 - #define _mm_maskz_mov_epi8(k, a) simde_mm_maskz_mov_epi8(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_mov_epi16 (simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_mov_epi16(k, a); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = ((k >> i) & 1) ? a_.i16[i] : INT16_C(0); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_mov_epi16 - #define _mm_maskz_mov_epi16(k, a) simde_mm_maskz_mov_epi16(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_mov_epi32 (simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_mov_epi32(k, a); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ((k >> i) & 1) ? 
a_.i32[i] : INT32_C(0); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_mov_epi32 - #define _mm_maskz_mov_epi32(k, a) simde_mm_maskz_mov_epi32(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_mov_epi64 (simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_mov_epi64(k, a); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - r_; - - /* N.B. CM: No fallbacks as there are only two elements */ - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = ((k >> i) & 1) ? a_.i64[i] : INT64_C(0); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_mov_epi64 - #define _mm_maskz_mov_epi64(k, a) simde_mm_maskz_mov_epi64(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_maskz_mov_pd (simde__mmask8 k, simde__m128d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_mov_pd(k, a); - #else - return simde_mm_castsi128_pd(simde_mm_maskz_mov_epi64(k, simde_mm_castpd_si128(a))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_mov_pd - #define _mm_maskz_mov_pd(k, a) simde_mm_maskz_mov_pd(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_maskz_mov_ps (simde__mmask8 k, simde__m128 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_mov_ps(k, a); - #else - return simde_mm_castsi128_ps(simde_mm_maskz_mov_epi32(k, simde_mm_castps_si128(a))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_mov_ps - #define _mm_maskz_mov_ps(k, a) simde_mm_maskz_mov_ps(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_mov_epi8 (simde__mmask32 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_mov_epi8(k, a); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - r_; - - #if defined(SIMDE_X86_SSSE3_NATIVE) - r_.m128i[0] = simde_mm_maskz_mov_epi8(HEDLEY_STATIC_CAST(simde__mmask16, k ), a_.m128i[0]); - r_.m128i[1] = simde_mm_maskz_mov_epi8(HEDLEY_STATIC_CAST(simde__mmask16, k >> 16), a_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = ((k >> i) & 1) ? 
a_.i8[i] : INT8_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_mov_epi8 - #define _mm256_maskz_mov_epi8(k, a) simde_mm256_maskz_mov_epi8(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_mov_epi16 (simde__mmask16 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_mov_epi16(k, a); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - r_; - - #if defined(SIMDE_X86_SSE2_NATIVE) - r_.m128i[0] = simde_mm_maskz_mov_epi16(HEDLEY_STATIC_CAST(simde__mmask8, k ), a_.m128i[0]); - r_.m128i[1] = simde_mm_maskz_mov_epi16(HEDLEY_STATIC_CAST(simde__mmask8, k >> 8), a_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = ((k >> i) & 1) ? a_.i16[i] : INT16_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_mov_epi16 - #define _mm256_maskz_mov_epi16(k, a) simde_mm256_maskz_mov_epi16(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_mov_epi32 (simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_mov_epi32(k, a); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - r_; - - #if defined(SIMDE_X86_SSE2_NATIVE) - r_.m128i[0] = simde_mm_maskz_mov_epi32(k , a_.m128i[0]); - r_.m128i[1] = simde_mm_maskz_mov_epi32(k >> 4, a_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ((k >> i) & 1) ? a_.i32[i] : INT32_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_mov_epi32 - #define _mm256_maskz_mov_epi32(k, a) simde_mm256_maskz_mov_epi32(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_mov_epi64 (simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_mov_epi64(k, a); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - r_; - - /* N.B. CM: This fallback may not be faster as there are only four elements */ - #if defined(SIMDE_X86_SSE2_NATIVE) - r_.m128i[0] = simde_mm_maskz_mov_epi64(k , a_.m128i[0]); - r_.m128i[1] = simde_mm_maskz_mov_epi64(k >> 2, a_.m128i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = ((k >> i) & 1) ? 
a_.i64[i] : INT64_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_mov_epi64 - #define _mm256_maskz_mov_epi64(k, a) simde_mm256_maskz_mov_epi64(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_maskz_mov_pd (simde__mmask8 k, simde__m256d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_mov_pd(k, a); - #else - return simde_mm256_castsi256_pd(simde_mm256_maskz_mov_epi64(k, simde_mm256_castpd_si256(a))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_mov_pd - #define _mm256_maskz_mov_pd(k, a) simde_mm256_maskz_mov_pd(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_maskz_mov_ps (simde__mmask8 k, simde__m256 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_mov_ps(k, a); - #else - return simde_mm256_castsi256_ps(simde_mm256_maskz_mov_epi32(k, simde_mm256_castps_si256(a))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_mov_ps - #define _mm256_maskz_mov_ps(k, a) simde_mm256_maskz_mov_ps(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_mov_epi8 (simde__mmask64 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_mov_epi8(k, a); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - r_; - - #if defined(SIMDE_X86_SSSE3_NATIVE) - r_.m256i[0] = simde_mm256_maskz_mov_epi8(HEDLEY_STATIC_CAST(simde__mmask32, k ), a_.m256i[0]); - r_.m256i[1] = simde_mm256_maskz_mov_epi8(HEDLEY_STATIC_CAST(simde__mmask32, k >> 32), a_.m256i[1]); - #else - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = ((k >> i) & 1) ? a_.i8[i] : INT8_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_mov_epi8 - #define _mm512_maskz_mov_epi8(k, a) simde_mm512_maskz_mov_epi8(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_mov_epi16 (simde__mmask32 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_mov_epi16(k, a); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - r_; - - #if defined(SIMDE_X86_SSE2_NATIVE) - r_.m256i[0] = simde_mm256_maskz_mov_epi16(HEDLEY_STATIC_CAST(simde__mmask16, k ), a_.m256i[0]); - r_.m256i[1] = simde_mm256_maskz_mov_epi16(HEDLEY_STATIC_CAST(simde__mmask16, k >> 16), a_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = ((k >> i) & 1) ? 
a_.i16[i] : INT16_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_mov_epi16 - #define _mm512_maskz_mov_epi16(k, a) simde_mm512_maskz_mov_epi16(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_mov_epi32 (simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_mov_epi32(k, a); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - r_; - - #if defined(SIMDE_X86_SSE2_NATIVE) - r_.m256i[0] = simde_mm256_maskz_mov_epi32(HEDLEY_STATIC_CAST(simde__mmask8, k ), a_.m256i[0]); - r_.m256i[1] = simde_mm256_maskz_mov_epi32(HEDLEY_STATIC_CAST(simde__mmask8, k >> 8), a_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ((k >> i) & 1) ? a_.i32[i] : INT32_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_mov_epi32 - #define _mm512_maskz_mov_epi32(k, a) simde_mm512_maskz_mov_epi32(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_mov_epi64 (simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_mov_epi64(k, a); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - r_; - - /* N.B. CM: Without AVX2 this fallback may not be faster as there are only eight elements */ - #if defined(SIMDE_X86_SSE2_NATIVE) - r_.m256i[0] = simde_mm256_maskz_mov_epi64(k , a_.m256i[0]); - r_.m256i[1] = simde_mm256_maskz_mov_epi64(k >> 4, a_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = ((k >> i) & 1) ? 
a_.i64[i] : INT64_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_mov_epi64 - #define _mm512_maskz_mov_epi64(k, a) simde_mm512_maskz_mov_epi64(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_mov_pd (simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_mov_pd(k, a); - #else - return simde_mm512_castsi512_pd(simde_mm512_maskz_mov_epi64(k, simde_mm512_castpd_si512(a))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_mov_pd - #define _mm512_maskz_mov_pd(k, a) simde_mm512_maskz_mov_pd(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_mov_ps (simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_mov_ps(k, a); - #else - return simde_mm512_castsi512_ps(simde_mm512_maskz_mov_epi32(k, simde_mm512_castps_si512(a))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_mov_ps - #define _mm512_maskz_mov_ps(k, a) simde_mm512_maskz_mov_ps(k, a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_MOV_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/mov_mask.h b/ffi-deps/simde/simde/x86/avx512/mov_mask.h deleted file mode 100644 index 1d0b120..0000000 --- a/ffi-deps/simde/simde/x86/avx512/mov_mask.h +++ /dev/null @@ -1,372 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - */ - -#if !defined(SIMDE_X86_AVX512_MOV_MASK_H) -#define SIMDE_X86_AVX512_MOV_MASK_H - -#include "types.h" -#include "../avx2.h" - -#include "cast.h" -#include "set.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm_movepi8_mask (simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_movepi8_mask(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return HEDLEY_STATIC_CAST(simde__mmask16, simde_mm_movemask_epi8(a)); - #else - simde__m128i_private a_ = simde__m128i_to_private(a); - simde__mmask16 r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) { - r |= (a_.i8[i] < 0) ? 
(UINT64_C(1) << i) : 0; - } - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_movepi8_mask - #define _mm_movepi8_mask(a) simde_mm_movepi8_mask(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_movepi16_mask (simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_movepi16_mask(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - /* There is no 32-bit _mm_movemask_* function, so we use - * _mm_movemask_epi8 then extract the odd bits. */ - uint_fast16_t r = HEDLEY_STATIC_CAST(uint_fast16_t, simde_mm_movemask_epi8(a)); - r = ( (r >> 1)) & UINT32_C(0x5555); - r = (r | (r >> 1)) & UINT32_C(0x3333); - r = (r | (r >> 2)) & UINT32_C(0x0f0f); - r = (r | (r >> 4)) & UINT32_C(0x00ff); - return HEDLEY_STATIC_CAST(simde__mmask8, r); - #else - simde__m128i_private a_ = simde__m128i_to_private(a); - simde__mmask8 r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i16) / sizeof(a_.i16[0])) ; i++) { - r |= (a_.i16[i] < 0) ? (UINT32_C(1) << i) : 0; - } - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_movepi16_mask - #define _mm_movepi16_mask(a) simde_mm_movepi16_mask(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_movepi32_mask (simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm_movepi32_mask(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) - return HEDLEY_STATIC_CAST(simde__mmask8, simde_mm_movemask_ps(simde_mm_castsi128_ps(a))); - #else - simde__m128i_private a_ = simde__m128i_to_private(a); - simde__mmask8 r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r |= (a_.i32[i] < 0) ? (UINT32_C(1) << i) : 0; - } - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm_movepi32_mask - #define _mm_movepi32_mask(a) simde_mm_movepi32_mask(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm_movepi64_mask (simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm_movepi64_mask(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return HEDLEY_STATIC_CAST(simde__mmask8, simde_mm_movemask_pd(simde_mm_castsi128_pd(a))); - #else - simde__m128i_private a_ = simde__m128i_to_private(a); - simde__mmask8 r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r |= (a_.i64[i] < 0) ? (UINT32_C(1) << i) : 0; - } - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm_movepi64_mask - #define _mm_movepi64_mask(a) simde_mm_movepi64_mask(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm256_movepi8_mask (simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_movepi8_mask(a); - #else - simde__m256i_private a_ = simde__m256i_to_private(a); - simde__mmask32 r = 0; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(a_.m128i) / sizeof(a_.m128i[0])) ; i++) { - r |= HEDLEY_STATIC_CAST(simde__mmask32, simde_mm_movepi8_mask(a_.m128i[i])) << (i * 16); - } - #else - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) { - r |= (a_.i8[i] < 0) ? 
(UINT64_C(1) << i) : 0; - } - #endif - - return HEDLEY_STATIC_CAST(simde__mmask32, r); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_movepi8_mask - #define _mm256_movepi8_mask(a) simde_mm256_movepi8_mask(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm256_movepi16_mask (simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_movepi16_mask(a); - #else - simde__m256i_private a_ = simde__m256i_to_private(a); - simde__mmask16 r = 0; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(a_.m128i) / sizeof(a_.m128i[0])) ; i++) { - r |= HEDLEY_STATIC_CAST(simde__mmask16, simde_mm_movepi16_mask(a_.m128i[i])) << (i * 8); - } - #else - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i16) / sizeof(a_.i16[0])) ; i++) { - r |= (a_.i16[i] < 0) ? (UINT32_C(1) << i) : 0; - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_movepi16_mask - #define _mm256_movepi16_mask(a) simde_mm256_movepi16_mask(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_movepi32_mask (simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm256_movepi32_mask(a); - #else - simde__m256i_private a_ = simde__m256i_to_private(a); - simde__mmask8 r = 0; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(a_.m128i) / sizeof(a_.m128i[0])) ; i++) { - r |= HEDLEY_STATIC_CAST(simde__mmask16, simde_mm_movepi32_mask(a_.m128i[i])) << (i * 4); - } - #else - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r |= (a_.i32[i] < 0) ? (UINT32_C(1) << i) : 0; - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm256_movepi32_mask - #define _mm256_movepi32_mask(a) simde_mm256_movepi32_mask(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_movepi64_mask (simde__m256i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm256_movepi64_mask(a); - #else - simde__m256i_private a_ = simde__m256i_to_private(a); - simde__mmask8 r = 0; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(a_.m128i) / sizeof(a_.m128i[0])) ; i++) { - r |= HEDLEY_STATIC_CAST(simde__mmask8, simde_mm_movepi64_mask(a_.m128i[i])) << (i * 2); - } - #else - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r |= (a_.i64[i] < 0) ? (UINT32_C(1) << i) : 0; - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm256_movepi64_mask - #define _mm256_movepi64_mask(a) simde_mm256_movepi64_mask(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_movepi8_mask (simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_movepi8_mask(a); - #else - simde__m512i_private a_ = simde__m512i_to_private(a); - simde__mmask64 r = 0; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(a_.m256i) / sizeof(a_.m256i[0])) ; i++) { - r |= HEDLEY_STATIC_CAST(simde__mmask64, simde_mm256_movepi8_mask(a_.m256i[i])) << (i * 32); - } - #else - r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) { - r |= (a_.i8[i] < 0) ? 
(UINT64_C(1) << i) : 0; - } - #endif - - return HEDLEY_STATIC_CAST(simde__mmask64, r); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_movepi8_mask - #define _mm512_movepi8_mask(a) simde_mm512_movepi8_mask(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_movepi16_mask (simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_movepi16_mask(a); - #else - simde__m512i_private a_ = simde__m512i_to_private(a); - simde__mmask32 r = 0; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(a_.m256i) / sizeof(a_.m256i[0])) ; i++) { - r |= HEDLEY_STATIC_CAST(simde__mmask32, simde_mm256_movepi16_mask(a_.m256i[i])) << (i * 16); - } - #else - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i16) / sizeof(a_.i16[0])) ; i++) { - r |= (a_.i16[i] < 0) ? (UINT32_C(1) << i) : 0; - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_movepi16_mask - #define _mm512_movepi16_mask(a) simde_mm512_movepi16_mask(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_movepi32_mask (simde__m512i a) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_movepi32_mask(a); - #else - simde__m512i_private a_ = simde__m512i_to_private(a); - simde__mmask16 r = 0; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(a_.m256i) / sizeof(a_.m256i[0])) ; i++) { - r |= HEDLEY_STATIC_CAST(simde__mmask16, simde_mm256_movepi32_mask(a_.m256i[i])) << (i * 8); - } - #else - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r |= (a_.i32[i] < 0) ? (UINT32_C(1) << i) : 0; - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_movepi32_mask - #define _mm512_movepi32_mask(a) simde_mm512_movepi32_mask(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_movepi64_mask (simde__m512i a) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_movepi64_mask(a); - #else - simde__m512i_private a_ = simde__m512i_to_private(a); - simde__mmask8 r = 0; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(a_.m256i) / sizeof(a_.m256i[0])) ; i++) { - r |= simde_mm256_movepi64_mask(a_.m256i[i]) << (i * 4); - } - #else - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r |= (a_.i64[i] < 0) ? 
(UINT32_C(1) << i) : 0; - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_movepi64_mask - #define _mm512_movepi64_mask(a) simde_mm512_movepi64_mask(a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_MOV_MASK_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/movm.h b/ffi-deps/simde/simde/x86/avx512/movm.h deleted file mode 100644 index 452e127..0000000 --- a/ffi-deps/simde/simde/x86/avx512/movm.h +++ /dev/null @@ -1,460 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Christopher Moore - */ - -#if !defined(SIMDE_X86_AVX512_MOVM_H) -#define SIMDE_X86_AVX512_MOVM_H - -#include "types.h" -#include "../avx2.h" -#include "set.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_movm_epi8 (simde__mmask16 k) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_movm_epi8(k); - #elif defined(SIMDE_X86_SSSE3_NATIVE) - const simde__m128i zero = simde_mm_setzero_si128(); - const simde__m128i bits = simde_mm_set_epi16(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80); - const simde__m128i shuffle = simde_mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0); - simde__m128i r; - - r = simde_mm_set1_epi16(HEDLEY_STATIC_CAST(short, k)); - r = simde_mm_mullo_epi16(r, bits); - r = simde_mm_shuffle_epi8(r, shuffle); - r = simde_mm_cmpgt_epi8(zero, r); - - return r; - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - static const int8_t pos_data[] = { 7, 6, 5, 4, 3, 2, 1, 0 }; - int8x8_t pos = vld1_s8(pos_data); - r_.neon_i8 = vcombine_s8( - vshr_n_s8(vshl_s8(vdup_n_s8(HEDLEY_STATIC_CAST(int8_t, k)), pos), 7), - vshr_n_s8(vshl_s8(vdup_n_s8(HEDLEY_STATIC_CAST(int8_t, k >> 8)), pos), 7)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = ((k >> i) & 1) ? 
~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_movm_epi8 - #define _mm_movm_epi8(k) simde_mm_movm_epi8(k) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_movm_epi8 (simde__mmask32 k) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_movm_epi8(k); - #elif defined(SIMDE_X86_AVX2_NATIVE) - const simde__m256i zero = simde_mm256_setzero_si256(); - const simde__m256i bits = simde_mm256_broadcastsi128_si256(simde_mm_set_epi16(0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80)); - const simde__m256i shuffle = simde_mm256_broadcastsi128_si256(simde_mm_set_epi8(15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0)); - simde__m256i r; - - r = simde_mm256_set_m128i(_mm_set1_epi16(HEDLEY_STATIC_CAST(short, k >> 16)), _mm_set1_epi16(HEDLEY_STATIC_CAST(short, k))); - r = simde_mm256_mullo_epi16(r, bits); - r = simde_mm256_shuffle_epi8(r, shuffle); - r = simde_mm256_cmpgt_epi8(zero, r); - - return r; - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_movm_epi8(HEDLEY_STATIC_CAST(simde__mmask16, k)); - r_.m128i[1] = simde_mm_movm_epi8(HEDLEY_STATIC_CAST(simde__mmask16, k >> 16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = ((k >> i) & 1) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_movm_epi8 - #define _mm256_movm_epi8(k) simde_mm256_movm_epi8(k) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_movm_epi8 (simde__mmask64 k) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_movm_epi8(k); - #else - simde__m512i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256i[0] = simde_mm256_movm_epi8(HEDLEY_STATIC_CAST(simde__mmask32, k)); - r_.m256i[1] = simde_mm256_movm_epi8(HEDLEY_STATIC_CAST(simde__mmask32, k >> 32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = ((k >> i) & 1) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_movm_epi8 - #define _mm512_movm_epi8(k) simde_mm512_movm_epi8(k) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_movm_epi16 (simde__mmask8 k) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_movm_epi16(k); - #elif defined(SIMDE_X86_SSE2_NATIVE) - const simde__m128i bits = simde_mm_set_epi16(0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, INT16_MIN /* 0x8000 */); - simde__m128i r; - - r = simde_mm_set1_epi16(HEDLEY_STATIC_CAST(short, k)); - r = simde_mm_mullo_epi16(r, bits); - r = simde_mm_srai_epi16(r, 15); - - return r; - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - static const int16_t pos_data[] = { 15, 14, 13, 12, 11, 10, 9, 8 }; - const int16x8_t pos = vld1q_s16(pos_data); - r_.neon_i16 = vshrq_n_s16(vshlq_s16(vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, k)), pos), 15); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = ((k >> i) & 1) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_movm_epi16 - #define _mm_movm_epi16(k) simde_mm_movm_epi16(k) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_movm_epi16 (simde__mmask16 k) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_movm_epi16(k); - #elif defined(SIMDE_X86_AVX2_NATIVE) - const __m256i bits = _mm256_set_epi16(0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080, - 0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000, INT16_MIN /* 0x8000 */); - __m256i r; - - r = _mm256_set1_epi16(HEDLEY_STATIC_CAST(short, k)); - r = _mm256_mullo_epi16(r, bits); - r = _mm256_srai_epi16(r, 15); - - return r; - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_movm_epi16(HEDLEY_STATIC_CAST(simde__mmask8, k)); - r_.m128i[1] = simde_mm_movm_epi16(HEDLEY_STATIC_CAST(simde__mmask8, k >> 8)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = ((k >> i) & 1) ? ~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_movm_epi16 - #define _mm256_movm_epi16(k) simde_mm256_movm_epi16(k) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_movm_epi16 (simde__mmask32 k) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm512_movm_epi16(k); - #else - simde__m512i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256i[0] = simde_mm256_movm_epi16(HEDLEY_STATIC_CAST(simde__mmask16, k)); - r_.m256i[1] = simde_mm256_movm_epi16(HEDLEY_STATIC_CAST(simde__mmask16, k >> 16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = ((k >> i) & 1) ? ~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_movm_epi16 - #define _mm512_movm_epi16(k) simde_mm512_movm_epi16(k) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_movm_epi32 (simde__mmask8 k) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm_movm_epi32(k); - #elif defined(SIMDE_X86_AVX2_NATIVE) - const __m128i shifts = _mm_set_epi32(28, 29, 30, 31); - __m128i r; - - r = _mm_set1_epi32(HEDLEY_STATIC_CAST(int, k)); - r = _mm_sllv_epi32(r, shifts); - r = _mm_srai_epi32(r, 31); - - return r; - #elif defined(SIMDE_X86_SSE2_NATIVE) - const simde__m128i bits = simde_mm_set_epi32(0x10000000, 0x20000000, 0x40000000, INT32_MIN /* 0x80000000 */); - simde__m128i r; - - r = simde_mm_set1_epi16(HEDLEY_STATIC_CAST(short, k)); - r = simde_mm_mullo_epi16(r, bits); - r = simde_mm_srai_epi32(r, 31); - - return r; - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - static const int32_t pos_data[] = { 31, 30, 29, 28 }; - const int32x4_t pos = vld1q_s32(pos_data); - r_.neon_i32 = vshrq_n_s32(vshlq_s32(vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, k)), pos), 31); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ((k >> i) & 1) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_movm_epi32 - #define _mm_movm_epi32(k) simde_mm_movm_epi32(k) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_movm_epi32 (simde__mmask8 k) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm256_movm_epi32(k); - #elif defined(SIMDE_X86_AVX2_NATIVE) - const __m256i shifts = _mm256_set_epi32(24, 25, 26, 27, 28, 29, 30, 31); - __m256i r; - - r = _mm256_set1_epi32(HEDLEY_STATIC_CAST(int, k)); - r = _mm256_sllv_epi32(r, shifts); - r = _mm256_srai_epi32(r, 31); - - return r; - #else - simde__m256i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_movm_epi32(k ); - r_.m128i[1] = simde_mm_movm_epi32(k >> 4); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ((k >> i) & 1) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_movm_epi32 - #define _mm256_movm_epi32(k) simde_mm256_movm_epi32(k) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_movm_epi32 (simde__mmask16 k) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_movm_epi32(k); - #else - simde__m512i_private r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256i[0] = simde_mm256_movm_epi32(HEDLEY_STATIC_CAST(simde__mmask8, k )); - r_.m256i[1] = simde_mm256_movm_epi32(HEDLEY_STATIC_CAST(simde__mmask8, k >> 8)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ((k >> i) & 1) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_movm_epi32 - #define _mm512_movm_epi32(k) simde_mm512_movm_epi32(k) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_movm_epi64 (simde__mmask8 k) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_movm_epi64(k); - /* N.B. CM: These fallbacks may not be faster as there are only two elements */ - #elif defined(SIMDE_X86_AVX2_NATIVE) - const __m128i shifts = _mm_set_epi32(30, 30, 31, 31); - __m128i r; - - r = _mm_set1_epi32(HEDLEY_STATIC_CAST(int, k)); - r = _mm_sllv_epi32(r, shifts); - r = _mm_srai_epi32(r, 31); - - return r; - #elif defined(SIMDE_X86_SSE2_NATIVE) - const simde__m128i bits = simde_mm_set_epi32(0x40000000, 0x40000000, INT32_MIN /* 0x80000000 */, INT32_MIN /* 0x80000000 */); - simde__m128i r; - - r = simde_mm_set1_epi16(HEDLEY_STATIC_CAST(short, k)); - r = simde_mm_mullo_epi16(r, bits); - r = simde_mm_srai_epi32(r, 31); - - return r; - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - static const int64_t pos_data[] = { 63, 62 }; - const int64x2_t pos = vld1q_s64(pos_data); - r_.neon_i64 = vshrq_n_s64(vshlq_s64(vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, k)), pos), 63); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = ((k >> i) & 1) ? 
~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_movm_epi64 - #define _mm_movm_epi64(k) simde_mm_movm_epi64(k) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_movm_epi64 (simde__mmask8 k) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_movm_epi64(k); - #elif defined(SIMDE_X86_AVX2_NATIVE) - const __m256i shifts = _mm256_set_epi32(28, 28, 29, 29, 30, 30, 31, 31); - __m256i r; - - r = _mm256_set1_epi32(HEDLEY_STATIC_CAST(int, k)); - r = _mm256_sllv_epi32(r, shifts); - r = _mm256_srai_epi32(r, 31); - - return r; - #else - simde__m256i_private r_; - - /* N.B. CM: This fallback may not be faster as there are only four elements */ - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_movm_epi64(k ); - r_.m128i[1] = simde_mm_movm_epi64(k >> 2); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = ((k >> i) & 1) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_movm_epi64 - #define _mm256_movm_epi64(k) simde_mm256_movm_epi64(k) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_movm_epi64 (simde__mmask8 k) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_movm_epi64(k); - #else - simde__m512i_private r_; - - /* N.B. CM: Without AVX2 this fallback may not be faster as there are only eight elements */ - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256i[0] = simde_mm256_movm_epi64(k ); - r_.m256i[1] = simde_mm256_movm_epi64(k >> 4); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = ((k >> i) & 1) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_movm_epi64 - #define _mm512_movm_epi64(k) simde_mm512_movm_epi64(k) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_MOVM_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/mul.h b/ffi-deps/simde/simde/x86/avx512/mul.h deleted file mode 100644 index 2ca7def..0000000 --- a/ffi-deps/simde/simde/x86/avx512/mul.h +++ /dev/null @@ -1,279 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_MUL_H) -#define SIMDE_X86_AVX512_MUL_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mul_ps (simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mul_ps(a, b); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 * b_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_mul_ps(a_.m256[i], b_.m256[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mul_ps - #define _mm512_mul_ps(a, b) simde_mm512_mul_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_mul_ps(simde__m512 src, simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_mul_ps(src, k, a, b); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_mul_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_mul_ps - #define _mm512_mask_mul_ps(src, k, a, b) simde_mm512_mask_mul_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_mul_ps(simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_mul_ps(k, a, b); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_mul_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_mul_ps - #define _mm512_maskz_mul_ps(k, a, b) simde_mm512_maskz_mul_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mul_pd (simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mul_pd(a, b); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 * b_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_mul_pd(a_.m256d[i], b_.m256d[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mul_pd - #define _mm512_mul_pd(a, b) simde_mm512_mul_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_mul_pd(simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_mul_pd(src, k, a, b); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_mul_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_mul_pd - #define _mm512_mask_mul_pd(src, k, a, b) simde_mm512_mask_mul_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_mul_pd(simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - 
return _mm512_maskz_mul_pd(k, a, b); - #else - return simde_mm512_maskz_mov_pd(k, simde_mm512_mul_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_mul_pd - #define _mm512_maskz_mul_pd(k, a, b) simde_mm512_maskz_mul_pd(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mul_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mul_epi32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_SHUFFLE_VECTOR_) - simde__m512i_private x; - __typeof__(r_.i64) ta, tb; - - /* Get even numbered 32-bit values */ - x.i32 = SIMDE_SHUFFLE_VECTOR_(32, 64, a_.i32, b_.i32, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); - /* Cast to 64 bits */ - SIMDE_CONVERT_VECTOR_(ta, x.m256i_private[0].i32); - SIMDE_CONVERT_VECTOR_(tb, x.m256i_private[1].i32); - r_.i64 = ta * tb; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = HEDLEY_STATIC_CAST(int64_t, a_.i32[i << 1]) * HEDLEY_STATIC_CAST(int64_t, b_.i32[i << 1]); - } - #endif - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mul_epi32 - #define _mm512_mul_epi32(a, b) simde_mm512_mul_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_mul_epi32(simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_mul_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_mul_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_mul_epi32 - #define _mm512_mask_mul_epi32(src, k, a, b) simde_mm512_mask_mul_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_mul_epi32(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_mul_epi32(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_mul_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_mul_epi32 - #define _mm512_maskz_mul_epi32(k, a, b) simde_mm512_maskz_mul_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mul_epu32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mul_epu32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_SHUFFLE_VECTOR_) - simde__m512i_private x; - __typeof__(r_.u64) ta, tb; - - x.u32 = SIMDE_SHUFFLE_VECTOR_(32, 64, a_.u32, b_.u32, 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30); - SIMDE_CONVERT_VECTOR_(ta, x.m256i_private[0].u32); - SIMDE_CONVERT_VECTOR_(tb, x.m256i_private[1].u32); - r_.u64 = ta * tb; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i << 1]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[i << 1]); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mul_epu32 - #define _mm512_mul_epu32(a, b) simde_mm512_mul_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_mul_epu32(simde__m512i src, 
simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_mul_epu32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_mul_epu32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_mul_epu32 - #define _mm512_mask_mul_epu32(src, k, a, b) simde_mm512_mask_mul_epu32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_mul_epu32(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_mul_epu32(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_mul_epu32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_mul_epu32 - #define _mm512_maskz_mul_epu32(k, a, b) simde_mm512_maskz_mul_epu32(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_MUL_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/mulhi.h b/ffi-deps/simde/simde/x86/avx512/mulhi.h deleted file mode 100644 index 19cf81a..0000000 --- a/ffi-deps/simde/simde/x86/avx512/mulhi.h +++ /dev/null @@ -1,65 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_MULHI_H) -#define SIMDE_X86_AVX512_MULHI_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mulhi_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mulhi_epi16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (HEDLEY_STATIC_CAST(uint32_t, HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) >> 16)); - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mulhi_epi16 - #define _mm512_mulhi_epi16(a, b) simde_mm512_mulhi_epi16(a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_MULHI_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/mulhrs.h b/ffi-deps/simde/simde/x86/avx512/mulhrs.h deleted file mode 100644 index 8a3b169..0000000 --- a/ffi-deps/simde/simde/x86/avx512/mulhrs.h +++ /dev/null @@ -1,65 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_MULHRS_H) -#define SIMDE_X86_AVX512_MULHRS_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mulhrs_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mulhrs_epi16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (((HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) + 0x4000) >> 15)); - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mulhrs_epi16 - #define _mm512_mulhrs_epi16(a, b) simde_mm512_mulhrs_epi16(a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_MULHRS_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/mullo.h b/ffi-deps/simde/simde/x86/avx512/mullo.h deleted file mode 100644 index f0cae05..0000000 --- a/ffi-deps/simde/simde/x86/avx512/mullo.h +++ /dev/null @@ -1,169 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_MULLO_H) -#define SIMDE_X86_AVX512_MULLO_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mullo_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mullo_epi16(a, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] * b_.i16[i]); - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mullo_epi16 - #define _mm512_mullo_epi16(a, b) simde_mm512_mullo_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mullo_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mullo_epi32(a, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.i32[i] * b_.i32[i]); - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mullo_epi32 - #define _mm512_mullo_epi32(a, b) simde_mm512_mullo_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_mullo_epi32(simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_mullo_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_mullo_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_mullo_epi32 - #define _mm512_mask_mullo_epi32(src, k, a, b) simde_mm512_mask_mullo_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_mullo_epi32(simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_mullo_epi32(k, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_mullo_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_mullo_epi32 - #define _mm512_maskz_mullo_epi32(k, a, b) simde_mm512_maskz_mullo_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mullo_epi64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_mullo_epi64(a, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = HEDLEY_STATIC_CAST(int64_t, a_.i64[i] * b_.i64[i]); - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mullo_epi64 - #define _mm512_mullo_epi64(a, b) simde_mm512_mullo_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_mullo_epi64(simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_mask_mullo_epi64(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_mullo_epi64(a, b)); - 
#endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_mullo_epi64 - #define _mm512_mask_mullo_epi64(src, k, a, b) simde_mm512_mask_mullo_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_mullo_epi64(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_maskz_mullo_epi64(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_mullo_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_mullo_epi64 - #define _mm512_maskz_mullo_epi64(k, a, b) simde_mm512_maskz_mullo_epi64(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_MULLO_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/multishift.h b/ffi-deps/simde/simde/x86/avx512/multishift.h deleted file mode 100644 index 5388d0d..0000000 --- a/ffi-deps/simde/simde/x86/avx512/multishift.h +++ /dev/null @@ -1,170 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_MULTISHIFT_H) -#define SIMDE_X86_AVX512_MULTISHIFT_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_multishift_epi64_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_multishift_epi64_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < sizeof(r_.u8) / sizeof(r_.u8[0]) ; i++) { - r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, (b_.u64[i / 8] >> (a_.u8[i] & 63)) | (b_.u64[i / 8] << (64 - (a_.u8[i] & 63)))); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_multishift_epi64_epi8 - #define _mm_multishift_epi64_epi8(a, b) simde_mm_multishift_epi64_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_multishift_epi64_epi8 (simde__m128i src, simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_multishift_epi64_epi8(src, k, a, b); - #else - return simde_mm_mask_mov_epi8(src, k, simde_mm_multishift_epi64_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_multishift_epi64_epi8 - #define _mm_mask_multishift_epi64_epi8(src, k, a, b) simde_mm_mask_multishift_epi64_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_multishift_epi64_epi8 (simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_multishift_epi64_epi8(k, a, b); - #else - return simde_mm_maskz_mov_epi8(k, simde_mm_multishift_epi64_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_multishift_epi64_epi8 - #define _mm_maskz_multishift_epi64_epi8(k, a, b) simde_mm_maskz_multishift_epi64_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_multishift_epi64_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return 
_mm256_multishift_epi64_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < sizeof(r_.u8) / sizeof(r_.u8[0]) ; i++) { - r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, (b_.u64[i / 8] >> (a_.u8[i] & 63)) | (b_.u64[i / 8] << (64 - (a_.u8[i] & 63)))); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_multishift_epi64_epi8 - #define _mm256_multishift_epi64_epi8(a, b) simde_mm256_multishift_epi64_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_multishift_epi64_epi8 (simde__m256i src, simde__mmask32 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_multishift_epi64_epi8(src, k, a, b); - #else - return simde_mm256_mask_mov_epi8(src, k, simde_mm256_multishift_epi64_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_multishift_epi64_epi8 - #define _mm256_mask_multishift_epi64_epi8(src, k, a, b) simde_mm256_mask_multishift_epi64_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_multishift_epi64_epi8 (simde__mmask32 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_multishift_epi64_epi8(k, a, b); - #else - return simde_mm256_maskz_mov_epi8(k, simde_mm256_multishift_epi64_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_multishift_epi64_epi8 - #define _mm256_maskz_multishift_epi64_epi8(k, a, b) simde_mm256_maskz_multishift_epi64_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_multishift_epi64_epi8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) - return _mm512_multishift_epi64_epi8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < sizeof(r_.u8) / sizeof(r_.u8[0]) ; i++) { - r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, (b_.u64[i / 8] >> (a_.u8[i] & 63)) | (b_.u64[i / 8] << (64 - (a_.u8[i] & 63)))); - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) - #undef _mm512_multishift_epi64_epi8 - #define _mm512_multishift_epi64_epi8(a, b) simde_mm512_multishift_epi64_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_multishift_epi64_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) - return _mm512_mask_multishift_epi64_epi8(src, k, a, b); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_multishift_epi64_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_multishift_epi64_epi8 - #define _mm512_mask_multishift_epi64_epi8(src, k, a, b) simde_mm512_mask_multishift_epi64_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_multishift_epi64_epi8 (simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) - return _mm512_maskz_multishift_epi64_epi8(k, a, b); - #else - return 
simde_mm512_maskz_mov_epi8(k, simde_mm512_multishift_epi64_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_multishift_epi64_epi8 - #define _mm512_maskz_multishift_epi64_epi8(k, a, b) simde_mm512_maskz_multishift_epi64_epi8(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_MULTISHIFT_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/negate.h b/ffi-deps/simde/simde/x86/avx512/negate.h deleted file mode 100644 index cee4ee6..0000000 --- a/ffi-deps/simde/simde/x86/avx512/negate.h +++ /dev/null @@ -1,88 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - */ - -#if !defined(SIMDE_X86_AVX512_NEGATE_H) -#define SIMDE_X86_AVX512_NEGATE_H - -#include "types.h" -#include "mov.h" -#include "xor.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_x_mm512_negate_ps(simde__m512 a) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return simde_mm512_xor_ps(a,_mm512_set1_ps(SIMDE_FLOAT32_C(-0.0))); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if defined(SIMDE_VECTOR_NEGATE) - r_.f32 = -a_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = -a_.f32[i]; - } - #endif - - return simde__m512_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_x_mm512_negate_pd(simde__m512d a) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return simde_mm512_xor_pd(a, _mm512_set1_pd(SIMDE_FLOAT64_C(-0.0))); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if defined(SIMDE_VECTOR_NEGATE) - r_.f64 = -a_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = -a_.f64[i]; - } - #endif - - return simde__m512d_from_private(r_); - #endif -} - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_NEGATE_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/or.h b/ffi-deps/simde/simde/x86/avx512/or.h deleted file mode 100644 index b8516a4..0000000 --- a/ffi-deps/simde/simde/x86/avx512/or.h +++ /dev/null @@ -1,308 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * 
files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_OR_H) -#define SIMDE_X86_AVX512_OR_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_or_ps (simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_or_ps(a, b); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256[0] = simde_mm256_or_ps(a_.m256[0], b_.m256[0]); - r_.m256[1] = simde_mm256_or_ps(a_.m256[1], b_.m256[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_CLANG_BAD_VI64_OPS) - r_.i32f = a_.i32f | b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] | b_.i32f[i]; - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_or_ps - #define _mm512_or_ps(a, b) simde_mm512_or_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_or_ps(simde__m512 src, simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_mask_or_ps(src, k, a, b); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_or_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_or_ps - #define _mm512_mask_or_ps(src, k, a, b) simde_mm512_mask_or_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_or_ps(simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_maskz_or_ps(k, a, b); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_or_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_or_ps - #define _mm512_maskz_or_ps(k, a, b) simde_mm512_maskz_or_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_or_pd (simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_or_pd(a, b); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256d[0] = simde_mm256_or_pd(a_.m256d[0], b_.m256d[0]); - r_.m256d[1] = simde_mm256_or_pd(a_.m256d[1], b_.m256d[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && 
!defined(SIMDE_BUG_CLANG_BAD_VI64_OPS) - r_.i32f = a_.i32f | b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] | b_.i32f[i]; - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_or_pd - #define _mm512_or_pd(a, b) simde_mm512_or_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_or_pd(simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_mask_or_pd(src, k, a, b); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_or_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_or_pd - #define _mm512_mask_or_pd(src, k, a, b) simde_mm512_mask_or_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_or_pd(simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_maskz_or_pd(k, a, b); - #else - return simde_mm512_maskz_mov_pd(k, simde_mm512_or_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_or_pd - #define _mm512_maskz_or_pd(k, a, b) simde_mm512_maskz_or_pd(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_or_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_or_epi32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 | b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] | b_.i32[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_or_epi32 - #define _mm512_or_epi32(a, b) simde_mm512_or_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_or_epi32(simde__m512i src, simde__mmask16 k, simde__m512i v2, simde__m512i v3) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_or_epi32(src, k, v2, v3); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_or_epi32(v2, v3)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_or_epi32 - #define _mm512_mask_or_epi32(src, k, v2, v3) simde_mm512_mask_or_epi32(src, k, v2, v3) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_or_epi32(simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_or_epi32(k, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_or_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_or_epi32 - #define _mm512_maskz_or_epi32(k, a, b) simde_mm512_maskz_or_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_or_epi64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_or_epi64(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_or_si256(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_CLANG_BAD_VI64_OPS) - 
r_.i32f = a_.i32f | b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] | b_.i32f[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_or_epi64 - #define _mm512_or_epi64(a, b) simde_mm512_or_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_or_epi64(simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_or_epi64(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_or_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_or_epi64 - #define _mm512_mask_or_epi64(src, k, a, b) simde_mm512_mask_or_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_or_epi64(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_or_epi64(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_or_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_or_epi64 - #define _mm512_maskz_or_epi64(k, a, b) simde_mm512_maskz_or_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_or_si512 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_or_si512(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_X86_AVX2_NATIVE) - r_.m256i[0] = simde_mm256_or_si256(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_or_si256(a_.m256i[1], b_.m256i[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f | b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] | b_.i32f[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_or_si512 - #define _mm512_or_si512(a, b) simde_mm512_or_si512(a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_OR_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/packs.h b/ffi-deps/simde/simde/x86/avx512/packs.h deleted file mode 100644 index 55a39c6..0000000 --- a/ffi-deps/simde/simde/x86/avx512/packs.h +++ /dev/null @@ -1,122 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_PACKS_H) -#define SIMDE_X86_AVX512_PACKS_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_packs_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_packs_epi16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256i[0] = simde_mm256_packs_epi16(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_packs_epi16(a_.m256i[1], b_.m256i[1]); - #else - const size_t halfway_point = (sizeof(r_.i8) / sizeof(r_.i8[0])) / 2; - const size_t quarter_point = (sizeof(r_.i8) / sizeof(r_.i8[0])) / 4; - const size_t octet_point = (sizeof(r_.i8) / sizeof(r_.i8[0])) / 8; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < octet_point ; i++) { - r_.i8[i] = (a_.i16[i] > INT8_MAX) ? INT8_MAX : ((a_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[i])); - r_.i8[i + octet_point] = (b_.i16[i] > INT8_MAX) ? INT8_MAX : ((b_.i16[i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[i])); - r_.i8[quarter_point + i] = (a_.i16[octet_point + i] > INT8_MAX) ? INT8_MAX : ((a_.i16[octet_point + i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[octet_point + i])); - r_.i8[quarter_point + i + octet_point] = (b_.i16[octet_point + i] > INT8_MAX) ? INT8_MAX : ((b_.i16[octet_point + i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[octet_point + i])); - r_.i8[halfway_point + i] = (a_.i16[quarter_point + i] > INT8_MAX) ? INT8_MAX : ((a_.i16[quarter_point + i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[quarter_point + i])); - r_.i8[halfway_point + i + octet_point] = (b_.i16[quarter_point + i] > INT8_MAX) ? INT8_MAX : ((b_.i16[quarter_point + i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[quarter_point + i])); - r_.i8[halfway_point + quarter_point + i] = (a_.i16[quarter_point + octet_point + i] > INT8_MAX) ? INT8_MAX : ((a_.i16[quarter_point + octet_point + i] < INT8_MIN) ? INT8_MIN : HEDLEY_STATIC_CAST(int8_t, a_.i16[quarter_point + octet_point + i])); - r_.i8[halfway_point + quarter_point + i + octet_point] = (b_.i16[quarter_point + octet_point + i] > INT8_MAX) ? INT8_MAX : ((b_.i16[quarter_point + octet_point + i] < INT8_MIN) ? 
INT8_MIN : HEDLEY_STATIC_CAST(int8_t, b_.i16[quarter_point + octet_point + i])); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_packs_epi16 - #define _mm512_packs_epi16(a, b) simde_mm512_packs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_packs_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_packs_epi32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256i[0] = simde_mm256_packs_epi32(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_packs_epi32(a_.m256i[1], b_.m256i[1]); - #else - const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2; - const size_t quarter_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 4; - const size_t octet_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 8; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < octet_point ; i++) { - r_.i16[i] = (a_.i32[i] > INT16_MAX) ? INT16_MAX : ((a_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, a_.i32[i])); - r_.i16[i + octet_point] = (b_.i32[i] > INT16_MAX) ? INT16_MAX : ((b_.i32[i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, b_.i32[i])); - r_.i16[quarter_point + i] = (a_.i32[octet_point + i] > INT16_MAX) ? INT16_MAX : ((a_.i32[octet_point + i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, a_.i32[octet_point + i])); - r_.i16[quarter_point + i + octet_point] = (b_.i32[octet_point + i] > INT16_MAX) ? INT16_MAX : ((b_.i32[octet_point + i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, b_.i32[octet_point + i])); - r_.i16[halfway_point + i] = (a_.i32[quarter_point + i] > INT16_MAX) ? INT16_MAX : ((a_.i32[quarter_point +i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, a_.i32[quarter_point + i])); - r_.i16[halfway_point + i + octet_point] = (b_.i32[quarter_point + i] > INT16_MAX) ? INT16_MAX : ((b_.i32[quarter_point + i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, b_.i32[quarter_point +i])); - r_.i16[halfway_point + quarter_point + i] = (a_.i32[quarter_point + octet_point + i] > INT16_MAX) ? INT16_MAX : ((a_.i32[quarter_point + octet_point + i] < INT16_MIN) ? INT16_MIN : HEDLEY_STATIC_CAST(int16_t, a_.i32[quarter_point + octet_point + i])); - r_.i16[halfway_point + quarter_point + i + octet_point] = (b_.i32[quarter_point + octet_point + i] > INT16_MAX) ? INT16_MAX : ((b_.i32[quarter_point + octet_point + i] < INT16_MIN) ? 
INT16_MIN : HEDLEY_STATIC_CAST(int16_t, b_.i32[quarter_point + octet_point + i])); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_packs_epi32 - #define _mm512_packs_epi32(a, b) simde_mm512_packs_epi32(a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_PACKS_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/packus.h b/ffi-deps/simde/simde/x86/avx512/packus.h deleted file mode 100644 index 3da3b83..0000000 --- a/ffi-deps/simde/simde/x86/avx512/packus.h +++ /dev/null @@ -1,122 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_PACKUS_H) -#define SIMDE_X86_AVX512_PACKUS_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_packus_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_packus_epi16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256i[0] = simde_mm256_packus_epi16(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_packus_epi16(a_.m256i[1], b_.m256i[1]); - #else - const size_t halfway_point = (sizeof(r_.i8) / sizeof(r_.i8[0])) / 2; - const size_t quarter_point = (sizeof(r_.i8) / sizeof(r_.i8[0])) / 4; - const size_t octet_point = (sizeof(r_.i8) / sizeof(r_.i8[0])) / 8; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < octet_point ; i++) { - r_.u8[i] = (a_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[i])); - r_.u8[i + octet_point] = (b_.i16[i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[i])); - r_.u8[quarter_point + i] = (a_.i16[octet_point + i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[octet_point + i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[octet_point + i])); - r_.u8[quarter_point + i + octet_point] = (b_.i16[octet_point + i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[octet_point + i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[octet_point + i])); - r_.u8[halfway_point + i] = (a_.i16[quarter_point + i] > UINT8_MAX) ? 
UINT8_MAX : ((a_.i16[quarter_point + i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[quarter_point + i])); - r_.u8[halfway_point + i + octet_point] = (b_.i16[quarter_point + i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[quarter_point + i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[quarter_point + i])); - r_.u8[halfway_point + quarter_point + i] = (a_.i16[quarter_point + octet_point + i] > UINT8_MAX) ? UINT8_MAX : ((a_.i16[quarter_point + octet_point + i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, a_.i16[quarter_point + octet_point + i])); - r_.u8[halfway_point + quarter_point + i + octet_point] = (b_.i16[quarter_point + octet_point + i] > UINT8_MAX) ? UINT8_MAX : ((b_.i16[quarter_point + octet_point + i] < 0) ? UINT8_C(0) : HEDLEY_STATIC_CAST(uint8_t, b_.i16[quarter_point + octet_point + i])); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_packus_epi16 - #define _mm512_packus_epi16(a, b) simde_mm512_packus_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_packus_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_packus_epi32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(a_.m256i) / sizeof(a_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_packus_epi32(a_.m256i[i], b_.m256i[i]); - } - #else - const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2; - const size_t quarter_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 4; - const size_t octet_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 8; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < octet_point ; i++) { - r_.u16[i] = (a_.i32[i] > UINT16_MAX) ? UINT16_MAX : ((a_.i32[i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[i])); - r_.u16[i + octet_point] = (b_.i32[i] > UINT16_MAX) ? UINT16_MAX : ((b_.i32[i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[i])); - r_.u16[quarter_point + i] = (a_.i32[octet_point + i] > UINT16_MAX) ? UINT16_MAX : ((a_.i32[octet_point + i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[octet_point + i])); - r_.u16[quarter_point + i + octet_point] = (b_.i32[octet_point + i] > UINT16_MAX) ? UINT16_MAX : ((b_.i32[octet_point + i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[octet_point + i])); - r_.u16[halfway_point + i] = (a_.i32[quarter_point + i] > UINT16_MAX) ? UINT16_MAX : ((a_.i32[quarter_point +i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[quarter_point + i])); - r_.u16[halfway_point + i + octet_point] = (b_.i32[quarter_point + i] > UINT16_MAX) ? UINT16_MAX : ((b_.i32[quarter_point + i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[quarter_point +i])); - r_.u16[halfway_point + quarter_point + i] = (a_.i32[quarter_point + octet_point + i] > UINT16_MAX) ? UINT16_MAX : ((a_.i32[quarter_point + octet_point + i] < 0) ? UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, a_.i32[quarter_point + octet_point + i])); - r_.u16[halfway_point + quarter_point + i + octet_point] = (b_.i32[quarter_point + octet_point + i] > UINT16_MAX) ? UINT16_MAX : ((b_.i32[quarter_point + octet_point + i] < 0) ? 
UINT16_C(0) : HEDLEY_STATIC_CAST(uint16_t, b_.i32[quarter_point + octet_point + i])); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_packus_epi32 - #define _mm512_packus_epi32(a, b) simde_mm512_packus_epi32(a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_PACKUS_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/permutex.h b/ffi-deps/simde/simde/x86/avx512/permutex.h deleted file mode 100644 index 91c35cc..0000000 --- a/ffi-deps/simde/simde/x86/avx512/permutex.h +++ /dev/null @@ -1,101 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2023 Michael R. 
Crusoe - */ - -#if !defined(SIMDE_X86_AVX512_PERMUTEX_H) -#define SIMDE_X86_AVX512_PERMUTEX_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_permutex_epi64 (simde__m256i a, const int imm8) { - simde__m256i_private - a_ = simde__m256i_to_private(a), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[(imm8 >> (i*2)) & 3]; - } - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_permutex_epi64(a, imm8) _mm256_permutex_epi64((a), (imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutex_epi64 - #define _mm256_permutex_epi64(a, imm8) simde_mm256_permutex_epi64((a), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_permutex_epi64 (simde__m512i a, const int imm8) { - simde__m512i_private - a_ = simde__m512i_to_private(a), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i_private[0].i64) / sizeof(r_.m256i_private[0].i64[0])) ; i++) { - r_.m256i_private[0].i64[i] = a_.m256i_private[0].i64[(imm8 >> (i*2)) & 3]; - r_.m256i_private[1].i64[i] = a_.m256i_private[1].i64[(imm8 >> (i*2)) & 3]; - } - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_permutex_epi64(a, imm8) _mm512_permutex_epi64((a), (imm8)) -#elif defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_permutex_epi64(a, imm8) SIMDE_STATEMENT_EXPR_(({\ - simde__m512i_private simde_mm512_permutex_epi64_a_ = simde__m512i_to_private((a)), simde_mm512_permutex_epi64_r_; \ - simde_mm512_permutex_epi64_r_.m256i[0] = simde_mm256_permutex_epi64(simde_mm512_permutex_epi64_a_.m256i[0], (imm8)); \ - simde_mm512_permutex_epi64_r_.m256i[1] = simde_mm256_permutex_epi64(simde_mm512_permutex_epi64_a_.m256i[1], (imm8)); \ - simde__m512i_from_private(simde_mm512_permutex_epi64_r_); \ - })) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_permutex_epi64 - #define _mm512_permutex_epi64(a, imm8) simde_mm512_permutex_epi64((a), (imm8)) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_permutex_epi64(src, k, a, imm8) _mm512_mask_permutex_epi64((src), (k), (a), (imm8)) -#else - #define simde_mm512_mask_permutex_epi64(src, k, a, imm8) simde_mm512_mask_mov_epi64((src), (k), simde_mm512_permutex_epi64((a), (imm8))) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_permutex_epi64 - #define _mm512_mask_permutex_epi64(src, k, a, imm8) simde_mm512_mask_permutex_epi64((src), (k), (a), (imm8)) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_PERMUTEX_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/permutex2var.h b/ffi-deps/simde/simde/x86/avx512/permutex2var.h deleted file mode 100644 index b6480c2..0000000 --- a/ffi-deps/simde/simde/x86/avx512/permutex2var.h +++ /dev/null @@ -1,1645 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to 
whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Christopher Moore - */ - -#if !defined(SIMDE_X86_AVX512_PERMUTEX2VAR_H) -#define SIMDE_X86_AVX512_PERMUTEX2VAR_H - -#include "types.h" -#include "and.h" -#include "andnot.h" -#include "blend.h" -#include "mov.h" -#include "or.h" -#include "set1.h" -#include "slli.h" -#include "srli.h" -#include "test.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -/* The following generic code avoids many, nearly identical, repetitions of fairly complex code. - * If the compiler optimizes well, in particular extracting invariant code from loops - * and simplifying code involving constants passed as arguments, it should not be - * significantly slower than specific code. - * Note that when the original vector contains few elements, these implementations - * may not be faster than portable code. - */ -#if defined(SIMDE_X86_SSSE3_NATIVE) || defined(SIMDE_ARM_NEON_A64V8_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_WASM_SIMD128_NATIVE) - #define SIMDE_X_PERMUTEX2VAR_USE_GENERIC -#endif - -#if defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC) -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_permutex2var128 (const simde__m128i *a, const simde__m128i idx, const simde__m128i *b, const unsigned int log2_index_size, const unsigned int log2_data_length) { - const int idx_mask = (1 << (5 - log2_index_size + log2_data_length)) - 1; - - #if defined(SIMDE_X86_SSE3_NATIVE) - __m128i ra, rb, t, test, select, index; - const __m128i sixteen = _mm_set1_epi8(16); - - /* Avoid the mullo intrinsics which have high latency (and the 32-bit one requires SSE4.1) */ - switch (log2_index_size) { - default: /* Avoid uninitialized variable warning/error */ - case 0: - index = _mm_and_si128(idx, _mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, idx_mask))); - break; - case 1: - index = _mm_and_si128(idx, _mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, idx_mask))); - index = _mm_slli_epi32(index, 1); - t = _mm_slli_epi32(index, 8); - index = _mm_or_si128(index, t); - index = _mm_add_epi16(index, _mm_set1_epi16(0x0100)); - break; - case 2: - index = _mm_and_si128(idx, _mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, idx_mask))); - index = _mm_slli_epi32(index, 2); - t = _mm_slli_epi32(index, 8); - index = _mm_or_si128(index, t); - t = _mm_slli_epi32(index, 16); - index = _mm_or_si128(index, t); - index = _mm_add_epi32(index, _mm_set1_epi32(0x03020100)); - break; - } - - test = index; - index = _mm_and_si128(index, _mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, (1 << (4 + log2_data_length)) - 1))); - test = _mm_cmpgt_epi8(test, index); - - ra = _mm_shuffle_epi8(a[0], index); - rb = _mm_shuffle_epi8(b[0], index); - - #if defined(SIMDE_X86_SSE4_1_NATIVE) - SIMDE_VECTORIZE - for (int i = 1 ; i < (1 << log2_data_length) ; i++) { - select = _mm_cmplt_epi8(index, 
sixteen); - index = _mm_sub_epi8(index, sixteen); - ra = _mm_blendv_epi8(_mm_shuffle_epi8(a[i], index), ra, select); - rb = _mm_blendv_epi8(_mm_shuffle_epi8(b[i], index), rb, select); - } - - return _mm_blendv_epi8(ra, rb, test); - #else - SIMDE_VECTORIZE - for (int i = 1 ; i < (1 << log2_data_length) ; i++) { - select = _mm_cmplt_epi8(index, sixteen); - index = _mm_sub_epi8(index, sixteen); - ra = _mm_or_si128(_mm_andnot_si128(select, _mm_shuffle_epi8(a[i], index)), _mm_and_si128(select, ra)); - rb = _mm_or_si128(_mm_andnot_si128(select, _mm_shuffle_epi8(b[i], index)), _mm_and_si128(select, rb)); - } - - return _mm_or_si128(_mm_andnot_si128(test, ra), _mm_and_si128(test, rb)); - #endif - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint8x16_t index, r; - uint16x8_t index16; - uint32x4_t index32; - uint8x16x2_t table2_a, table2_b; - uint8x16x4_t table4_a, table4_b; - - switch (log2_index_size) { - case 0: - index = vandq_u8(simde__m128i_to_neon_u8(idx), vdupq_n_u8(HEDLEY_STATIC_CAST(uint8_t, idx_mask))); - break; - case 1: - index16 = vandq_u16(simde__m128i_to_neon_u16(idx), vdupq_n_u16(HEDLEY_STATIC_CAST(uint16_t, idx_mask))); - index16 = vmulq_n_u16(index16, 0x0202); - index16 = vaddq_u16(index16, vdupq_n_u16(0x0100)); - index = vreinterpretq_u8_u16(index16); - break; - case 2: - index32 = vandq_u32(simde__m128i_to_neon_u32(idx), vdupq_n_u32(HEDLEY_STATIC_CAST(uint32_t, idx_mask))); - index32 = vmulq_n_u32(index32, 0x04040404); - index32 = vaddq_u32(index32, vdupq_n_u32(0x03020100)); - index = vreinterpretq_u8_u32(index32); - break; - } - - uint8x16_t mask = vdupq_n_u8(HEDLEY_STATIC_CAST(uint8_t, (1 << (4 + log2_data_length)) - 1)); - - switch (log2_data_length) { - case 0: - r = vqtbx1q_u8(vqtbl1q_u8(simde__m128i_to_neon_u8(b[0]), vandq_u8(index, mask)), simde__m128i_to_neon_u8(a[0]), index); - break; - case 1: - table2_a.val[0] = simde__m128i_to_neon_u8(a[0]); - table2_a.val[1] = simde__m128i_to_neon_u8(a[1]); - table2_b.val[0] = simde__m128i_to_neon_u8(b[0]); - table2_b.val[1] = simde__m128i_to_neon_u8(b[1]); - r = vqtbx2q_u8(vqtbl2q_u8(table2_b, vandq_u8(index, mask)), table2_a, index); - break; - case 2: - table4_a.val[0] = simde__m128i_to_neon_u8(a[0]); - table4_a.val[1] = simde__m128i_to_neon_u8(a[1]); - table4_a.val[2] = simde__m128i_to_neon_u8(a[2]); - table4_a.val[3] = simde__m128i_to_neon_u8(a[3]); - table4_b.val[0] = simde__m128i_to_neon_u8(b[0]); - table4_b.val[1] = simde__m128i_to_neon_u8(b[1]); - table4_b.val[2] = simde__m128i_to_neon_u8(b[2]); - table4_b.val[3] = simde__m128i_to_neon_u8(b[3]); - r = vqtbx4q_u8(vqtbl4q_u8(table4_b, vandq_u8(index, mask)), table4_a, index); - break; - } - - return simde__m128i_from_neon_u8(r); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) r, ra, rb, t, index, s, thirty_two = vec_splats(HEDLEY_STATIC_CAST(uint8_t, 32)); - SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) index16; - SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) temp32, index32; - SIMDE_POWER_ALTIVEC_VECTOR(SIMDE_POWER_ALTIVEC_BOOL char) select, test; - - switch (log2_index_size) { - default: /* Avoid uninitialized variable warning/error */ - case 0: - index = vec_and(simde__m128i_to_altivec_u8(idx), vec_splats(HEDLEY_STATIC_CAST(uint8_t, idx_mask))); - break; - case 1: - index16 = simde__m128i_to_altivec_u16(idx); - index16 = vec_and(index16, vec_splats(HEDLEY_STATIC_CAST(uint16_t, idx_mask))); - index16 = vec_mladd(index16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0202)), vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0100))); - index 
= HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), index16); - break; - case 2: - index32 = simde__m128i_to_altivec_u32(idx); - index32 = vec_and(index32, vec_splats(HEDLEY_STATIC_CAST(uint32_t, idx_mask))); - - /* Multiply index32 by 0x04040404; unfortunately vec_mul isn't available so (mis)use 16-bit vec_mladd */ - temp32 = vec_sl(index32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 16))); - index32 = vec_add(index32, temp32); - index32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), - vec_mladd(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), index32), - vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0404)), - vec_splat_u16(0))); - - index32 = vec_add(index32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 0x03020100))); - index = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), index32); - break; - } - - if (log2_data_length == 0) { - r = vec_perm(simde__m128i_to_altivec_u8(a[0]), simde__m128i_to_altivec_u8(b[0]), HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), index)); - } - else { - s = index; - index = vec_and(index, vec_splats(HEDLEY_STATIC_CAST(uint8_t, (1 << (4 + log2_data_length)) - 1))); - test = vec_cmpgt(s, index); - - ra = vec_perm(simde__m128i_to_altivec_u8(a[0]), simde__m128i_to_altivec_u8(a[1]), index); - rb = vec_perm(simde__m128i_to_altivec_u8(b[0]), simde__m128i_to_altivec_u8(b[1]), index); - - SIMDE_VECTORIZE - for (int i = 2 ; i < (1 << log2_data_length) ; i += 2) { - select = vec_cmplt(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), index), - HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), thirty_two)); - index = vec_sub(index, thirty_two); - t = vec_perm(simde__m128i_to_altivec_u8(a[i]), simde__m128i_to_altivec_u8(a[i + 1]), index); - ra = vec_sel(t, ra, select); - t = vec_perm(simde__m128i_to_altivec_u8(b[i]), simde__m128i_to_altivec_u8(b[i + 1]), index); - rb = vec_sel(t, rb, select); - } - - r = vec_sel(ra, rb, test); - } - - return simde__m128i_from_altivec_u8(r); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - const v128_t sixteen = wasm_i8x16_splat(16); - - v128_t index = simde__m128i_to_wasm_v128(idx); - - switch (log2_index_size) { - case 0: - index = wasm_v128_and(index, wasm_i8x16_splat(HEDLEY_STATIC_CAST(int8_t, idx_mask))); - break; - case 1: - index = wasm_v128_and(index, wasm_i16x8_splat(HEDLEY_STATIC_CAST(int16_t, idx_mask))); - index = wasm_i16x8_mul(index, wasm_i16x8_splat(0x0202)); - index = wasm_i16x8_add(index, wasm_i16x8_splat(0x0100)); - break; - case 2: - index = wasm_v128_and(index, wasm_i32x4_splat(HEDLEY_STATIC_CAST(int32_t, idx_mask))); - index = wasm_i32x4_mul(index, wasm_i32x4_splat(0x04040404)); - index = wasm_i32x4_add(index, wasm_i32x4_splat(0x03020100)); - break; - } - - v128_t r = wasm_i8x16_swizzle(simde__m128i_to_wasm_v128(a[0]), index); - - SIMDE_VECTORIZE - for (int i = 1 ; i < (1 << log2_data_length) ; i++) { - index = wasm_i8x16_sub(index, sixteen); - r = wasm_v128_or(r, wasm_i8x16_swizzle(simde__m128i_to_wasm_v128(a[i]), index)); - } - - SIMDE_VECTORIZE - for (int i = 0 ; i < (1 << log2_data_length) ; i++) { - index = wasm_i8x16_sub(index, sixteen); - r = wasm_v128_or(r, wasm_i8x16_swizzle(simde__m128i_to_wasm_v128(b[i]), index)); - } - - return simde__m128i_from_wasm_v128(r); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_x_permutex2var (simde__m128i *r, const simde__m128i *a, const simde__m128i *idx, const simde__m128i *b, const unsigned int log2_index_size, const unsigned int log2_data_length) 
{ - SIMDE_VECTORIZE - for (int i = 0 ; i < (1 << log2_data_length) ; i++) { - r[i] = simde_x_permutex2var128(a, idx[i], b, log2_index_size, log2_data_length); - } -} -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_permutex2var_epi16 (simde__m128i a, simde__m128i idx, simde__m128i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_permutex2var_epi16(a, idx, b); - #elif defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC) - simde__m128i r; - - simde_x_permutex2var(&r, &a, &idx, &b, 1, 0); - - return r; - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - idx_ = simde__m128i_to_private(idx), - b_ = simde__m128i_to_private(b), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = ((idx_.i16[i] & 8) ? b_ : a_).i16[idx_.i16[i] & 7]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_permutex2var_epi16 - #define _mm_permutex2var_epi16(a, idx, b) simde_mm_permutex2var_epi16(a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_permutex2var_epi16 (simde__m128i a, simde__mmask8 k, simde__m128i idx, simde__m128i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_permutex2var_epi16(a, k, idx, b); - #else - return simde_mm_mask_mov_epi16(a, k, simde_mm_permutex2var_epi16(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_permutex2var_epi16 -#define _mm_mask_permutex2var_epi16(a, k, idx, b) simde_mm_mask_permutex2var_epi16(a, k, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask2_permutex2var_epi16 (simde__m128i a, simde__m128i idx, simde__mmask8 k, simde__m128i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask2_permutex2var_epi16(a, idx, k, b); - #else - return simde_mm_mask_mov_epi16(idx, k, simde_mm_permutex2var_epi16(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask2_permutex2var_epi16 -#define _mm_mask2_permutex2var_epi16(a, idx, k, b) simde_mm_mask2_permutex2var_epi16(a, idx, k, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_permutex2var_epi16 (simde__mmask8 k, simde__m128i a, simde__m128i idx, simde__m128i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_permutex2var_epi16(k, a, idx, b); - #else - return simde_mm_maskz_mov_epi16(k, simde_mm_permutex2var_epi16(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_permutex2var_epi16 -#define _mm_maskz_permutex2var_epi16(k, a, idx, b) simde_mm_maskz_permutex2var_epi16(k, a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_permutex2var_epi32 (simde__m128i a, simde__m128i idx, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_permutex2var_epi32(a, idx, b); - #elif defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC) /* This may not be faster than the portable version */ - simde__m128i r; - - simde_x_permutex2var(&r, &a, &idx, &b, 2, 0); - - return r; - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - idx_ = 
simde__m128i_to_private(idx), - b_ = simde__m128i_to_private(b), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ((idx_.i32[i] & 4) ? b_ : a_).i32[idx_.i32[i] & 3]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_permutex2var_epi32 - #define _mm_permutex2var_epi32(a, idx, b) simde_mm_permutex2var_epi32(a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_permutex2var_epi32 (simde__m128i a, simde__mmask8 k, simde__m128i idx, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_permutex2var_epi32(a, k, idx, b); - #else - return simde_mm_mask_mov_epi32(a, k, simde_mm_permutex2var_epi32(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_permutex2var_epi32 -#define _mm_mask_permutex2var_epi32(a, k, idx, b) simde_mm_mask_permutex2var_epi32(a, k, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask2_permutex2var_epi32 (simde__m128i a, simde__m128i idx, simde__mmask8 k, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask2_permutex2var_epi32(a, idx, k, b); - #else - return simde_mm_mask_mov_epi32(idx, k, simde_mm_permutex2var_epi32(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask2_permutex2var_epi32 -#define _mm_mask2_permutex2var_epi32(a, idx, k, b) simde_mm_mask2_permutex2var_epi32(a, idx, k, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_permutex2var_epi32 (simde__mmask8 k, simde__m128i a, simde__m128i idx, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_permutex2var_epi32(k, a, idx, b); - #else - return simde_mm_maskz_mov_epi32(k, simde_mm_permutex2var_epi32(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_permutex2var_epi32 -#define _mm_maskz_permutex2var_epi32(k, a, idx, b) simde_mm_maskz_permutex2var_epi32(k, a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_permutex2var_epi64 (simde__m128i a, simde__m128i idx, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_permutex2var_epi64(a, idx, b); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - idx_ = simde__m128i_to_private(idx), - b_ = simde__m128i_to_private(b), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = ((idx_.i64[i] & 2) ? 
b_ : a_).i64[idx_.i64[i] & 1]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_permutex2var_epi64 - #define _mm_permutex2var_epi64(a, idx, b) simde_mm_permutex2var_epi64(a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_permutex2var_epi64 (simde__m128i a, simde__mmask8 k, simde__m128i idx, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_permutex2var_epi64(a, k, idx, b); - #else - return simde_mm_mask_mov_epi64(a, k, simde_mm_permutex2var_epi64(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_permutex2var_epi64 -#define _mm_mask_permutex2var_epi64(a, k, idx, b) simde_mm_mask_permutex2var_epi64(a, k, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask2_permutex2var_epi64 (simde__m128i a, simde__m128i idx, simde__mmask8 k, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask2_permutex2var_epi64(a, idx, k, b); - #else - return simde_mm_mask_mov_epi64(idx, k, simde_mm_permutex2var_epi64(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask2_permutex2var_epi64 -#define _mm_mask2_permutex2var_epi64(a, idx, k, b) simde_mm_mask2_permutex2var_epi64(a, idx, k, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_permutex2var_epi64 (simde__mmask8 k, simde__m128i a, simde__m128i idx, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_permutex2var_epi64(k, a, idx, b); - #else - return simde_mm_maskz_mov_epi64(k, simde_mm_permutex2var_epi64(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_permutex2var_epi64 -#define _mm_maskz_permutex2var_epi64(k, a, idx, b) simde_mm_maskz_permutex2var_epi64(k, a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_permutex2var_epi8 (simde__m128i a, simde__m128i idx, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_permutex2var_epi8(a, idx, b); - #elif defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cvtepi32_epi8(_mm512_permutex2var_epi32(_mm512_cvtepu8_epi32(a), _mm512_cvtepu8_epi32(idx), _mm512_cvtepu8_epi32(b))); - #elif defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC) - simde__m128i r; - - simde_x_permutex2var(&r, &a, &idx, &b, 0, 0); - - return r; - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - idx_ = simde__m128i_to_private(idx), - b_ = simde__m128i_to_private(b), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = ((idx_.i8[i] & 0x10) ? 
b_ : a_).i8[idx_.i8[i] & 0x0F]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_permutex2var_epi8 - #define _mm_permutex2var_epi8(a, idx, b) simde_mm_permutex2var_epi8(a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_permutex2var_epi8 (simde__m128i a, simde__mmask16 k, simde__m128i idx, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_permutex2var_epi8(a, k, idx, b); - #else - return simde_mm_mask_mov_epi8(a, k, simde_mm_permutex2var_epi8(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_permutex2var_epi8 -#define _mm_mask_permutex2var_epi8(a, k, idx, b) simde_mm_mask_permutex2var_epi8(a, k, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask2_permutex2var_epi8 (simde__m128i a, simde__m128i idx, simde__mmask16 k, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask2_permutex2var_epi8(a, idx, k, b); - #else - return simde_mm_mask_mov_epi8(idx, k, simde_mm_permutex2var_epi8(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask2_permutex2var_epi8 -#define _mm_mask2_permutex2var_epi8(a, idx, k, b) simde_mm_mask2_permutex2var_epi8(a, idx, k, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_permutex2var_epi8 (simde__mmask16 k, simde__m128i a, simde__m128i idx, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_permutex2var_epi8(k, a, idx, b); - #else - return simde_mm_maskz_mov_epi8(k, simde_mm_permutex2var_epi8(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_permutex2var_epi8 -#define _mm_maskz_permutex2var_epi8(k, a, idx, b) simde_mm_maskz_permutex2var_epi8(k, a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_permutex2var_pd (simde__m128d a, simde__m128i idx, simde__m128d b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_permutex2var_pd(a, idx, b); - #else - return simde_mm_castsi128_pd(simde_mm_permutex2var_epi64(simde_mm_castpd_si128(a), idx, simde_mm_castpd_si128(b))); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_permutex2var_pd - #define _mm_permutex2var_pd(a, idx, b) simde_mm_permutex2var_pd(a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mask_permutex2var_pd (simde__m128d a, simde__mmask8 k, simde__m128i idx, simde__m128d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_permutex2var_pd(a, k, idx, b); - #else - return simde_mm_mask_mov_pd(a, k, simde_mm_permutex2var_pd(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_permutex2var_pd -#define _mm_mask_permutex2var_pd(a, k, idx, b) simde_mm_mask_permutex2var_pd(a, k, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mask2_permutex2var_pd (simde__m128d a, simde__m128i idx, simde__mmask8 k, 
simde__m128d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask2_permutex2var_pd(a, idx, k, b); - #else - return simde_mm_mask_mov_pd(simde_mm_castsi128_pd(idx), k, simde_mm_permutex2var_pd(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask2_permutex2var_pd -#define _mm_mask2_permutex2var_pd(a, idx, k, b) simde_mm_mask2_permutex2var_pd(a, idx, k, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_maskz_permutex2var_pd (simde__mmask8 k, simde__m128d a, simde__m128i idx, simde__m128d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_permutex2var_pd(k, a, idx, b); - #else - return simde_mm_maskz_mov_pd(k, simde_mm_permutex2var_pd(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_permutex2var_pd -#define _mm_maskz_permutex2var_pd(k, a, idx, b) simde_mm_maskz_permutex2var_pd(k, a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_permutex2var_ps (simde__m128 a, simde__m128i idx, simde__m128 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_permutex2var_ps(a, idx, b); - #else - return simde_mm_castsi128_ps(simde_mm_permutex2var_epi32(simde_mm_castps_si128(a), idx, simde_mm_castps_si128(b))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_permutex2var_ps - #define _mm_permutex2var_ps(a, idx, b) simde_mm_permutex2var_ps(a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mask_permutex2var_ps (simde__m128 a, simde__mmask8 k, simde__m128i idx, simde__m128 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_permutex2var_ps(a, k, idx, b); - #else - return simde_mm_mask_mov_ps(a, k, simde_mm_permutex2var_ps(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_permutex2var_ps -#define _mm_mask_permutex2var_ps(a, k, idx, b) simde_mm_mask_permutex2var_ps(a, k, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mask2_permutex2var_ps (simde__m128 a, simde__m128i idx, simde__mmask8 k, simde__m128 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask2_permutex2var_ps(a, idx, k, b); - #else - return simde_mm_mask_mov_ps(simde_mm_castsi128_ps(idx), k, simde_mm_permutex2var_ps(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask2_permutex2var_ps -#define _mm_mask2_permutex2var_ps(a, idx, k, b) simde_mm_mask2_permutex2var_ps(a, idx, k, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_maskz_permutex2var_ps (simde__mmask8 k, simde__m128 a, simde__m128i idx, simde__m128 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_permutex2var_ps(k, a, idx, b); - #else - return simde_mm_maskz_mov_ps(k, simde_mm_permutex2var_ps(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_permutex2var_ps -#define _mm_maskz_permutex2var_ps(k, a, idx, b) simde_mm_maskz_permutex2var_ps(k, a, idx, b) 
-#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_permutex2var_epi16 (simde__m256i a, simde__m256i idx, simde__m256i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_permutex2var_epi16(a, idx, b); - #elif defined(SIMDE_X86_AVX2_NATIVE) - __m256i hilo, hilo2, hi, lo, idx2, ta, tb, select; - const __m256i ones = _mm256_set1_epi16(1); - - idx2 = _mm256_srli_epi32(idx, 1); - - ta = _mm256_permutevar8x32_epi32(a, idx2); - tb = _mm256_permutevar8x32_epi32(b, idx2); - select = _mm256_slli_epi32(idx2, 28); - hilo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(ta), - _mm256_castsi256_ps(tb), - _mm256_castsi256_ps(select))); - idx2 = _mm256_srli_epi32(idx2, 16); - - ta = _mm256_permutevar8x32_epi32(a, idx2); - tb = _mm256_permutevar8x32_epi32(b, idx2); - select = _mm256_slli_epi32(idx2, 28); - hilo2 = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(ta), - _mm256_castsi256_ps(tb), - _mm256_castsi256_ps(select))); - - lo = _mm256_blend_epi16(_mm256_slli_epi32(hilo2, 16), hilo, 0x55); - hi = _mm256_blend_epi16(hilo2, _mm256_srli_epi32(hilo, 16), 0x55); - - select = _mm256_cmpeq_epi16(_mm256_and_si256(idx, ones), ones); - return _mm256_blendv_epi8(lo, hi, select); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - idx_ = simde__m256i_to_private(idx), - b_ = simde__m256i_to_private(b), - r_; - - #if defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC) - simde_x_permutex2var(r_.m128i, a_.m128i, idx_.m128i, b_.m128i, 1, 1); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = ((idx_.i16[i] & 0x10) ? b_ : a_).i16[idx_.i16[i] & 0x0F]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutex2var_epi16 - #define _mm256_permutex2var_epi16(a, idx, b) simde_mm256_permutex2var_epi16(a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_permutex2var_epi16 (simde__m256i a, simde__mmask16 k, simde__m256i idx, simde__m256i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_permutex2var_epi16(a, k, idx, b); - #else - return simde_mm256_mask_mov_epi16(a, k, simde_mm256_permutex2var_epi16(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_permutex2var_epi16 -#define _mm256_mask_permutex2var_epi16(a, k, idx, b) simde_mm256_mask_permutex2var_epi16(a, k, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask2_permutex2var_epi16 (simde__m256i a, simde__m256i idx, simde__mmask16 k, simde__m256i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask2_permutex2var_epi16(a, idx, k, b); - #else - return simde_mm256_mask_mov_epi16(idx, k, simde_mm256_permutex2var_epi16(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask2_permutex2var_epi16 -#define _mm256_mask2_permutex2var_epi16(a, idx, k, b) simde_mm256_mask2_permutex2var_epi16(a, idx, k, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_permutex2var_epi16 (simde__mmask16 k, simde__m256i a, simde__m256i idx, simde__m256i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return 
_mm256_maskz_permutex2var_epi16(k, a, idx, b); - #else - return simde_mm256_maskz_mov_epi16(k, simde_mm256_permutex2var_epi16(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_permutex2var_epi16 -#define _mm256_maskz_permutex2var_epi16(k, a, idx, b) simde_mm256_maskz_permutex2var_epi16(k, a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_permutex2var_epi32 (simde__m256i a, simde__m256i idx, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_permutex2var_epi32(a, idx, b); - #elif defined(SIMDE_X86_AVX2_NATIVE) - __m256i ta, tb, select; - ta = _mm256_permutevar8x32_epi32(a, idx); - tb = _mm256_permutevar8x32_epi32(b, idx); - select = _mm256_slli_epi32(idx, 28); - return _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(ta), - _mm256_castsi256_ps(tb), - _mm256_castsi256_ps(select))); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - idx_ = simde__m256i_to_private(idx), - b_ = simde__m256i_to_private(b), - r_; - - #if defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC) - simde_x_permutex2var(r_.m128i, a_.m128i, idx_.m128i, b_.m128i, 2, 1); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ((idx_.i32[i] & 8) ? b_ : a_).i32[idx_.i32[i] & 7]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutex2var_epi32 - #define _mm256_permutex2var_epi32(a, idx, b) simde_mm256_permutex2var_epi32(a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_permutex2var_epi32 (simde__m256i a, simde__mmask8 k, simde__m256i idx, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_permutex2var_epi32(a, k, idx, b); - #else - return simde_mm256_mask_mov_epi32(a, k, simde_mm256_permutex2var_epi32(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_permutex2var_epi32 -#define _mm256_mask_permutex2var_epi32(a, k, idx, b) simde_mm256_mask_permutex2var_epi32(a, k, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask2_permutex2var_epi32 (simde__m256i a, simde__m256i idx, simde__mmask8 k, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask2_permutex2var_epi32(a, idx, k, b); - #else - return simde_mm256_mask_mov_epi32(idx, k, simde_mm256_permutex2var_epi32(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask2_permutex2var_epi32 -#define _mm256_mask2_permutex2var_epi32(a, idx, k, b) simde_mm256_mask2_permutex2var_epi32(a, idx, k, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_permutex2var_epi32 (simde__mmask8 k, simde__m256i a, simde__m256i idx, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_permutex2var_epi32(k, a, idx, b); - #else - return simde_mm256_maskz_mov_epi32(k, simde_mm256_permutex2var_epi32(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef 
_mm256_maskz_permutex2var_epi32 -#define _mm256_maskz_permutex2var_epi32(k, a, idx, b) simde_mm256_maskz_permutex2var_epi32(k, a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_permutex2var_epi64 (simde__m256i a, simde__m256i idx, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_permutex2var_epi64(a, idx, b); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - idx_ = simde__m256i_to_private(idx), - b_ = simde__m256i_to_private(b), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = ((idx_.i64[i] & 4) ? b_ : a_).i64[idx_.i64[i] & 3]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutex2var_epi64 - #define _mm256_permutex2var_epi64(a, idx, b) simde_mm256_permutex2var_epi64(a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_permutex2var_epi64 (simde__m256i a, simde__mmask8 k, simde__m256i idx, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_permutex2var_epi64(a, k, idx, b); - #else - return simde_mm256_mask_mov_epi64(a, k, simde_mm256_permutex2var_epi64(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_permutex2var_epi64 -#define _mm256_mask_permutex2var_epi64(a, k, idx, b) simde_mm256_mask_permutex2var_epi64(a, k, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask2_permutex2var_epi64 (simde__m256i a, simde__m256i idx, simde__mmask8 k, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask2_permutex2var_epi64(a, idx, k, b); - #else - return simde_mm256_mask_mov_epi64(idx, k, simde_mm256_permutex2var_epi64(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask2_permutex2var_epi64 -#define _mm256_mask2_permutex2var_epi64(a, idx, k, b) simde_mm256_mask2_permutex2var_epi64(a, idx, k, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_permutex2var_epi64 (simde__mmask8 k, simde__m256i a, simde__m256i idx, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_permutex2var_epi64(k, a, idx, b); - #else - return simde_mm256_maskz_mov_epi64(k, simde_mm256_permutex2var_epi64(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_permutex2var_epi64 -#define _mm256_maskz_permutex2var_epi64(k, a, idx, b) simde_mm256_maskz_permutex2var_epi64(k, a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_permutex2var_epi8 (simde__m256i a, simde__m256i idx, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_permutex2var_epi8(a, idx, b); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_cvtepi16_epi8(_mm512_permutex2var_epi16(_mm512_cvtepu8_epi16(a), _mm512_cvtepu8_epi16(idx), _mm512_cvtepu8_epi16(b))); - #elif defined(SIMDE_X86_AVX2_NATIVE) - __m256i t0, t1, index, select0x10, select0x20, a01, b01; - const __m256i mask = _mm256_set1_epi8(0x3F); - const __m256i a0 = 
_mm256_permute4x64_epi64(a, (1 << 6) + (0 << 4) + (1 << 2) + (0 << 0)); - const __m256i a1 = _mm256_permute4x64_epi64(a, (3 << 6) + (2 << 4) + (3 << 2) + (2 << 0)); - const __m256i b0 = _mm256_permute4x64_epi64(b, (1 << 6) + (0 << 4) + (1 << 2) + (0 << 0)); - const __m256i b1 = _mm256_permute4x64_epi64(b, (3 << 6) + (2 << 4) + (3 << 2) + (2 << 0)); - - index = _mm256_and_si256(idx, mask); - t0 = _mm256_shuffle_epi8(a0, index); - t1 = _mm256_shuffle_epi8(a1, index); - select0x10 = _mm256_slli_epi64(index, 3); - a01 = _mm256_blendv_epi8(t0, t1, select0x10); - t0 = _mm256_shuffle_epi8(b0, index); - t1 = _mm256_shuffle_epi8(b1, index); - b01 = _mm256_blendv_epi8(t0, t1, select0x10); - select0x20 = _mm256_slli_epi64(index, 2); - return _mm256_blendv_epi8(a01, b01, select0x20); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - idx_ = simde__m256i_to_private(idx), - b_ = simde__m256i_to_private(b), - r_; - - #if defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC) - simde_x_permutex2var(r_.m128i, a_.m128i, idx_.m128i, b_.m128i, 0, 1); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = ((idx_.i8[i] & 0x20) ? b_ : a_).i8[idx_.i8[i] & 0x1F]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutex2var_epi8 - #define _mm256_permutex2var_epi8(a, idx, b) simde_mm256_permutex2var_epi8(a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_permutex2var_epi8 (simde__m256i a, simde__mmask32 k, simde__m256i idx, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_permutex2var_epi8(a, k, idx, b); - #else - return simde_mm256_mask_mov_epi8(a, k, simde_mm256_permutex2var_epi8(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_permutex2var_epi8 -#define _mm256_mask_permutex2var_epi8(a, k, idx, b) simde_mm256_mask_permutex2var_epi8(a, k, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask2_permutex2var_epi8 (simde__m256i a, simde__m256i idx, simde__mmask32 k, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask2_permutex2var_epi8(a, idx, k, b); - #else - return simde_mm256_mask_mov_epi8(idx, k, simde_mm256_permutex2var_epi8(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask2_permutex2var_epi8 -#define _mm256_mask2_permutex2var_epi8(a, idx, k, b) simde_mm256_mask2_permutex2var_epi8(a, idx, k, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_permutex2var_epi8 (simde__mmask32 k, simde__m256i a, simde__m256i idx, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_permutex2var_epi8(k, a, idx, b); - #else - return simde_mm256_maskz_mov_epi8(k, simde_mm256_permutex2var_epi8(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_permutex2var_epi8 -#define _mm256_maskz_permutex2var_epi8(k, a, idx, b) simde_mm256_maskz_permutex2var_epi8(k, a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_permutex2var_pd 
(simde__m256d a, simde__m256i idx, simde__m256d b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_permutex2var_pd(a, idx, b); - #else - return simde_mm256_castsi256_pd(simde_mm256_permutex2var_epi64(simde_mm256_castpd_si256(a), idx, simde_mm256_castpd_si256(b))); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutex2var_pd - #define _mm256_permutex2var_pd(a, idx, b) simde_mm256_permutex2var_pd(a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_mask_permutex2var_pd (simde__m256d a, simde__mmask8 k, simde__m256i idx, simde__m256d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_permutex2var_pd(a, k, idx, b); - #else - return simde_mm256_mask_mov_pd(a, k, simde_mm256_permutex2var_pd(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_permutex2var_pd -#define _mm256_mask_permutex2var_pd(a, k, idx, b) simde_mm256_mask_permutex2var_pd(a, k, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_mask2_permutex2var_pd (simde__m256d a, simde__m256i idx, simde__mmask8 k, simde__m256d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask2_permutex2var_pd(a, idx, k, b); - #else - return simde_mm256_mask_mov_pd(simde_mm256_castsi256_pd(idx), k, simde_mm256_permutex2var_pd(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask2_permutex2var_pd -#define _mm256_mask2_permutex2var_pd(a, idx, k, b) simde_mm256_mask2_permutex2var_pd(a, idx, k, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_maskz_permutex2var_pd (simde__mmask8 k, simde__m256d a, simde__m256i idx, simde__m256d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_permutex2var_pd(k, a, idx, b); - #else - return simde_mm256_maskz_mov_pd(k, simde_mm256_permutex2var_pd(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_permutex2var_pd -#define _mm256_maskz_permutex2var_pd(k, a, idx, b) simde_mm256_maskz_permutex2var_pd(k, a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_permutex2var_ps (simde__m256 a, simde__m256i idx, simde__m256 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_permutex2var_ps(a, idx, b); - #else - return simde_mm256_castsi256_ps(simde_mm256_permutex2var_epi32(simde_mm256_castps_si256(a), idx, simde_mm256_castps_si256(b))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutex2var_ps - #define _mm256_permutex2var_ps(a, idx, b) simde_mm256_permutex2var_ps(a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_mask_permutex2var_ps (simde__m256 a, simde__mmask8 k, simde__m256i idx, simde__m256 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_permutex2var_ps(a, k, idx, b); - #else - return simde_mm256_mask_mov_ps(a, k, simde_mm256_permutex2var_ps(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || 
defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_permutex2var_ps -#define _mm256_mask_permutex2var_ps(a, k, idx, b) simde_mm256_mask_permutex2var_ps(a, k, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_mask2_permutex2var_ps (simde__m256 a, simde__m256i idx, simde__mmask8 k, simde__m256 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask2_permutex2var_ps(a, idx, k, b); - #else - return simde_mm256_mask_mov_ps(simde_mm256_castsi256_ps(idx), k, simde_mm256_permutex2var_ps(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask2_permutex2var_ps -#define _mm256_mask2_permutex2var_ps(a, idx, k, b) simde_mm256_mask2_permutex2var_ps(a, idx, k, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_maskz_permutex2var_ps (simde__mmask8 k, simde__m256 a, simde__m256i idx, simde__m256 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_permutex2var_ps(k, a, idx, b); - #else - return simde_mm256_maskz_mov_ps(k, simde_mm256_permutex2var_ps(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_permutex2var_ps -#define _mm256_maskz_permutex2var_ps(k, a, idx, b) simde_mm256_maskz_permutex2var_ps(k, a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_permutex2var_epi16 (simde__m512i a, simde__m512i idx, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_permutex2var_epi16(a, idx, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - idx_ = simde__m512i_to_private(idx), - b_ = simde__m512i_to_private(b), - r_; - - #if defined(SIMDE_X86_AVX2_NATIVE) - __m256i hilo, hilo1, hilo2, hi, lo, idx1, idx2, ta, tb, select; - const __m256i ones = _mm256_set1_epi16(1); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i_private) / sizeof(r_.m256i_private[0])) ; i++) { - idx1 = idx_.m256i[i]; - idx2 = _mm256_srli_epi32(idx1, 1); - - select = _mm256_slli_epi32(idx2, 27); - ta = _mm256_permutevar8x32_epi32(a_.m256i[0], idx2); - tb = _mm256_permutevar8x32_epi32(b_.m256i[0], idx2); - hilo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(ta), - _mm256_castsi256_ps(tb), - _mm256_castsi256_ps(select))); - ta = _mm256_permutevar8x32_epi32(a_.m256i[1], idx2); - tb = _mm256_permutevar8x32_epi32(b_.m256i[1], idx2); - hilo1 = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(ta), - _mm256_castsi256_ps(tb), - _mm256_castsi256_ps(select))); - select = _mm256_add_epi32(select, select); - hilo1 = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(hilo), - _mm256_castsi256_ps(hilo1), - _mm256_castsi256_ps(select))); - - idx2 = _mm256_srli_epi32(idx2, 16); - - select = _mm256_slli_epi32(idx2, 27); - ta = _mm256_permutevar8x32_epi32(a_.m256i[0], idx2); - tb = _mm256_permutevar8x32_epi32(b_.m256i[0], idx2); - hilo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(ta), - _mm256_castsi256_ps(tb), - _mm256_castsi256_ps(select))); - ta = _mm256_permutevar8x32_epi32(a_.m256i[1], idx2); - tb = _mm256_permutevar8x32_epi32(b_.m256i[1], idx2); - hilo2 = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(ta), - _mm256_castsi256_ps(tb), - _mm256_castsi256_ps(select))); - select = _mm256_add_epi32(select, select); - hilo2 = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(hilo), - 
_mm256_castsi256_ps(hilo2), - _mm256_castsi256_ps(select))); - - lo = _mm256_blend_epi16(_mm256_slli_epi32(hilo2, 16), hilo1, 0x55); - hi = _mm256_blend_epi16(hilo2, _mm256_srli_epi32(hilo1, 16), 0x55); - - select = _mm256_cmpeq_epi16(_mm256_and_si256(idx1, ones), ones); - r_.m256i[i] = _mm256_blendv_epi8(lo, hi, select); - } - #elif defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC) - simde_x_permutex2var(r_.m128i, a_.m128i, idx_.m128i, b_.m128i, 1, 2); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = ((idx_.i16[i] & 0x20) ? b_ : a_).i16[idx_.i16[i] & 0x1F]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm512_permutex2var_epi16 - #define _mm512_permutex2var_epi16(a, idx, b) simde_mm512_permutex2var_epi16(a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_permutex2var_epi16 (simde__m512i a, simde__mmask32 k, simde__m512i idx, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_permutex2var_epi16(a, k, idx, b); - #else - return simde_mm512_mask_mov_epi16(a, k, simde_mm512_permutex2var_epi16(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_permutex2var_epi16 -#define _mm512_mask_permutex2var_epi16(a, k, idx, b) simde_mm512_mask_permutex2var_epi16(a, k, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask2_permutex2var_epi16 (simde__m512i a, simde__m512i idx, simde__mmask32 k, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask2_permutex2var_epi16(a, idx, k, b); - #else - return simde_mm512_mask_mov_epi16(idx, k, simde_mm512_permutex2var_epi16(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask2_permutex2var_epi16 -#define _mm512_mask2_permutex2var_epi16(a, idx, k, b) simde_mm512_mask2_permutex2var_epi16(a, idx, k, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_permutex2var_epi16 (simde__mmask32 k, simde__m512i a, simde__m512i idx, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_permutex2var_epi16(k, a, idx, b); - #else - return simde_mm512_maskz_mov_epi16(k, simde_mm512_permutex2var_epi16(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_permutex2var_epi16 -#define _mm512_maskz_permutex2var_epi16(k, a, idx, b) simde_mm512_maskz_permutex2var_epi16(k, a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_permutex2var_epi32 (simde__m512i a, simde__m512i idx, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_permutex2var_epi32(a, idx, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - idx_ = simde__m512i_to_private(idx), - b_ = simde__m512i_to_private(b), - r_; - - #if defined(SIMDE_X86_AVX2_NATIVE) - __m256i index, t0, t1, a01, b01, select; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i_private) / sizeof(r_.m256i_private[0])) ; i++) { - index = idx_.m256i[i]; - t0 = _mm256_permutevar8x32_epi32(a_.m256i[0], index); - t1 = _mm256_permutevar8x32_epi32(a_.m256i[1], index); - select = _mm256_slli_epi32(index, 28); - a01 = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(t0), - _mm256_castsi256_ps(t1), - _mm256_castsi256_ps(select))); - t0 = _mm256_permutevar8x32_epi32(b_.m256i[0], index); - t1 = 
_mm256_permutevar8x32_epi32(b_.m256i[1], index); - b01 = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(t0), - _mm256_castsi256_ps(t1), - _mm256_castsi256_ps(select))); - select = _mm256_slli_epi32(index, 27); - r_.m256i[i] = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a01), - _mm256_castsi256_ps(b01), - _mm256_castsi256_ps(select))); - } - #elif defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC) - simde_x_permutex2var(r_.m128i, a_.m128i, idx_.m128i, b_.m128i, 2, 2); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ((idx_.i32[i] & 0x10) ? b_ : a_).i32[idx_.i32[i] & 0x0F]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_permutex2var_epi32 - #define _mm512_permutex2var_epi32(a, idx, b) simde_mm512_permutex2var_epi32(a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_permutex2var_epi32 (simde__m512i a, simde__mmask16 k, simde__m512i idx, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_permutex2var_epi32(a, k, idx, b); - #else - return simde_mm512_mask_mov_epi32(a, k, simde_mm512_permutex2var_epi32(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_permutex2var_epi32 -#define _mm512_mask_permutex2var_epi32(a, k, idx, b) simde_mm512_mask_permutex2var_epi32(a, k, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask2_permutex2var_epi32 (simde__m512i a, simde__m512i idx, simde__mmask16 k, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask2_permutex2var_epi32(a, idx, k, b); - #else - return simde_mm512_mask_mov_epi32(idx, k, simde_mm512_permutex2var_epi32(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask2_permutex2var_epi32 -#define _mm512_mask2_permutex2var_epi32(a, idx, k, b) simde_mm512_mask2_permutex2var_epi32(a, idx, k, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_permutex2var_epi32 (simde__mmask16 k, simde__m512i a, simde__m512i idx, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_permutex2var_epi32(k, a, idx, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_permutex2var_epi32(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_permutex2var_epi32 -#define _mm512_maskz_permutex2var_epi32(k, a, idx, b) simde_mm512_maskz_permutex2var_epi32(k, a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_permutex2var_epi64 (simde__m512i a, simde__m512i idx, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_permutex2var_epi64(a, idx, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - idx_ = simde__m512i_to_private(idx), - b_ = simde__m512i_to_private(b), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = ((idx_.i64[i] & 8) ? 
b_ : a_).i64[idx_.i64[i] & 7]; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_permutex2var_epi64 - #define _mm512_permutex2var_epi64(a, idx, b) simde_mm512_permutex2var_epi64(a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_permutex2var_epi64 (simde__m512i a, simde__mmask8 k, simde__m512i idx, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_permutex2var_epi64(a, k, idx, b); - #else - return simde_mm512_mask_mov_epi64(a, k, simde_mm512_permutex2var_epi64(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_permutex2var_epi64 -#define _mm512_mask_permutex2var_epi64(a, k, idx, b) simde_mm512_mask_permutex2var_epi64(a, k, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask2_permutex2var_epi64 (simde__m512i a, simde__m512i idx, simde__mmask8 k, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask2_permutex2var_epi64(a, idx, k, b); - #else - return simde_mm512_mask_mov_epi64(idx, k, simde_mm512_permutex2var_epi64(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask2_permutex2var_epi64 -#define _mm512_mask2_permutex2var_epi64(a, idx, k, b) simde_mm512_mask2_permutex2var_epi64(a, idx, k, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_permutex2var_epi64 (simde__mmask8 k, simde__m512i a, simde__m512i idx, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_permutex2var_epi64(k, a, idx, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_permutex2var_epi64(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_permutex2var_epi64 -#define _mm512_maskz_permutex2var_epi64(k, a, idx, b) simde_mm512_maskz_permutex2var_epi64(k, a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_permutex2var_epi8 (simde__m512i a, simde__m512i idx, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) - return _mm512_permutex2var_epi8(a, idx, b); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - __m512i hilo, hi, lo, hi2, lo2, idx2; - const __m512i ones = _mm512_set1_epi8(1); - const __m512i low_bytes = _mm512_set1_epi16(0x00FF); - - idx2 = _mm512_srli_epi16(idx, 1); - hilo = _mm512_permutex2var_epi16(a, idx2, b); - __mmask64 mask = _mm512_test_epi8_mask(idx, ones); - lo = _mm512_and_si512(hilo, low_bytes); - hi = _mm512_srli_epi16(hilo, 8); - - idx2 = _mm512_srli_epi16(idx, 9); - hilo = _mm512_permutex2var_epi16(a, idx2, b); - lo2 = _mm512_slli_epi16(hilo, 8); - hi2 = _mm512_andnot_si512(low_bytes, hilo); - - lo = _mm512_or_si512(lo, lo2); - hi = _mm512_or_si512(hi, hi2); - - return _mm512_mask_blend_epi8(mask, lo, hi); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - idx_ = simde__m512i_to_private(idx), - b_ = simde__m512i_to_private(b), - r_; - - #if defined(SIMDE_X86_AVX2_NATIVE) - __m256i t0, t1, index, select0x10, select0x20, select0x40, t01, t23, a0123, b0123; - const __m256i mask = _mm256_set1_epi8(0x7F); - const __m256i a0 = _mm256_permute4x64_epi64(a_.m256i[0], (1 << 6) + (0 << 4) + (1 << 2) + (0 << 0)); - const __m256i a1 = _mm256_permute4x64_epi64(a_.m256i[0], (3 << 6) + (2 << 4) + (3 << 2) + (2 << 0)); - const __m256i a2 = _mm256_permute4x64_epi64(a_.m256i[1], (1 << 6) + (0 << 4) + (1 << 2) + (0 << 0)); - const __m256i a3 = 
_mm256_permute4x64_epi64(a_.m256i[1], (3 << 6) + (2 << 4) + (3 << 2) + (2 << 0)); - const __m256i b0 = _mm256_permute4x64_epi64(b_.m256i[0], (1 << 6) + (0 << 4) + (1 << 2) + (0 << 0)); - const __m256i b1 = _mm256_permute4x64_epi64(b_.m256i[0], (3 << 6) + (2 << 4) + (3 << 2) + (2 << 0)); - const __m256i b2 = _mm256_permute4x64_epi64(b_.m256i[1], (1 << 6) + (0 << 4) + (1 << 2) + (0 << 0)); - const __m256i b3 = _mm256_permute4x64_epi64(b_.m256i[1], (3 << 6) + (2 << 4) + (3 << 2) + (2 << 0)); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i_private) / sizeof(r_.m256i_private[0])) ; i++) { - index = _mm256_and_si256(idx_.m256i[i], mask); - t0 = _mm256_shuffle_epi8(a0, index); - t1 = _mm256_shuffle_epi8(a1, index); - select0x10 = _mm256_slli_epi64(index, 3); - t01 = _mm256_blendv_epi8(t0, t1, select0x10); - t0 = _mm256_shuffle_epi8(a2, index); - t1 = _mm256_shuffle_epi8(a3, index); - t23 = _mm256_blendv_epi8(t0, t1, select0x10); - select0x20 = _mm256_slli_epi64(index, 2); - a0123 = _mm256_blendv_epi8(t01, t23, select0x20); - t0 = _mm256_shuffle_epi8(b0, index); - t1 = _mm256_shuffle_epi8(b1, index); - t01 = _mm256_blendv_epi8(t0, t1, select0x10); - t0 = _mm256_shuffle_epi8(b2, index); - t1 = _mm256_shuffle_epi8(b3, index); - t23 = _mm256_blendv_epi8(t0, t1, select0x10); - b0123 = _mm256_blendv_epi8(t01, t23, select0x20); - select0x40 = _mm256_slli_epi64(index, 1); - r_.m256i[i] = _mm256_blendv_epi8(a0123, b0123, select0x40); - } - #elif defined(SIMDE_X_PERMUTEX2VAR_USE_GENERIC) - simde_x_permutex2var(r_.m128i, a_.m128i, idx_.m128i, b_.m128i, 0, 2); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = ((idx_.i8[i] & 0x40) ? b_ : a_).i8[idx_.i8[i] & 0x3F]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) - #undef _mm512_permutex2var_epi8 - #define _mm512_permutex2var_epi8(a, idx, b) simde_mm512_permutex2var_epi8(a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_permutex2var_epi8 (simde__m512i a, simde__mmask64 k, simde__m512i idx, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) - return _mm512_mask_permutex2var_epi8(a, k, idx, b); - #else - return simde_mm512_mask_mov_epi8(a, k, simde_mm512_permutex2var_epi8(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_permutex2var_epi8 -#define _mm512_mask_permutex2var_epi8(a, k, idx, b) simde_mm512_mask_permutex2var_epi8(a, k, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask2_permutex2var_epi8 (simde__m512i a, simde__m512i idx, simde__mmask64 k, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) - return _mm512_mask2_permutex2var_epi8(a, idx, k, b); - #else - return simde_mm512_mask_mov_epi8(idx, k, simde_mm512_permutex2var_epi8(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask2_permutex2var_epi8 -#define _mm512_mask2_permutex2var_epi8(a, idx, k, b) simde_mm512_mask2_permutex2var_epi8(a, idx, k, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_permutex2var_epi8 (simde__mmask64 k, simde__m512i a, simde__m512i idx, simde__m512i b) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) - return _mm512_maskz_permutex2var_epi8(k, a, idx, b); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_permutex2var_epi8(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) - #undef 
_mm512_maskz_permutex2var_epi8 -#define _mm512_maskz_permutex2var_epi8(k, a, idx, b) simde_mm512_maskz_permutex2var_epi8(k, a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_permutex2var_pd (simde__m512d a, simde__m512i idx, simde__m512d b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_permutex2var_pd(a, idx, b); - #else - return simde_mm512_castsi512_pd(simde_mm512_permutex2var_epi64(simde_mm512_castpd_si512(a), idx, simde_mm512_castpd_si512(b))); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_permutex2var_pd - #define _mm512_permutex2var_pd(a, idx, b) simde_mm512_permutex2var_pd(a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_permutex2var_pd (simde__m512d a, simde__mmask8 k, simde__m512i idx, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_permutex2var_pd(a, k, idx, b); - #else - return simde_mm512_mask_mov_pd(a, k, simde_mm512_permutex2var_pd(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_permutex2var_pd -#define _mm512_mask_permutex2var_pd(a, k, idx, b) simde_mm512_mask_permutex2var_pd(a, k, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask2_permutex2var_pd (simde__m512d a, simde__m512i idx, simde__mmask8 k, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask2_permutex2var_pd(a, idx, k, b); - #else - return simde_mm512_mask_mov_pd(simde_mm512_castsi512_pd(idx), k, simde_mm512_permutex2var_pd(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask2_permutex2var_pd -#define _mm512_mask2_permutex2var_pd(a, idx, k, b) simde_mm512_mask2_permutex2var_pd(a, idx, k, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_permutex2var_pd (simde__mmask8 k, simde__m512d a, simde__m512i idx, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_permutex2var_pd(k, a, idx, b); - #else - return simde_mm512_maskz_mov_pd(k, simde_mm512_permutex2var_pd(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_permutex2var_pd -#define _mm512_maskz_permutex2var_pd(k, a, idx, b) simde_mm512_maskz_permutex2var_pd(k, a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_permutex2var_ps (simde__m512 a, simde__m512i idx, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_permutex2var_ps(a, idx, b); - #else - return simde_mm512_castsi512_ps(simde_mm512_permutex2var_epi32(simde_mm512_castps_si512(a), idx, simde_mm512_castps_si512(b))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_permutex2var_ps - #define _mm512_permutex2var_ps(a, idx, b) simde_mm512_permutex2var_ps(a, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_permutex2var_ps (simde__m512 a, simde__mmask16 k, simde__m512i idx, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_permutex2var_ps(a, k, idx, b); - #else - return simde_mm512_mask_mov_ps(a, k, simde_mm512_permutex2var_ps(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_permutex2var_ps -#define _mm512_mask_permutex2var_ps(a, k, idx, b) simde_mm512_mask_permutex2var_ps(a, k, idx, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask2_permutex2var_ps (simde__m512 a, simde__m512i idx, simde__mmask16 k, simde__m512 b) { - #if 
defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask2_permutex2var_ps(a, idx, k, b); - #else - return simde_mm512_mask_mov_ps(simde_mm512_castsi512_ps(idx), k, simde_mm512_permutex2var_ps(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask2_permutex2var_ps -#define _mm512_mask2_permutex2var_ps(a, idx, k, b) simde_mm512_mask2_permutex2var_ps(a, idx, k, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_permutex2var_ps (simde__mmask16 k, simde__m512 a, simde__m512i idx, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_permutex2var_ps(k, a, idx, b); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_permutex2var_ps(a, idx, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_permutex2var_ps -#define _mm512_maskz_permutex2var_ps(k, a, idx, b) simde_mm512_maskz_permutex2var_ps(k, a, idx, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_PERMUTEX2VAR_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/permutexvar.h b/ffi-deps/simde/simde/x86/avx512/permutexvar.h deleted file mode 100644 index 1b4bf7a..0000000 --- a/ffi-deps/simde/simde/x86/avx512/permutexvar.h +++ /dev/null @@ -1,1194 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Christopher Moore - */ - -#if !defined(SIMDE_X86_AVX512_PERMUTEXVAR_H) -#define SIMDE_X86_AVX512_PERMUTEXVAR_H - -#include "types.h" -#include "and.h" -#include "andnot.h" -#include "blend.h" -#include "mov.h" -#include "or.h" -#include "set1.h" -#include "slli.h" -#include "srli.h" -#include "test.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_permutexvar_epi16 (simde__m128i idx, simde__m128i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_permutexvar_epi16(idx, a); - #elif defined(SIMDE_X86_SSSE3_NATIVE) - simde__m128i mask16 = simde_mm_set1_epi16(0x0007); - simde__m128i shift16 = simde_mm_set1_epi16(0x0202); - simde__m128i byte_index16 = simde_mm_set1_epi16(0x0100); - simde__m128i index16 = simde_mm_and_si128(idx, mask16); - index16 = simde_mm_mullo_epi16(index16, shift16); - index16 = simde_mm_add_epi16(index16, byte_index16); - return simde_mm_shuffle_epi8(a, index16); - #else - simde__m128i_private - idx_ = simde__m128i_to_private(idx), - a_ = simde__m128i_to_private(a), - r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint16x8_t mask16 = vdupq_n_u16(0x0007); - uint16x8_t byte_index16 = vdupq_n_u16(0x0100); - uint16x8_t index16 = vandq_u16(idx_.neon_u16, mask16); - index16 = vmulq_n_u16(index16, 0x0202); - index16 = vaddq_u16(index16, byte_index16); - r_.neon_u8 = vqtbl1q_u8(a_.neon_u8, vreinterpretq_u8_u16(index16)); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) index16; - index16 = vec_and(idx_.altivec_u16, vec_splat_u16(7)); - index16 = vec_mladd(index16, vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0202)), vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0100))); - r_.altivec_u8 = vec_perm(a_.altivec_u8, a_.altivec_u8, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), index16)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - const v128_t mask16 = wasm_i16x8_splat(0x0007); - const v128_t shift16 = wasm_i16x8_splat(0x0202); - const v128_t byte_index16 = wasm_i16x8_splat(0x0100); - v128_t index16 = wasm_v128_and(idx_.wasm_v128, mask16); - index16 = wasm_i16x8_mul(index16, shift16); - index16 = wasm_i16x8_add(index16, byte_index16); - r_.wasm_v128 = wasm_i8x16_swizzle(a_.wasm_v128, index16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[idx_.i16[i] & 0x07]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_permutexvar_epi16 - #define _mm_permutexvar_epi16(idx, a) simde_mm_permutexvar_epi16(idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_permutexvar_epi16 (simde__m128i src, simde__mmask8 k, simde__m128i idx, simde__m128i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_permutexvar_epi16(src, k, idx, a); - #else - return simde_mm_mask_mov_epi16(src, k, simde_mm_permutexvar_epi16(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_permutexvar_epi16 - #define _mm_mask_permutexvar_epi16(src, k, idx, a) simde_mm_mask_permutexvar_epi16(src, k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_permutexvar_epi16 (simde__mmask8 k, 
simde__m128i idx, simde__m128i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_permutexvar_epi16(k, idx, a); - #else - return simde_mm_maskz_mov_epi16(k, simde_mm_permutexvar_epi16(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_permutexvar_epi16 - #define _mm_maskz_permutexvar_epi16(k, idx, a) simde_mm_maskz_permutexvar_epi16(k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_permutexvar_epi8 (simde__m128i idx, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_permutexvar_epi8(idx, a); - #elif defined(SIMDE_X86_SSSE3_NATIVE) - simde__m128i mask = simde_mm_set1_epi8(0x0F); - simde__m128i index = simde_mm_and_si128(idx, mask); - return simde_mm_shuffle_epi8(a, index); - #else - simde__m128i_private - idx_ = simde__m128i_to_private(idx), - a_ = simde__m128i_to_private(a), - r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint8x16_t mask = vdupq_n_u8(0x0F); - uint8x16_t index = vandq_u8(idx_.neon_u8, mask); - r_.neon_u8 = vqtbl1q_u8(a_.neon_u8, index); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_u8 = vec_perm(a_.altivec_u8, a_.altivec_u8, idx_.altivec_u8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - const v128_t mask = wasm_i8x16_splat(0x0F); - v128_t index = wasm_v128_and(idx_.wasm_v128, mask); - r_.wasm_v128 = wasm_i8x16_swizzle(a_.wasm_v128, index); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[idx_.i8[i] & 0x0F]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_permutexvar_epi8 - #define _mm_permutexvar_epi8(idx, a) simde_mm_permutexvar_epi8(idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_permutexvar_epi8 (simde__m128i src, simde__mmask16 k, simde__m128i idx, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_permutexvar_epi8(src, k, idx, a); - #else - return simde_mm_mask_mov_epi8(src, k, simde_mm_permutexvar_epi8(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_permutexvar_epi8 - #define _mm_mask_permutexvar_epi8(src, k, idx, a) simde_mm_mask_permutexvar_epi8(src, k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_permutexvar_epi8 (simde__mmask16 k, simde__m128i idx, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_permutexvar_epi8(k, idx, a); - #else - return simde_mm_maskz_mov_epi8(k, simde_mm_permutexvar_epi8(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_permutexvar_epi8 - #define _mm_maskz_permutexvar_epi8(k, idx, a) simde_mm_maskz_permutexvar_epi8(k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_permutexvar_epi16 (simde__m256i idx, simde__m256i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_permutexvar_epi16(idx, a); - #elif defined(SIMDE_X86_AVX2_NATIVE) - simde__m256i mask16 = simde_mm256_set1_epi16(0x001F); - simde__m256i shift16 = 
simde_mm256_set1_epi16(0x0202); - simde__m256i byte_index16 = simde_mm256_set1_epi16(0x0100); - simde__m256i index16 = simde_mm256_and_si256(idx, mask16); - index16 = simde_mm256_mullo_epi16(index16, shift16); - simde__m256i lo = simde_mm256_permute4x64_epi64(a, (1 << 6) + (0 << 4) + (1 << 2) + (0 << 0)); - simde__m256i hi = simde_mm256_permute4x64_epi64(a, (3 << 6) + (2 << 4) + (3 << 2) + (2 << 0)); - simde__m256i select = simde_mm256_slli_epi64(index16, 3); - index16 = simde_mm256_add_epi16(index16, byte_index16); - lo = simde_mm256_shuffle_epi8(lo, index16); - hi = simde_mm256_shuffle_epi8(hi, index16); - return simde_mm256_blendv_epi8(lo, hi, select); - #else - simde__m256i_private - idx_ = simde__m256i_to_private(idx), - a_ = simde__m256i_to_private(a), - r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint8x16x2_t table = { { a_.m128i_private[0].neon_u8, - a_.m128i_private[1].neon_u8 } }; - uint16x8_t mask16 = vdupq_n_u16(0x000F); - uint16x8_t byte_index16 = vdupq_n_u16(0x0100); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - uint16x8_t index16 = vandq_u16(idx_.m128i_private[i].neon_u16, mask16); - index16 = vmulq_n_u16(index16, 0x0202); - index16 = vaddq_u16(index16, byte_index16); - r_.m128i_private[i].neon_u8 = vqtbl2q_u8(table, vreinterpretq_u8_u16(index16)); - } - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) index16, mask16, shift16, byte_index16; - mask16 = vec_splat_u16(0x000F); - shift16 = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0202)); - byte_index16 = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0100)); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - index16 = vec_and(idx_.m128i_private[i].altivec_u16, mask16); - index16 = vec_mladd(index16, shift16, byte_index16); - r_.m128i_private[i].altivec_u8 = vec_perm(a_.m128i_private[0].altivec_u8, - a_.m128i_private[1].altivec_u8, - HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), index16)); - } - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t index, index16, r, t; - const v128_t mask16 = wasm_i16x8_splat(0x000F); - const v128_t shift16 = wasm_i16x8_splat(0x0202); - const v128_t byte_index16 = wasm_i16x8_splat(0x0100); - const v128_t sixteen = wasm_i8x16_splat(16); - const v128_t a0 = a_.m128i_private[0].wasm_v128; - const v128_t a1 = a_.m128i_private[1].wasm_v128; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - index16 = wasm_v128_and(idx_.m128i_private[i].wasm_v128, mask16); - index16 = wasm_i16x8_mul(index16, shift16); - index = wasm_i16x8_add(index16, byte_index16); - r = wasm_i8x16_swizzle(a0, index); - - index = wasm_i8x16_sub(index, sixteen); - t = wasm_i8x16_swizzle(a1, index); - r_.m128i_private[i].wasm_v128 = wasm_v128_or(r, t); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[idx_.i16[i] & 0x0F]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutexvar_epi16 - #define _mm256_permutexvar_epi16(idx, a) simde_mm256_permutexvar_epi16(idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_permutexvar_epi16 (simde__m256i src, simde__mmask16 k, simde__m256i idx, simde__m256i a) { - #if 
defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_permutexvar_epi16(src, k, idx, a); - #else - return simde_mm256_mask_mov_epi16(src, k, simde_mm256_permutexvar_epi16(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_permutexvar_epi16 - #define _mm256_mask_permutexvar_epi16(src, k, idx, a) simde_mm256_mask_permutexvar_epi16(src, k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_permutexvar_epi16 (simde__mmask16 k, simde__m256i idx, simde__m256i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_permutexvar_epi16(k, idx, a); - #else - return simde_mm256_maskz_mov_epi16(k, simde_mm256_permutexvar_epi16(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_permutexvar_epi16 - #define _mm256_maskz_permutexvar_epi16(k, idx, a) simde_mm256_maskz_permutexvar_epi16(k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_permutexvar_epi32 (simde__m256i idx, simde__m256i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_permutexvar_epi32(idx, a); - #elif defined(SIMDE_X86_AVX2_NATIVE) - return simde_mm256_permutevar8x32_epi32(a, idx); - #else - simde__m256i_private - idx_ = simde__m256i_to_private(idx), - a_ = simde__m256i_to_private(a), - r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint8x16x2_t table = { { a_.m128i_private[0].neon_u8, - a_.m128i_private[1].neon_u8 } }; - uint32x4_t mask32 = vdupq_n_u32(0x00000007); - uint32x4_t byte_index32 = vdupq_n_u32(0x03020100); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - uint32x4_t index32 = vandq_u32(idx_.m128i_private[i].neon_u32, mask32); - index32 = vmulq_n_u32(index32, 0x04040404); - index32 = vaddq_u32(index32, byte_index32); - r_.m128i_private[i].neon_u8 = vqtbl2q_u8(table, vreinterpretq_u8_u32(index32)); - } - #else - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[idx_.i32[i] & 0x07]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutexvar_epi32 - #define _mm256_permutexvar_epi32(idx, a) simde_mm256_permutexvar_epi32(idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_permutexvar_epi32 (simde__m256i src, simde__mmask8 k, simde__m256i idx, simde__m256i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_permutexvar_epi32(src, k, idx, a); - #else - return simde_mm256_mask_mov_epi32(src, k, simde_mm256_permutexvar_epi32(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_permutexvar_epi32 - #define _mm256_mask_permutexvar_epi32(src, k, idx, a) simde_mm256_mask_permutexvar_epi32(src, k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_permutexvar_epi32 (simde__mmask8 k, simde__m256i idx, simde__m256i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_permutexvar_epi32(k, idx, a); - #else - 
return simde_mm256_maskz_mov_epi32(k, simde_mm256_permutexvar_epi32(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_permutexvar_epi32 - #define _mm256_maskz_permutexvar_epi32(k, idx, a) simde_mm256_maskz_permutexvar_epi32(k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_permutexvar_epi64 (simde__m256i idx, simde__m256i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_permutexvar_epi64(idx, a); - #else - simde__m256i_private - idx_ = simde__m256i_to_private(idx), - a_ = simde__m256i_to_private(a), - r_; - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[idx_.i64[i] & 3]; - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutexvar_epi64 - #define _mm256_permutexvar_epi64(idx, a) simde_mm256_permutexvar_epi64(idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_permutexvar_epi64 (simde__m256i src, simde__mmask8 k, simde__m256i idx, simde__m256i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_permutexvar_epi64(src, k, idx, a); - #else - return simde_mm256_mask_mov_epi64(src, k, simde_mm256_permutexvar_epi64(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_permutexvar_epi64 - #define _mm256_mask_permutexvar_epi64(src, k, idx, a) simde_mm256_mask_permutexvar_epi64(src, k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_permutexvar_epi64 (simde__mmask8 k, simde__m256i idx, simde__m256i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_permutexvar_epi64(k, idx, a); - #else - return simde_mm256_maskz_mov_epi64(k, simde_mm256_permutexvar_epi64(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_permutexvar_epi64 - #define _mm256_maskz_permutexvar_epi64(k, idx, a) simde_mm256_maskz_permutexvar_epi64(k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_permutexvar_epi8 (simde__m256i idx, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_permutexvar_epi8(idx, a); - #elif defined(SIMDE_X86_AVX2_NATIVE) - simde__m256i mask = simde_mm256_set1_epi8(0x0F); - simde__m256i lo = simde_mm256_permute4x64_epi64(a, (1 << 6) + (0 << 4) + (1 << 2) + (0 << 0)); - simde__m256i hi = simde_mm256_permute4x64_epi64(a, (3 << 6) + (2 << 4) + (3 << 2) + (2 << 0)); - simde__m256i index = simde_mm256_and_si256(idx, mask); - simde__m256i select = simde_mm256_slli_epi64(idx, 3); - lo = simde_mm256_shuffle_epi8(lo, index); - hi = simde_mm256_shuffle_epi8(hi, index); - return simde_mm256_blendv_epi8(lo, hi, select); - #else - simde__m256i_private - idx_ = simde__m256i_to_private(idx), - a_ = simde__m256i_to_private(a), - r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint8x16x2_t table = { { a_.m128i_private[0].neon_u8, - a_.m128i_private[1].neon_u8 } }; - uint8x16_t mask = vdupq_n_u8(0x1F); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / 
sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].neon_u8 = vqtbl2q_u8(table, vandq_u8(idx_.m128i_private[i].neon_u8, mask)); - } - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].altivec_u8 = vec_perm(a_.m128i_private[0].altivec_u8, a_.m128i_private[1].altivec_u8, idx_.m128i_private[i].altivec_u8); - } - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t index, r, t; - const v128_t mask = wasm_i8x16_splat(0x1F); - const v128_t sixteen = wasm_i8x16_splat(16); - const v128_t a0 = a_.m128i_private[0].wasm_v128; - const v128_t a1 = a_.m128i_private[1].wasm_v128; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - index = wasm_v128_and(idx_.m128i_private[i].wasm_v128, mask); - r = wasm_i8x16_swizzle(a0, index); - index = wasm_i8x16_sub(index, sixteen); - t = wasm_i8x16_swizzle(a1, index); - r_.m128i_private[i].wasm_v128 = wasm_v128_or(r, t); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[idx_.i8[i] & 0x1F]; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutexvar_epi8 - #define _mm256_permutexvar_epi8(idx, a) simde_mm256_permutexvar_epi8(idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_permutexvar_epi8 (simde__m256i src, simde__mmask32 k, simde__m256i idx, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_permutexvar_epi8(src, k, idx, a); - #else - return simde_mm256_mask_mov_epi8(src, k, simde_mm256_permutexvar_epi8(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_permutexvar_epi8 - #define _mm256_mask_permutexvar_epi8(src, k, idx, a) simde_mm256_mask_permutexvar_epi8(src, k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_permutexvar_epi8 (simde__mmask32 k, simde__m256i idx, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_permutexvar_epi8(k, idx, a); - #else - return simde_mm256_maskz_mov_epi8(k, simde_mm256_permutexvar_epi8(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_permutexvar_epi8 - #define _mm256_maskz_permutexvar_epi8(k, idx, a) simde_mm256_maskz_permutexvar_epi8(k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_permutexvar_pd (simde__m256i idx, simde__m256d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_permutexvar_pd(idx, a); - #else - return simde_mm256_castsi256_pd(simde_mm256_permutexvar_epi64(idx, simde_mm256_castpd_si256(a))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutexvar_pd - #define _mm256_permutexvar_pd(idx, a) simde_mm256_permutexvar_pd(idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_mask_permutexvar_pd (simde__m256d src, simde__mmask8 k, simde__m256i idx, simde__m256d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - 
return _mm256_mask_permutexvar_pd(src, k, idx, a); - #else - return simde_mm256_mask_mov_pd(src, k, simde_mm256_permutexvar_pd(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_permutexvar_pd - #define _mm256_mask_permutexvar_pd(src, k, idx, a) simde_mm256_mask_permutexvar_pd(src, k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_maskz_permutexvar_pd (simde__mmask8 k, simde__m256i idx, simde__m256d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_permutexvar_pd(k, idx, a); - #else - return simde_mm256_maskz_mov_pd(k, simde_mm256_permutexvar_pd(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_permutexvar_pd - #define _mm256_maskz_permutexvar_pd(k, idx, a) simde_mm256_maskz_permutexvar_pd(k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_permutexvar_ps (simde__m256i idx, simde__m256 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_permutexvar_ps(idx, a); - #elif defined(SIMDE_X86_AVX2_NATIVE) - return simde_mm256_permutevar8x32_ps(a, idx); - #else - return simde_mm256_castsi256_ps(simde_mm256_permutexvar_epi32(idx, simde_mm256_castps_si256(a))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_permutexvar_ps - #define _mm256_permutexvar_ps(idx, a) simde_mm256_permutexvar_ps(idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_mask_permutexvar_ps (simde__m256 src, simde__mmask8 k, simde__m256i idx, simde__m256 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_permutexvar_ps(src, k, idx, a); - #else - return simde_mm256_mask_mov_ps(src, k, simde_mm256_permutexvar_ps(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_permutexvar_ps - #define _mm256_mask_permutexvar_ps(src, k, idx, a) simde_mm256_mask_permutexvar_ps(src, k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_maskz_permutexvar_ps (simde__mmask8 k, simde__m256i idx, simde__m256 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_permutexvar_ps(k, idx, a); - #else - return simde_mm256_maskz_mov_ps(k, simde_mm256_permutexvar_ps(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_permutexvar_ps - #define _mm256_maskz_permutexvar_ps(k, idx, a) simde_mm256_maskz_permutexvar_ps(k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_permutexvar_epi16 (simde__m512i idx, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_permutexvar_epi16(idx, a); - #else - simde__m512i_private - idx_ = simde__m512i_to_private(idx), - a_ = simde__m512i_to_private(a), - r_; - - #if defined(SIMDE_X86_AVX2_NATIVE) - simde__m256i t0, t1, index, select, a01, a23; - simde__m256i mask = simde_mm256_set1_epi16(0x001F); - simde__m256i shift = simde_mm256_set1_epi16(0x0202); - simde__m256i byte_index = simde_mm256_set1_epi16(0x0100); - simde__m256i a0 = simde_mm256_broadcastsi128_si256(a_.m128i[0]); - simde__m256i a1 = 
simde_mm256_broadcastsi128_si256(a_.m128i[1]); - simde__m256i a2 = simde_mm256_broadcastsi128_si256(a_.m128i[2]); - simde__m256i a3 = simde_mm256_broadcastsi128_si256(a_.m128i[3]); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i_private) / sizeof(r_.m256i_private[0])) ; i++) { - index = idx_.m256i[i]; - index = simde_mm256_and_si256(index, mask); - index = simde_mm256_mullo_epi16(index, shift); - index = simde_mm256_add_epi16(index, byte_index); - t0 = simde_mm256_shuffle_epi8(a0, index); - t1 = simde_mm256_shuffle_epi8(a1, index); - select = simde_mm256_slli_epi64(index, 3); - a01 = simde_mm256_blendv_epi8(t0, t1, select); - t0 = simde_mm256_shuffle_epi8(a2, index); - t1 = simde_mm256_shuffle_epi8(a3, index); - a23 = simde_mm256_blendv_epi8(t0, t1, select); - select = simde_mm256_slli_epi64(index, 2); - r_.m256i[i] = simde_mm256_blendv_epi8(a01, a23, select); - } - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint8x16x4_t table = { { a_.m128i_private[0].neon_u8, - a_.m128i_private[1].neon_u8, - a_.m128i_private[2].neon_u8, - a_.m128i_private[3].neon_u8 } }; - uint16x8_t mask16 = vdupq_n_u16(0x001F); - uint16x8_t byte_index16 = vdupq_n_u16(0x0100); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - uint16x8_t index16 = vandq_u16(idx_.m128i_private[i].neon_u16, mask16); - index16 = vmulq_n_u16(index16, 0x0202); - index16 = vaddq_u16(index16, byte_index16); - r_.m128i_private[i].neon_u8 = vqtbl4q_u8(table, vreinterpretq_u8_u16(index16)); - } - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) index16, mask16, shift16, byte_index16; - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) index, test, r01, r23; - mask16 = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x001F)); - shift16 = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0202)); - byte_index16 = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0100)); - test = vec_splats(HEDLEY_STATIC_CAST(unsigned char, 0x20)); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - index16 = vec_and(idx_.m128i_private[i].altivec_u16, mask16); - index16 = vec_mladd(index16, shift16, byte_index16); - index = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), index16); - r01 = vec_perm(a_.m128i_private[0].altivec_u8, a_.m128i_private[1].altivec_u8, index); - r23 = vec_perm(a_.m128i_private[2].altivec_u8, a_.m128i_private[3].altivec_u8, index); - r_.m128i_private[i].altivec_u8 = vec_sel(r01, r23, vec_cmpeq(vec_and(index, test), test)); - } - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t index, r, t; - const v128_t mask = wasm_i16x8_splat(0x001F); - const v128_t shift = wasm_i16x8_splat(0x0202); - const v128_t byte_index = wasm_i16x8_splat(0x0100); - const v128_t sixteen = wasm_i8x16_splat(16); - const v128_t a0 = a_.m128i_private[0].wasm_v128; - const v128_t a1 = a_.m128i_private[1].wasm_v128; - const v128_t a2 = a_.m128i_private[2].wasm_v128; - const v128_t a3 = a_.m128i_private[3].wasm_v128; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - index = wasm_v128_and(idx_.m128i_private[i].wasm_v128, mask); - index = wasm_i16x8_mul(index, shift); - index = wasm_i16x8_add(index, byte_index); - r = wasm_i8x16_swizzle(a0, index); - - index = wasm_i8x16_sub(index, sixteen); - t = wasm_i8x16_swizzle(a1, index); - r = wasm_v128_or(r, t); - - index = wasm_i8x16_sub(index, sixteen); - t = wasm_i8x16_swizzle(a2, 
index); - r = wasm_v128_or(r, t); - - index = wasm_i8x16_sub(index, sixteen); - t = wasm_i8x16_swizzle(a3, index); - r_.m128i_private[i].wasm_v128 = wasm_v128_or(r, t); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[idx_.i16[i] & 0x1F]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_permutexvar_epi16 - #define _mm512_permutexvar_epi16(idx, a) simde_mm512_permutexvar_epi16(idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_permutexvar_epi16 (simde__m512i src, simde__mmask32 k, simde__m512i idx, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_permutexvar_epi16(src, k, idx, a); - #else - return simde_mm512_mask_mov_epi16(src, k, simde_mm512_permutexvar_epi16(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_permutexvar_epi16 - #define _mm512_mask_permutexvar_epi16(src, k, idx, a) simde_mm512_mask_permutexvar_epi16(src, k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_permutexvar_epi16 (simde__mmask32 k, simde__m512i idx, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_permutexvar_epi16(k, idx, a); - #else - return simde_mm512_maskz_mov_epi16(k, simde_mm512_permutexvar_epi16(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_permutexvar_epi16 - #define _mm512_maskz_permutexvar_epi16(k, idx, a) simde_mm512_maskz_permutexvar_epi16(k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_permutexvar_epi32 (simde__m512i idx, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_permutexvar_epi32(idx, a); - #else - simde__m512i_private - idx_ = simde__m512i_to_private(idx), - a_ = simde__m512i_to_private(a), - r_; - - #if defined(SIMDE_X86_AVX2_NATIVE) - simde__m256i index, r0, r1, select; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i_private) / sizeof(r_.m256i_private[0])) ; i++) { - index = idx_.m256i[i]; - r0 = simde_mm256_permutevar8x32_epi32(a_.m256i[0], index); - r1 = simde_mm256_permutevar8x32_epi32(a_.m256i[1], index); - select = simde_mm256_slli_epi32(index, 28); - r_.m256i[i] = simde_mm256_castps_si256(simde_mm256_blendv_ps(simde_mm256_castsi256_ps(r0), - simde_mm256_castsi256_ps(r1), - simde_mm256_castsi256_ps(select))); - } - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint8x16x4_t table = { { a_.m128i_private[0].neon_u8, - a_.m128i_private[1].neon_u8, - a_.m128i_private[2].neon_u8, - a_.m128i_private[3].neon_u8 } }; - uint32x4_t mask32 = vdupq_n_u32(0x0000000F); - uint32x4_t byte_index32 = vdupq_n_u32(0x03020100); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - uint32x4_t index32 = vandq_u32(idx_.m128i_private[i].neon_u32, mask32); - index32 = vmulq_n_u32(index32, 0x04040404); - index32 = vaddq_u32(index32, byte_index32); - r_.m128i_private[i].neon_u8 = vqtbl4q_u8(table, vreinterpretq_u8_u32(index32)); - } - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) index32, mask32, byte_index32, temp32, sixteen; - SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) zero, shift; - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) index, test, r01, r23; - mask32 = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 0x0000000F)); - byte_index32 = vec_splats(HEDLEY_STATIC_CAST(unsigned 
int, 0x03020100)); - zero = vec_splat_u16(0); - shift = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x0404)); - sixteen = vec_splats(HEDLEY_STATIC_CAST(unsigned int, 16)); - test = vec_splats(HEDLEY_STATIC_CAST(unsigned char, 0x20)); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - index32 = vec_and(idx_.m128i_private[i].altivec_u32, mask32); - - /* Multiply index32 by 0x04040404; unfortunately vec_mul isn't available so (mis)use 16-bit vec_mladd */ - temp32 = vec_sl(index32, sixteen); - index32 = vec_add(index32, temp32); - index32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), - vec_mladd(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), index32), - shift, - zero)); - - index32 = vec_add(index32, byte_index32); - index = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), index32); - r01 = vec_perm(a_.m128i_private[0].altivec_u8, a_.m128i_private[1].altivec_u8, index); - r23 = vec_perm(a_.m128i_private[2].altivec_u8, a_.m128i_private[3].altivec_u8, index); - r_.m128i_private[i].altivec_u8 = vec_sel(r01, r23, vec_cmpeq(vec_and(index, test), test)); - } - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t index, r, t; - const v128_t mask = wasm_i32x4_splat(0x0000000F); - const v128_t shift = wasm_i32x4_splat(0x04040404); - const v128_t byte_index = wasm_i32x4_splat(0x03020100); - const v128_t sixteen = wasm_i8x16_splat(16); - const v128_t a0 = a_.m128i_private[0].wasm_v128; - const v128_t a1 = a_.m128i_private[1].wasm_v128; - const v128_t a2 = a_.m128i_private[2].wasm_v128; - const v128_t a3 = a_.m128i_private[3].wasm_v128; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - index = wasm_v128_and(idx_.m128i_private[i].wasm_v128, mask); - index = wasm_i32x4_mul(index, shift); - index = wasm_i32x4_add(index, byte_index); - r = wasm_i8x16_swizzle(a0, index); - - index = wasm_i8x16_sub(index, sixteen); - t = wasm_i8x16_swizzle(a1, index); - r = wasm_v128_or(r, t); - - index = wasm_i8x16_sub(index, sixteen); - t = wasm_i8x16_swizzle(a2, index); - r = wasm_v128_or(r, t); - - index = wasm_i8x16_sub(index, sixteen); - t = wasm_i8x16_swizzle(a3, index); - r_.m128i_private[i].wasm_v128 = wasm_v128_or(r, t); - } - #else - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[idx_.i32[i] & 0x0F]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_permutexvar_epi32 - #define _mm512_permutexvar_epi32(idx, a) simde_mm512_permutexvar_epi32(idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_permutexvar_epi32 (simde__m512i src, simde__mmask16 k, simde__m512i idx, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_permutexvar_epi32(src, k, idx, a); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_permutexvar_epi32(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_permutexvar_epi32 - #define _mm512_mask_permutexvar_epi32(src, k, idx, a) simde_mm512_mask_permutexvar_epi32(src, k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_permutexvar_epi32 (simde__mmask16 k, simde__m512i idx, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_permutexvar_epi32(k, idx, a); - #else - return 
simde_mm512_maskz_mov_epi32(k, simde_mm512_permutexvar_epi32(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_permutexvar_epi32 - #define _mm512_maskz_permutexvar_epi32(k, idx, a) simde_mm512_maskz_permutexvar_epi32(k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_permutexvar_epi64 (simde__m512i idx, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_permutexvar_epi64(idx, a); - #else - simde__m512i_private - idx_ = simde__m512i_to_private(idx), - a_ = simde__m512i_to_private(a), - r_; - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[idx_.i64[i] & 7]; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_permutexvar_epi64 - #define _mm512_permutexvar_epi64(idx, a) simde_mm512_permutexvar_epi64(idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_permutexvar_epi64 (simde__m512i src, simde__mmask8 k, simde__m512i idx, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_permutexvar_epi64(src, k, idx, a); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_permutexvar_epi64(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_permutexvar_epi64 - #define _mm512_mask_permutexvar_epi64(src, k, idx, a) simde_mm512_mask_permutexvar_epi64(src, k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_permutexvar_epi64 (simde__mmask8 k, simde__m512i idx, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_permutexvar_epi64(k, idx, a); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_permutexvar_epi64(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_permutexvar_epi64 - #define _mm512_maskz_permutexvar_epi64(k, idx, a) simde_mm512_maskz_permutexvar_epi64(k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_permutexvar_epi8 (simde__m512i idx, simde__m512i a) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) - return _mm512_permutexvar_epi8(idx, a); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - simde__m512i hilo, hi, lo, hi2, lo2, idx2; - simde__m512i ones = simde_mm512_set1_epi8(1); - simde__m512i low_bytes = simde_mm512_set1_epi16(0x00FF); - - idx2 = simde_mm512_srli_epi16(idx, 1); - hilo = simde_mm512_permutexvar_epi16(idx2, a); - simde__mmask64 mask = simde_mm512_test_epi8_mask(idx, ones); - lo = simde_mm512_and_si512(hilo, low_bytes); - hi = simde_mm512_srli_epi16(hilo, 8); - - idx2 = simde_mm512_srli_epi16(idx, 9); - hilo = simde_mm512_permutexvar_epi16(idx2, a); - lo2 = simde_mm512_slli_epi16(hilo, 8); - hi2 = simde_mm512_andnot_si512(low_bytes, hilo); - - lo = simde_mm512_or_si512(lo, lo2); - hi = simde_mm512_or_si512(hi, hi2); - - return simde_mm512_mask_blend_epi8(mask, lo, hi); - #else - simde__m512i_private - idx_ = simde__m512i_to_private(idx), - a_ = simde__m512i_to_private(a), - r_; - - #if defined(SIMDE_X86_AVX2_NATIVE) - simde__m256i t0, t1, index, select, a01, a23; - simde__m256i mask = simde_mm256_set1_epi8(0x3F); - simde__m256i a0 = simde_mm256_broadcastsi128_si256(a_.m128i[0]); - simde__m256i a1 = simde_mm256_broadcastsi128_si256(a_.m128i[1]); - simde__m256i a2 = simde_mm256_broadcastsi128_si256(a_.m128i[2]); - simde__m256i a3 = 
simde_mm256_broadcastsi128_si256(a_.m128i[3]); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i_private) / sizeof(r_.m256i_private[0])) ; i++) { - index = idx_.m256i[i]; - index = simde_mm256_and_si256(index, mask); - select = simde_mm256_slli_epi64(index, 3); - t0 = simde_mm256_shuffle_epi8(a0, index); - t1 = simde_mm256_shuffle_epi8(a1, index); - a01 = simde_mm256_blendv_epi8(t0, t1, select); - t0 = simde_mm256_shuffle_epi8(a2, index); - t1 = simde_mm256_shuffle_epi8(a3, index); - a23 = simde_mm256_blendv_epi8(t0, t1, select); - select = simde_mm256_slli_epi64(index, 2); - r_.m256i[i] = simde_mm256_blendv_epi8(a01, a23, select); - } - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint8x16x4_t table = { { a_.m128i_private[0].neon_u8, - a_.m128i_private[1].neon_u8, - a_.m128i_private[2].neon_u8, - a_.m128i_private[3].neon_u8 } }; - uint8x16_t mask = vdupq_n_u8(0x3F); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].neon_u8 = vqtbl4q_u8(table, vandq_u8(idx_.m128i_private[i].neon_u8, mask)); - } - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) test, r01, r23; - test = vec_splats(HEDLEY_STATIC_CAST(unsigned char, 0x20)); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r01 = vec_perm(a_.m128i_private[0].altivec_u8, a_.m128i_private[1].altivec_u8, idx_.m128i_private[i].altivec_u8); - r23 = vec_perm(a_.m128i_private[2].altivec_u8, a_.m128i_private[3].altivec_u8, idx_.m128i_private[i].altivec_u8); - r_.m128i_private[i].altivec_u8 = vec_sel(r01, r23, vec_cmpeq(vec_and(idx_.m128i_private[i].altivec_u8, test), test)); - } - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t index, r, t; - const v128_t mask = wasm_i8x16_splat(0x3F); - const v128_t sixteen = wasm_i8x16_splat(16); - const v128_t a0 = a_.m128i_private[0].wasm_v128; - const v128_t a1 = a_.m128i_private[1].wasm_v128; - const v128_t a2 = a_.m128i_private[2].wasm_v128; - const v128_t a3 = a_.m128i_private[3].wasm_v128; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - index = wasm_v128_and(idx_.m128i_private[i].wasm_v128, mask); - r = wasm_i8x16_swizzle(a0, index); - - index = wasm_i8x16_sub(index, sixteen); - t = wasm_i8x16_swizzle(a1, index); - r = wasm_v128_or(r, t); - - index = wasm_i8x16_sub(index, sixteen); - t = wasm_i8x16_swizzle(a2, index); - r = wasm_v128_or(r, t); - - index = wasm_i8x16_sub(index, sixteen); - t = wasm_i8x16_swizzle(a3, index); - r_.m128i_private[i].wasm_v128 = wasm_v128_or(r, t); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[idx_.i8[i] & 0x3F]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) - #undef _mm512_permutexvar_epi8 - #define _mm512_permutexvar_epi8(idx, a) simde_mm512_permutexvar_epi8(idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_permutexvar_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i idx, simde__m512i a) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) - return _mm512_mask_permutexvar_epi8(src, k, idx, a); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_permutexvar_epi8(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_permutexvar_epi8 - #define _mm512_mask_permutexvar_epi8(src, k, idx, a) 
simde_mm512_mask_permutexvar_epi8(src, k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_permutexvar_epi8 (simde__mmask64 k, simde__m512i idx, simde__m512i a) { - #if defined(SIMDE_X86_AVX512VBMI_NATIVE) - return _mm512_maskz_permutexvar_epi8(k, idx, a); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_permutexvar_epi8(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_permutexvar_epi8 - #define _mm512_maskz_permutexvar_epi8(k, idx, a) simde_mm512_maskz_permutexvar_epi8(k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_permutexvar_pd (simde__m512i idx, simde__m512d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_permutexvar_pd(idx, a); - #else - return simde_mm512_castsi512_pd(simde_mm512_permutexvar_epi64(idx, simde_mm512_castpd_si512(a))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_permutexvar_pd - #define _mm512_permutexvar_pd(idx, a) simde_mm512_permutexvar_pd(idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_permutexvar_pd (simde__m512d src, simde__mmask8 k, simde__m512i idx, simde__m512d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_permutexvar_pd(src, k, idx, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_permutexvar_pd(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_permutexvar_pd - #define _mm512_mask_permutexvar_pd(src, k, idx, a) simde_mm512_mask_permutexvar_pd(src, k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_permutexvar_pd (simde__mmask8 k, simde__m512i idx, simde__m512d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_permutexvar_pd(k, idx, a); - #else - return simde_mm512_maskz_mov_pd(k, simde_mm512_permutexvar_pd(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_permutexvar_pd - #define _mm512_maskz_permutexvar_pd(k, idx, a) simde_mm512_maskz_permutexvar_pd(k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_permutexvar_ps (simde__m512i idx, simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_permutexvar_ps(idx, a); - #else - return simde_mm512_castsi512_ps(simde_mm512_permutexvar_epi32(idx, simde_mm512_castps_si512(a))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_permutexvar_ps - #define _mm512_permutexvar_ps(idx, a) simde_mm512_permutexvar_ps(idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512h -simde_mm512_permutexvar_ph (simde__m512i idx, simde__m512h a) { - #if defined(SIMDE_X86_AVX512FP16_NATIVE) - return _mm512_permutexvar_ph(idx, a); - #else - return simde_mm512_castsi512_ph(simde_mm512_permutexvar_epi16(idx, simde_mm512_castph_si512(a))); - #endif -} -#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) - #undef _mm512_permutexvar_ph - #define _mm512_permutexvar_ph(idx, a) simde_mm512_permutexvar_ph(idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_permutexvar_ps (simde__m512 src, simde__mmask16 k, simde__m512i idx, simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_permutexvar_ps(src, k, idx, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_permutexvar_ps(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_permutexvar_ps - #define _mm512_mask_permutexvar_ps(src, k, idx, a) 
simde_mm512_mask_permutexvar_ps(src, k, idx, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_permutexvar_ps (simde__mmask16 k, simde__m512i idx, simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_permutexvar_ps(k, idx, a); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_permutexvar_ps(idx, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_permutexvar_ps - #define _mm512_maskz_permutexvar_ps(k, idx, a) simde_mm512_maskz_permutexvar_ps(k, idx, a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_PERMUTEXVAR_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/popcnt.h b/ffi-deps/simde/simde/x86/avx512/popcnt.h deleted file mode 100644 index b3c8125..0000000 --- a/ffi-deps/simde/simde/x86/avx512/popcnt.h +++ /dev/null @@ -1,1346 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_POPCNT_H) -#define SIMDE_X86_AVX512_POPCNT_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_popcnt_epi8 (simde__m128i a) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_popcnt_epi8(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vcntq_s8(a_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_popcnt(a_.wasm_v128); - #elif defined(SIMDE_X86_SSSE3_NATIVE) - const __m128i low_nibble_set = _mm_set1_epi8(0x0f); - const __m128i high_nibble_of_input = _mm_andnot_si128(low_nibble_set, a_.n); - const __m128i low_nibble_of_input = _mm_and_si128(low_nibble_set, a_.n); - const __m128i lut = _mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0); - - r_.n = - _mm_add_epi8( - _mm_shuffle_epi8( - lut, - low_nibble_of_input - ), - _mm_shuffle_epi8( - lut, - _mm_srli_epi16( - high_nibble_of_input, - 4 - ) - ) - ); - #elif defined(SIMDE_X86_SSE2_NATIVE) - /* v -= ((v >> 1) & UINT8_C(0x55)); */ - r_.n = - _mm_sub_epi8( - a_.n, - _mm_and_si128( - _mm_srli_epi16(a_.n, 1), - _mm_set1_epi8(0x55) - ) - ); - - /* v = (v & 0x33) + ((v >> 2) & 0x33); */ - r_.n = - _mm_add_epi8( - _mm_and_si128( - r_.n, - _mm_set1_epi8(0x33) - ), - _mm_and_si128( - _mm_srli_epi16(r_.n, 2), - _mm_set1_epi8(0x33) - ) - ); - - /* v = (v + (v >> 4)) & 0xf */ - r_.n = - _mm_and_si128( - _mm_add_epi8( - r_.n, - _mm_srli_epi16(r_.n, 4) - ), - _mm_set1_epi8(0x0f) - ); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_popcnt(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), a_.altivec_i8))); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - a_.u8 -= ((a_.u8 >> 1) & 0x55); - a_.u8 = ((a_.u8 & 0x33) + ((a_.u8 >> 2) & 0x33)); - a_.u8 = (a_.u8 + (a_.u8 >> 4)) & 15; - r_.u8 = a_.u8 >> ((sizeof(uint8_t) - 1) * CHAR_BIT); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - uint8_t v = HEDLEY_STATIC_CAST(uint8_t, a_.u8[i]); - v -= ((v >> 1) & 0x55); - v = (v & 0x33) + ((v >> 2) & 0x33); - v = (v + (v >> 4)) & 0xf; - r_.u8[i] = v >> (sizeof(uint8_t) - 1) * CHAR_BIT; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_popcnt_epi8 - #define _mm_popcnt_epi8(a) 
simde_mm_popcnt_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_popcnt_epi8 (simde__m128i src, simde__mmask16 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_popcnt_epi8(src, k, a); - #else - return simde_mm_mask_mov_epi8(src, k, simde_mm_popcnt_epi8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_popcnt_epi8 - #define _mm_mask_popcnt_epi8(src, k, a) simde_mm_mask_popcnt_epi8(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_popcnt_epi8 (simde__mmask16 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_popcnt_epi8(k, a); - #else - return simde_mm_maskz_mov_epi8(k, simde_mm_popcnt_epi8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_popcnt_epi8 - #define _mm_maskz_popcnt_epi8(k, a) simde_mm_maskz_popcnt_epi8(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_popcnt_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_popcnt_epi16(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vpaddlq_s8(vcntq_s8(a_.neon_i8)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_extadd_pairwise_i8x16(wasm_i8x16_popcnt(a_.wasm_v128)); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_u16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), vec_popcnt(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), a_.altivec_u16))); - #elif defined(SIMDE_X86_XOP_NATIVE) - const __m128i low_nibble_set = _mm_set1_epi8(0x0f); - const __m128i high_nibble_of_input = _mm_andnot_si128(low_nibble_set, a_.n); - const __m128i low_nibble_of_input = _mm_and_si128(low_nibble_set, a_.n); - const __m128i lut = _mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0); - - r_.n = - _mm_haddw_epi8( - _mm_add_epi8( - _mm_shuffle_epi8( - lut, - low_nibble_of_input - ), - _mm_shuffle_epi8( - lut, - _mm_srli_epi16(high_nibble_of_input, 4) - ) - ) - ); - #elif defined(SIMDE_X86_SSE2_NATIVE) - r_.n = - _mm_sub_epi16( - a_.n, - _mm_and_si128( - _mm_srli_epi16(a_.n, 1), - _mm_set1_epi16(0x5555) - ) - ); - - r_.n = - _mm_add_epi16( - _mm_and_si128( - r_.n, - _mm_set1_epi16(0x3333) - ), - _mm_and_si128( - _mm_srli_epi16(r_.n, 2), - _mm_set1_epi16(0x3333) - ) - ); - - r_.n = - _mm_and_si128( - _mm_add_epi16( - r_.n, - _mm_srli_epi16(r_.n, 4) - ), - _mm_set1_epi16(0x0f0f) - ); - - r_.n = - _mm_srli_epi16( - _mm_mullo_epi16( - r_.n, - _mm_set1_epi16(0x0101) - ), - (sizeof(uint16_t) - 1) * CHAR_BIT - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - a_.u16 -= ((a_.u16 >> 1) & UINT16_C(0x5555)); - a_.u16 = ((a_.u16 & UINT16_C(0x3333)) + ((a_.u16 >> 2) & UINT16_C(0x3333))); - a_.u16 = (a_.u16 + (a_.u16 >> 4)) & UINT16_C(0x0f0f); - r_.u16 = (a_.u16 * UINT16_C(0x0101)) >> ((sizeof(uint16_t) - 1) * CHAR_BIT); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - uint16_t v = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i]); - v -= ((v >> 1) & UINT16_C(0x5555)); - v = ((v & UINT16_C(0x3333)) + ((v >> 2) & UINT16_C(0x3333))); - v = (v + (v >> 4)) & UINT16_C(0x0f0f); - r_.u16[i] = 
HEDLEY_STATIC_CAST(uint16_t, (v * UINT16_C(0x0101))) >> ((sizeof(uint16_t) - 1) * CHAR_BIT); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_popcnt_epi16 - #define _mm_popcnt_epi16(a) simde_mm_popcnt_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_popcnt_epi16 (simde__m128i src, simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_popcnt_epi16(src, k, a); - #else - return simde_mm_mask_mov_epi16(src, k, simde_mm_popcnt_epi16(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_popcnt_epi16 - #define _mm_mask_popcnt_epi16(src, k, a) simde_mm_mask_popcnt_epi16(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_popcnt_epi16 (simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_popcnt_epi16(k, a); - #else - return simde_mm_maskz_mov_epi16(k, simde_mm_popcnt_epi16(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_popcnt_epi16 - #define _mm_maskz_popcnt_epi16(k, a) simde_mm_maskz_popcnt_epi16(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_popcnt_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_popcnt_epi32(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vpaddlq_s16(vpaddlq_s8(vcntq_s8(a_.neon_i8))); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_u32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), vec_popcnt(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), a_.altivec_u32))); - #elif defined(SIMDE_X86_XOP_NATIVE) - const __m128i low_nibble_set = _mm_set1_epi8(0x0f); - const __m128i high_nibble_of_input = _mm_andnot_si128(low_nibble_set, a_.n); - const __m128i low_nibble_of_input = _mm_and_si128(low_nibble_set, a_.n); - const __m128i lut = _mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0); - - r_.n = - _mm_haddd_epi8( - _mm_add_epi8( - _mm_shuffle_epi8( - lut, - low_nibble_of_input - ), - _mm_shuffle_epi8( - lut, - _mm_srli_epi16(high_nibble_of_input, 4) - ) - ) - ); - #elif defined(SIMDE_X86_SSE4_1_NATIVE) - r_.n = - _mm_sub_epi32( - a_.n, - _mm_and_si128( - _mm_srli_epi32(a_.n, 1), - _mm_set1_epi32(0x55555555) - ) - ); - - r_.n = - _mm_add_epi32( - _mm_and_si128( - r_.n, - _mm_set1_epi32(0x33333333) - ), - _mm_and_si128( - _mm_srli_epi32(r_.n, 2), - _mm_set1_epi32(0x33333333) - ) - ); - - r_.n = - _mm_and_si128( - _mm_add_epi32( - r_.n, - _mm_srli_epi32(r_.n, 4) - ), - _mm_set1_epi32(0x0f0f0f0f) - ); - - r_.n = - _mm_srli_epi32( - _mm_mullo_epi32( - r_.n, - _mm_set1_epi32(0x01010101) - ), - (sizeof(uint32_t) - 1) * CHAR_BIT - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - a_.u32 -= ((a_.u32 >> 1) & UINT32_C(0x55555555)); - a_.u32 = ((a_.u32 & UINT32_C(0x33333333)) + ((a_.u32 >> 2) & UINT32_C(0x33333333))); - a_.u32 = (a_.u32 + (a_.u32 >> 4)) & UINT32_C(0x0f0f0f0f); - r_.u32 = (a_.u32 * UINT32_C(0x01010101)) >> ((sizeof(uint32_t) - 1) * CHAR_BIT); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < 
(sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - uint32_t v = HEDLEY_STATIC_CAST(uint32_t, a_.u32[i]); - v -= ((v >> 1) & UINT32_C(0x55555555)); - v = ((v & UINT32_C(0x33333333)) + ((v >> 2) & UINT32_C(0x33333333))); - v = (v + (v >> 4)) & UINT32_C(0x0f0f0f0f); - r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (v * UINT32_C(0x01010101))) >> ((sizeof(uint32_t) - 1) * CHAR_BIT); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_popcnt_epi32 - #define _mm_popcnt_epi32(a) simde_mm_popcnt_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_popcnt_epi32 (simde__m128i src, simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_popcnt_epi32(src, k, a); - #else - return simde_mm_mask_mov_epi32(src, k, simde_mm_popcnt_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_popcnt_epi32 - #define _mm_mask_popcnt_epi32(src, k, a) simde_mm_mask_popcnt_epi32(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_popcnt_epi32 (simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_popcnt_epi32(k, a); - #else - return simde_mm_maskz_mov_epi32(k, simde_mm_popcnt_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_popcnt_epi32 - #define _mm_maskz_popcnt_epi32(k, a) simde_mm_maskz_popcnt_epi32(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_popcnt_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_popcnt_epi64(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vpaddlq_s32(vpaddlq_s16(vpaddlq_s8(vcntq_s8(a_.neon_i8)))); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_u64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), vec_popcnt(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), a_.altivec_u64))); - #elif defined(SIMDE_X86_SSSE3_NATIVE) - const __m128i low_nibble_set = _mm_set1_epi8(0x0f); - const __m128i high_nibble_of_input = _mm_andnot_si128(low_nibble_set, a_.n); - const __m128i low_nibble_of_input = _mm_and_si128(low_nibble_set, a_.n); - const __m128i lut = _mm_set_epi8(4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0); - - r_.n = - _mm_sad_epu8( - _mm_add_epi8( - _mm_shuffle_epi8( - lut, - low_nibble_of_input - ), - _mm_shuffle_epi8( - lut, - _mm_srli_epi16(high_nibble_of_input, 4) - ) - ), - _mm_setzero_si128() - ); - #elif defined(SIMDE_X86_SSE2_NATIVE) - r_.n = - _mm_sub_epi8( - a_.n, - _mm_and_si128( - _mm_srli_epi16(a_.n, 1), - _mm_set1_epi8(0x55) - ) - ); - - r_.n = - _mm_add_epi8( - _mm_and_si128( - r_.n, - _mm_set1_epi8(0x33) - ), - _mm_and_si128( - _mm_srli_epi16(r_.n, 2), - _mm_set1_epi8(0x33) - ) - ); - - r_.n = - _mm_and_si128( - _mm_add_epi8( - r_.n, - _mm_srli_epi16(r_.n, 4) - ), - _mm_set1_epi8(0x0f) - ); - - r_.n = - _mm_sad_epu8( - r_.n, - _mm_setzero_si128() - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - a_.u64 -= ((a_.u64 >> 1) & UINT64_C(0x5555555555555555)); - a_.u64 = ((a_.u64 & 
UINT64_C(0x3333333333333333)) + ((a_.u64 >> 2) & UINT64_C(0x3333333333333333))); - a_.u64 = (a_.u64 + (a_.u64 >> 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f); - r_.u64 = (a_.u64 * UINT64_C(0x0101010101010101)) >> ((sizeof(uint64_t) - 1) * CHAR_BIT); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - uint64_t v = HEDLEY_STATIC_CAST(uint64_t, a_.u64[i]); - v -= ((v >> 1) & UINT64_C(0x5555555555555555)); - v = ((v & UINT64_C(0x3333333333333333)) + ((v >> 2) & UINT64_C(0x3333333333333333))); - v = (v + (v >> 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f); - r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, (v * UINT64_C(0x0101010101010101))) >> ((sizeof(uint64_t) - 1) * CHAR_BIT); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_popcnt_epi64 - #define _mm_popcnt_epi64(a) simde_mm_popcnt_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_popcnt_epi64 (simde__m128i src, simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_popcnt_epi64(src, k, a); - #else - return simde_mm_mask_mov_epi64(src, k, simde_mm_popcnt_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_popcnt_epi64 - #define _mm_mask_popcnt_epi64(src, k, a) simde_mm_mask_popcnt_epi64(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_popcnt_epi64 (simde__mmask8 k, simde__m128i a) { - #if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_popcnt_epi64(k, a); - #else - return simde_mm_maskz_mov_epi64(k, simde_mm_popcnt_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_popcnt_epi64 - #define _mm_maskz_popcnt_epi64(k, a) simde_mm_maskz_popcnt_epi64(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_popcnt_epi8 (simde__m256i a) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_popcnt_epi8(a); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_popcnt_epi8(a_.m128i[i]); - } - #elif defined(SIMDE_X86_AVX2_NATIVE) - const __m256i low_nibble_set = _mm256_set1_epi8(0x0f); - const __m256i high_nibble_of_input = _mm256_andnot_si256(low_nibble_set, a_.n); - const __m256i low_nibble_of_input = _mm256_and_si256(low_nibble_set, a_.n); - const __m256i lut = - _mm256_set_epi8( - 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, - 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0 - ); - - r_.n = - _mm256_add_epi8( - _mm256_shuffle_epi8( - lut, - low_nibble_of_input - ), - _mm256_shuffle_epi8( - lut, - _mm256_srli_epi16( - high_nibble_of_input, - 4 - ) - ) - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - a_.u8 -= ((a_.u8 >> 1) & 0x55); - a_.u8 = ((a_.u8 & 0x33) + ((a_.u8 >> 2) & 0x33)); - a_.u8 = (a_.u8 + (a_.u8 >> 4)) & 15; - r_.u8 = a_.u8 >> ((sizeof(uint8_t) - 1) * CHAR_BIT); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - uint8_t v = HEDLEY_STATIC_CAST(uint8_t, a_.u8[i]); - v -= ((v >> 1) & 0x55); - v = (v & 0x33) + ((v >> 2) 
& 0x33); - v = (v + (v >> 4)) & 0xf; - r_.u8[i] = v >> (sizeof(uint8_t) - 1) * CHAR_BIT; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_popcnt_epi8 - #define _mm256_popcnt_epi8(a) simde_mm256_popcnt_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_popcnt_epi8 (simde__m256i src, simde__mmask32 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_popcnt_epi8(src, k, a); - #else - return simde_mm256_mask_mov_epi8(src, k, simde_mm256_popcnt_epi8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_popcnt_epi8 - #define _mm256_mask_popcnt_epi8(src, k, a) simde_mm256_mask_popcnt_epi8(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_popcnt_epi8 (simde__mmask32 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_popcnt_epi8(k, a); - #else - return simde_mm256_maskz_mov_epi8(k, simde_mm256_popcnt_epi8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_popcnt_epi8 - #define _mm256_maskz_popcnt_epi8(k, a) simde_mm256_maskz_popcnt_epi8(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_popcnt_epi16 (simde__m256i a) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_popcnt_epi16(a); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_popcnt_epi16(a_.m128i[i]); - } - #elif defined(SIMDE_X86_AVX2_NATIVE) - r_.n = - _mm256_sub_epi16( - a_.n, - _mm256_and_si256( - _mm256_srli_epi16(a_.n, 1), - _mm256_set1_epi16(0x5555) - ) - ); - - r_.n = - _mm256_add_epi16( - _mm256_and_si256( - r_.n, - _mm256_set1_epi16(0x3333) - ), - _mm256_and_si256( - _mm256_srli_epi16(r_.n, 2), - _mm256_set1_epi16(0x3333) - ) - ); - - r_.n = - _mm256_and_si256( - _mm256_add_epi16( - r_.n, - _mm256_srli_epi16(r_.n, 4) - ), - _mm256_set1_epi16(0x0f0f) - ); - - r_.n = - _mm256_srli_epi16( - _mm256_mullo_epi16( - r_.n, - _mm256_set1_epi16(0x0101) - ), - (sizeof(uint16_t) - 1) * CHAR_BIT - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - a_.u16 -= ((a_.u16 >> 1) & UINT16_C(0x5555)); - a_.u16 = ((a_.u16 & UINT16_C(0x3333)) + ((a_.u16 >> 2) & UINT16_C(0x3333))); - a_.u16 = (a_.u16 + (a_.u16 >> 4)) & UINT16_C(0x0f0f); - r_.u16 = (a_.u16 * UINT16_C(0x0101)) >> ((sizeof(uint16_t) - 1) * CHAR_BIT); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - uint16_t v = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i]); - v -= ((v >> 1) & UINT16_C(0x5555)); - v = ((v & UINT16_C(0x3333)) + ((v >> 2) & UINT16_C(0x3333))); - v = (v + (v >> 4)) & UINT16_C(0x0f0f); - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (v * UINT16_C(0x0101))) >> ((sizeof(uint16_t) - 1) * CHAR_BIT); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_popcnt_epi16 - #define _mm256_popcnt_epi16(a) simde_mm256_popcnt_epi16(a) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_popcnt_epi16 (simde__m256i src, simde__mmask16 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_popcnt_epi16(src, k, a); - #else - return simde_mm256_mask_mov_epi16(src, k, simde_mm256_popcnt_epi16(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_popcnt_epi16 - #define _mm256_mask_popcnt_epi16(src, k, a) simde_mm256_mask_popcnt_epi16(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_popcnt_epi16 (simde__mmask16 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_popcnt_epi16(k, a); - #else - return simde_mm256_maskz_mov_epi16(k, simde_mm256_popcnt_epi16(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_popcnt_epi16 - #define _mm256_maskz_popcnt_epi16(k, a) simde_mm256_maskz_popcnt_epi16(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_popcnt_epi32 (simde__m256i a) { - #if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_popcnt_epi32(a); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_popcnt_epi32(a_.m128i[i]); - } - #elif defined(SIMDE_X86_AVX2_NATIVE) - r_.n = - _mm256_sub_epi32( - a_.n, - _mm256_and_si256( - _mm256_srli_epi32(a_.n, 1), - _mm256_set1_epi32(0x55555555) - ) - ); - - r_.n = - _mm256_add_epi32( - _mm256_and_si256( - r_.n, - _mm256_set1_epi32(0x33333333) - ), - _mm256_and_si256( - _mm256_srli_epi32(r_.n, 2), - _mm256_set1_epi32(0x33333333) - ) - ); - - r_.n = - _mm256_and_si256( - _mm256_add_epi32( - r_.n, - _mm256_srli_epi32(r_.n, 4) - ), - _mm256_set1_epi32(0x0f0f0f0f) - ); - - r_.n = - _mm256_srli_epi32( - _mm256_mullo_epi32( - r_.n, - _mm256_set1_epi32(0x01010101) - ), - (sizeof(uint32_t) - 1) * CHAR_BIT - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - a_.u32 -= ((a_.u32 >> 1) & UINT32_C(0x55555555)); - a_.u32 = ((a_.u32 & UINT32_C(0x33333333)) + ((a_.u32 >> 2) & UINT32_C(0x33333333))); - a_.u32 = (a_.u32 + (a_.u32 >> 4)) & UINT32_C(0x0f0f0f0f); - r_.u32 = (a_.u32 * UINT32_C(0x01010101)) >> ((sizeof(uint32_t) - 1) * CHAR_BIT); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - uint32_t v = HEDLEY_STATIC_CAST(uint32_t, a_.u32[i]); - v -= ((v >> 1) & UINT32_C(0x55555555)); - v = ((v & UINT32_C(0x33333333)) + ((v >> 2) & UINT32_C(0x33333333))); - v = (v + (v >> 4)) & UINT32_C(0x0f0f0f0f); - r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (v * UINT32_C(0x01010101))) >> ((sizeof(uint32_t) - 1) * CHAR_BIT); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_popcnt_epi32 - #define _mm256_popcnt_epi32(a) simde_mm256_popcnt_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_popcnt_epi32 (simde__m256i src, simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_popcnt_epi32(src, k, a); - #else - return 
simde_mm256_mask_mov_epi32(src, k, simde_mm256_popcnt_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_popcnt_epi32 - #define _mm256_mask_popcnt_epi32(src, k, a) simde_mm256_mask_popcnt_epi32(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_popcnt_epi32 (simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_popcnt_epi32(k, a); - #else - return simde_mm256_maskz_mov_epi32(k, simde_mm256_popcnt_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_popcnt_epi32 - #define _mm256_maskz_popcnt_epi32(k, a) simde_mm256_maskz_popcnt_epi32(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_popcnt_epi64 (simde__m256i a) { - #if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_popcnt_epi64(a); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < sizeof(r_.m128i) / sizeof(r_.m128i[0]) ; i++) { - r_.m128i[i] = simde_mm_popcnt_epi64(a_.m128i[i]); - } - #elif defined(SIMDE_X86_AVX2_NATIVE) - const __m256i low_nibble_set = _mm256_set1_epi8(0x0f); - const __m256i high_nibble_of_input = _mm256_andnot_si256(low_nibble_set, a_.n); - const __m256i low_nibble_of_input = _mm256_and_si256(low_nibble_set, a_.n); - const __m256i lut = - _mm256_set_epi8( - 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, - 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0 - ); - - r_.n = - _mm256_sad_epu8( - _mm256_add_epi8( - _mm256_shuffle_epi8( - lut, - low_nibble_of_input - ), - _mm256_shuffle_epi8( - lut, - _mm256_srli_epi16(high_nibble_of_input, 4) - ) - ), - _mm256_setzero_si256() - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - a_.u64 -= ((a_.u64 >> 1) & UINT64_C(0x5555555555555555)); - a_.u64 = ((a_.u64 & UINT64_C(0x3333333333333333)) + ((a_.u64 >> 2) & UINT64_C(0x3333333333333333))); - a_.u64 = (a_.u64 + (a_.u64 >> 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f); - r_.u64 = (a_.u64 * UINT64_C(0x0101010101010101)) >> ((sizeof(uint64_t) - 1) * CHAR_BIT); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - uint64_t v = HEDLEY_STATIC_CAST(uint64_t, a_.u64[i]); - v -= ((v >> 1) & UINT64_C(0x5555555555555555)); - v = ((v & UINT64_C(0x3333333333333333)) + ((v >> 2) & UINT64_C(0x3333333333333333))); - v = (v + (v >> 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f); - r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, (v * UINT64_C(0x0101010101010101))) >> ((sizeof(uint64_t) - 1) * CHAR_BIT); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_popcnt_epi64 - #define _mm256_popcnt_epi64(a) simde_mm256_popcnt_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_popcnt_epi64 (simde__m256i src, simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_popcnt_epi64(src, k, a); - #else - return simde_mm256_mask_mov_epi64(src, k, simde_mm256_popcnt_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef 
_mm256_mask_popcnt_epi64 - #define _mm256_mask_popcnt_epi64(src, k, a) simde_mm256_mask_popcnt_epi64(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_popcnt_epi64 (simde__mmask8 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_popcnt_epi64(k, a); - #else - return simde_mm256_maskz_mov_epi64(k, simde_mm256_popcnt_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_popcnt_epi64 - #define _mm256_maskz_popcnt_epi64(k, a) simde_mm256_maskz_popcnt_epi64(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_popcnt_epi8 (simde__m512i a) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) - return _mm512_popcnt_epi8(a); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_popcnt_epi8(a_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_popcnt_epi8(a_.m256i[i]); - } - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - const __m512i low_nibble_set = _mm512_set1_epi8(0x0f); - const __m512i high_nibble_of_input = _mm512_andnot_si512(low_nibble_set, a_.n); - const __m512i low_nibble_of_input = _mm512_and_si512(low_nibble_set, a_.n); - const __m512i lut = - simde_mm512_set_epi8( - 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, - 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, - 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, - 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0 - ); - - r_.n = - _mm512_add_epi8( - _mm512_shuffle_epi8( - lut, - low_nibble_of_input - ), - _mm512_shuffle_epi8( - lut, - _mm512_srli_epi16( - high_nibble_of_input, - 4 - ) - ) - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - a_.u8 -= ((a_.u8 >> 1) & 0x55); - a_.u8 = ((a_.u8 & 0x33) + ((a_.u8 >> 2) & 0x33)); - a_.u8 = (a_.u8 + (a_.u8 >> 4)) & 15; - r_.u8 = a_.u8 >> ((sizeof(uint8_t) - 1) * CHAR_BIT); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - uint8_t v = HEDLEY_STATIC_CAST(uint8_t, a_.u8[i]); - v -= ((v >> 1) & 0x55); - v = (v & 0x33) + ((v >> 2) & 0x33); - v = (v + (v >> 4)) & 0xf; - r_.u8[i] = v >> (sizeof(uint8_t) - 1) * CHAR_BIT; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) - #undef _mm512_popcnt_epi8 - #define _mm512_popcnt_epi8(a) simde_mm512_popcnt_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_popcnt_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) - return _mm512_mask_popcnt_epi8(src, k, a); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_popcnt_epi8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_popcnt_epi8 - #define _mm512_mask_popcnt_epi8(src, k, a) simde_mm512_mask_popcnt_epi8(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_popcnt_epi8 (simde__mmask64 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) - return _mm512_maskz_popcnt_epi8(k, a); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_popcnt_epi8(a)); - #endif -} -#if 
defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_popcnt_epi8 - #define _mm512_maskz_popcnt_epi8(k, a) simde_mm512_maskz_popcnt_epi8(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_popcnt_epi16 (simde__m512i a) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) - return _mm512_popcnt_epi16(a); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_popcnt_epi16(a_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_popcnt_epi16(a_.m256i[i]); - } - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - r_.n = - _mm512_sub_epi16( - a_.n, - _mm512_and_si512( - _mm512_srli_epi16(a_.n, 1), - _mm512_set1_epi16(0x5555) - ) - ); - - r_.n = - _mm512_add_epi16( - _mm512_and_si512( - r_.n, - _mm512_set1_epi16(0x3333) - ), - _mm512_and_si512( - _mm512_srli_epi16(r_.n, 2), - _mm512_set1_epi16(0x3333) - ) - ); - - r_.n = - _mm512_and_si512( - _mm512_add_epi16( - r_.n, - _mm512_srli_epi16(r_.n, 4) - ), - _mm512_set1_epi16(0x0f0f) - ); - - r_.n = - _mm512_srli_epi16( - _mm512_mullo_epi16( - r_.n, - _mm512_set1_epi16(0x0101) - ), - (sizeof(uint16_t) - 1) * CHAR_BIT - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - a_.u16 -= ((a_.u16 >> 1) & UINT16_C(0x5555)); - a_.u16 = ((a_.u16 & UINT16_C(0x3333)) + ((a_.u16 >> 2) & UINT16_C(0x3333))); - a_.u16 = (a_.u16 + (a_.u16 >> 4)) & UINT16_C(0x0f0f); - r_.u16 = (a_.u16 * UINT16_C(0x0101)) >> ((sizeof(uint16_t) - 1) * CHAR_BIT); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - uint16_t v = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i]); - v -= ((v >> 1) & UINT16_C(0x5555)); - v = ((v & UINT16_C(0x3333)) + ((v >> 2) & UINT16_C(0x3333))); - v = (v + (v >> 4)) & UINT16_C(0x0f0f); - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (v * UINT16_C(0x0101))) >> ((sizeof(uint16_t) - 1) * CHAR_BIT); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) - #undef _mm512_popcnt_epi16 - #define _mm512_popcnt_epi16(a) simde_mm512_popcnt_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_popcnt_epi16 (simde__m512i src, simde__mmask32 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) - return _mm512_mask_popcnt_epi16(src, k, a); - #else - return simde_mm512_mask_mov_epi16(src, k, simde_mm512_popcnt_epi16(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_popcnt_epi16 - #define _mm512_mask_popcnt_epi16(src, k, a) simde_mm512_mask_popcnt_epi16(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_popcnt_epi16 (simde__mmask32 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BITALG_NATIVE) - return _mm512_maskz_popcnt_epi16(k, a); - #else - return simde_mm512_maskz_mov_epi16(k, simde_mm512_popcnt_epi16(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BITALG_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_popcnt_epi16 - #define _mm512_maskz_popcnt_epi16(k, a) simde_mm512_maskz_popcnt_epi16(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_popcnt_epi32 (simde__m512i a) { - #if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) - return _mm512_popcnt_epi32(a); - #else - simde__m512i_private - r_, - a_ = 
simde__m512i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_popcnt_epi32(a_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_popcnt_epi32(a_.m256i[i]); - } - #elif defined(SIMDE_X86_AVX512F_NATIVE) - r_.n = - _mm512_sub_epi32( - a_.n, - _mm512_and_si512( - _mm512_srli_epi32(a_.n, 1), - _mm512_set1_epi32(0x55555555) - ) - ); - - r_.n = - _mm512_add_epi32( - _mm512_and_si512( - r_.n, - _mm512_set1_epi32(0x33333333) - ), - _mm512_and_si512( - _mm512_srli_epi32(r_.n, 2), - _mm512_set1_epi32(0x33333333) - ) - ); - - r_.n = - _mm512_and_si512( - _mm512_add_epi32( - r_.n, - _mm512_srli_epi32(r_.n, 4) - ), - _mm512_set1_epi32(0x0f0f0f0f) - ); - - r_.n = - _mm512_srli_epi32( - _mm512_mullo_epi32( - r_.n, - _mm512_set1_epi32(0x01010101) - ), - (sizeof(uint32_t) - 1) * CHAR_BIT - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - a_.u32 -= ((a_.u32 >> 1) & UINT32_C(0x55555555)); - a_.u32 = ((a_.u32 & UINT32_C(0x33333333)) + ((a_.u32 >> 2) & UINT32_C(0x33333333))); - a_.u32 = (a_.u32 + (a_.u32 >> 4)) & UINT32_C(0x0f0f0f0f); - r_.u32 = (a_.u32 * UINT32_C(0x01010101)) >> ((sizeof(uint32_t) - 1) * CHAR_BIT); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - uint32_t v = HEDLEY_STATIC_CAST(uint32_t, a_.u32[i]); - v -= ((v >> 1) & UINT32_C(0x55555555)); - v = ((v & UINT32_C(0x33333333)) + ((v >> 2) & UINT32_C(0x33333333))); - v = (v + (v >> 4)) & UINT32_C(0x0f0f0f0f); - r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (v * UINT32_C(0x01010101))) >> ((sizeof(uint32_t) - 1) * CHAR_BIT); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_popcnt_epi32 - #define _mm512_popcnt_epi32(a) simde_mm512_popcnt_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_popcnt_epi32 (simde__m512i src, simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) - return _mm512_mask_popcnt_epi32(src, k, a); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_popcnt_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_popcnt_epi32 - #define _mm512_mask_popcnt_epi32(src, k, a) simde_mm512_mask_popcnt_epi32(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_popcnt_epi32 (simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) - return _mm512_maskz_popcnt_epi32(k, a); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_popcnt_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_popcnt_epi32 - #define _mm512_maskz_popcnt_epi32(k, a) simde_mm512_maskz_popcnt_epi32(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_popcnt_epi64 (simde__m512i a) { - #if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) - return _mm512_popcnt_epi64(a); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_popcnt_epi64(a_.m128i[i]); - } - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < sizeof(r_.m256i) / sizeof(r_.m256i[0]) ; i++) { - r_.m256i[i] = 
simde_mm256_popcnt_epi64(a_.m256i[i]); - } - #elif defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - const __m512i low_nibble_set = _mm512_set1_epi8(0x0f); - const __m512i high_nibble_of_input = _mm512_andnot_si512(low_nibble_set, a_.n); - const __m512i low_nibble_of_input = _mm512_and_si512(low_nibble_set, a_.n); - const __m512i lut = - simde_mm512_set_epi8( - 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, - 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, - 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, - 4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0 - ); - - r_.n = - _mm512_sad_epu8( - _mm512_add_epi8( - _mm512_shuffle_epi8( - lut, - low_nibble_of_input - ), - _mm512_shuffle_epi8( - lut, - _mm512_srli_epi16(high_nibble_of_input, 4) - ) - ), - _mm512_setzero_si512() - ); - #elif defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512DQ_NATIVE) - r_.n = - _mm512_sub_epi64( - a_.n, - _mm512_and_si512( - _mm512_srli_epi64(a_.n, 1), - _mm512_set1_epi64(0x5555555555555555) - ) - ); - - r_.n = - _mm512_add_epi64( - _mm512_and_si512( - r_.n, - _mm512_set1_epi64(0x3333333333333333) - ), - _mm512_and_si512( - _mm512_srli_epi64(r_.n, 2), - _mm512_set1_epi64(0x3333333333333333) - ) - ); - - r_.n = - _mm512_and_si512( - _mm512_add_epi64( - r_.n, - _mm512_srli_epi64(r_.n, 4) - ), - _mm512_set1_epi64(0x0f0f0f0f0f0f0f0f) - ); - - r_.n = - _mm512_srli_epi64( - _mm512_mullo_epi64( - r_.n, - _mm512_set1_epi64(0x0101010101010101) - ), - (sizeof(uint64_t) - 1) * CHAR_BIT - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - a_.u64 -= ((a_.u64 >> 1) & UINT64_C(0x5555555555555555)); - a_.u64 = ((a_.u64 & UINT64_C(0x3333333333333333)) + ((a_.u64 >> 2) & UINT64_C(0x3333333333333333))); - a_.u64 = (a_.u64 + (a_.u64 >> 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f); - r_.u64 = (a_.u64 * UINT64_C(0x0101010101010101)) >> ((sizeof(uint64_t) - 1) * CHAR_BIT); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - uint64_t v = HEDLEY_STATIC_CAST(uint64_t, a_.u64[i]); - v -= ((v >> 1) & UINT64_C(0x5555555555555555)); - v = ((v & UINT64_C(0x3333333333333333)) + ((v >> 2) & UINT64_C(0x3333333333333333))); - v = (v + (v >> 4)) & UINT64_C(0x0f0f0f0f0f0f0f0f); - r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, (v * UINT64_C(0x0101010101010101))) >> ((sizeof(uint64_t) - 1) * CHAR_BIT); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_popcnt_epi64 - #define _mm512_popcnt_epi64(a) simde_mm512_popcnt_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_popcnt_epi64 (simde__m512i src, simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) - return _mm512_mask_popcnt_epi64(src, k, a); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_popcnt_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_popcnt_epi64 - #define _mm512_mask_popcnt_epi64(src, k, a) simde_mm512_mask_popcnt_epi64(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_popcnt_epi64 (simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512VPOPCNTDQ_NATIVE) - return _mm512_maskz_popcnt_epi64(k, a); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_popcnt_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512VPOPCNTDQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_popcnt_epi64 - #define _mm512_maskz_popcnt_epi64(k, a) simde_mm512_maskz_popcnt_epi64(k, a) 
-#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_POPCNT_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/range.h b/ffi-deps/simde/simde/x86/avx512/range.h deleted file mode 100644 index 1d8c0fb..0000000 --- a/ffi-deps/simde/simde/x86/avx512/range.h +++ /dev/null @@ -1,745 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_RANGE_H) -#define SIMDE_X86_AVX512_RANGE_H - -#include "types.h" -#include "max.h" -#include "min.h" -#include "set1.h" -#include "copysign.h" -#include "abs.h" -#include "setzero.h" -#include "cmp.h" -#include "or.h" -#include "andnot.h" -#include "insert.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_range_ps (simde__m128 a, simde__m128 b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m128 r; - - switch (imm8 & 3) { - case 0: - r = simde_mm_min_ps(a, b); - break; - case 1: - r = simde_mm_max_ps(a, b); - break; - case 2: - r = simde_x_mm_select_ps(b, a, simde_mm_cmple_ps(simde_x_mm_abs_ps(a), simde_x_mm_abs_ps(b))); - break; - case 3: - r = simde_x_mm_select_ps(b, a, simde_mm_cmpge_ps(simde_x_mm_abs_ps(a), simde_x_mm_abs_ps(b))); - break; - default: - break; - } - - switch (imm8 & 12) { - case 0: - r = simde_x_mm_copysign_ps(r, a); - break; - case 8: - r = simde_mm_andnot_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(-0.0)), r); - break; - case 12: - r = simde_mm_or_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(-0.0)), r); - break; - default: - break; - } - - return r; -} -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_range_ps(a, b, imm8) _mm_range_ps((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_range_ps - #define _mm_range_ps(a, b, imm8) simde_mm_range_ps(a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_mask_range_ps(src, k, a, b, imm8) _mm_mask_range_ps(src, k, a, b, imm8) -#else - #define simde_mm_mask_range_ps(src, k, a, b, imm8) simde_mm_mask_mov_ps(src, k, simde_mm_range_ps(a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_range_ps - #define _mm_mask_range_ps(src, k, a, b, imm8) simde_mm_mask_range_ps(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_maskz_range_ps(k, a, b, imm8) _mm_maskz_range_ps(k, a, b, imm8) -#else - #define simde_mm_maskz_range_ps(k, a, b, imm8) simde_mm_maskz_mov_ps(k, simde_mm_range_ps(a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_range_ps - #define _mm_maskz_range_ps(k, a, b, imm8) simde_mm_maskz_range_ps(k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_range_ps (simde__m256 a, simde__m256 b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m256 r; - - switch (imm8 & 3) { - case 0: - r = simde_mm256_min_ps(a, b); - break; - case 1: - r = simde_mm256_max_ps(a, b); - break; - case 2: - r = simde_x_mm256_select_ps(b, a, simde_mm256_cmp_ps(simde_x_mm256_abs_ps(a), simde_x_mm256_abs_ps(b), SIMDE_CMP_LE_OQ)); - break; - case 3: - r = simde_x_mm256_select_ps(b, a, simde_mm256_cmp_ps(simde_x_mm256_abs_ps(a), simde_x_mm256_abs_ps(b), SIMDE_CMP_GE_OQ)); - break; - default: - break; - } - - switch 
(imm8 & 12) { - case 0: - r = simde_x_mm256_copysign_ps(r, a); - break; - case 8: - r = simde_mm256_andnot_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C(-0.0)), r); - break; - case 12: - r = simde_mm256_or_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C(-0.0)), r); - break; - default: - break; - } - - return r; -} -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_range_ps(a, b, imm8) _mm256_range_ps((a), (b), (imm8)) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm256_range_ps(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m256_private \ - simde_mm256_range_ps_r_ = simde__m256_to_private(simde_mm256_setzero_ps()), \ - simde_mm256_range_ps_a_ = simde__m256_to_private(a), \ - simde_mm256_range_ps_b_ = simde__m256_to_private(b); \ - \ - for (size_t simde_mm256_range_ps_i = 0 ; simde_mm256_range_ps_i < (sizeof(simde_mm256_range_ps_r_.m128) / sizeof(simde_mm256_range_ps_r_.m128[0])) ; simde_mm256_range_ps_i++) { \ - simde_mm256_range_ps_r_.m128[simde_mm256_range_ps_i] = simde_mm_range_ps(simde_mm256_range_ps_a_.m128[simde_mm256_range_ps_i], simde_mm256_range_ps_b_.m128[simde_mm256_range_ps_i], imm8); \ - } \ - \ - simde__m256_from_private(simde_mm256_range_ps_r_); \ - })) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_range_ps - #define _mm256_range_ps(a, b, imm8) simde_mm256_range_ps(a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_mask_range_ps(src, k, a, b, imm8) _mm256_mask_range_ps(src, k, a, b, imm8) -#else - #define simde_mm256_mask_range_ps(src, k, a, b, imm8) simde_mm256_mask_mov_ps(src, k, simde_mm256_range_ps(a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_range_ps - #define _mm256_mask_range_ps(src, k, a, b, imm8) simde_mm256_mask_range_ps(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_maskz_range_ps(k, a, b, imm8) _mm256_maskz_range_ps(k, a, b, imm8) -#else - #define simde_mm256_maskz_range_ps(k, a, b, imm8) simde_mm256_maskz_mov_ps(k, simde_mm256_range_ps(a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_range_ps - #define _mm256_maskz_range_ps(k, a, b, imm8) simde_mm256_maskz_range_ps(k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_range_ps (simde__m512 a, simde__m512 b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m512 r; - - switch (imm8 & 3) { - case 0: - r = simde_mm512_min_ps(a, b); - break; - case 1: - r = simde_mm512_max_ps(a, b); - break; - case 2: - r = simde_mm512_mask_mov_ps(b, simde_mm512_cmp_ps_mask(simde_mm512_abs_ps(a), simde_mm512_abs_ps(b), SIMDE_CMP_LE_OS), a); - break; - case 3: - r = simde_mm512_mask_mov_ps(a, simde_mm512_cmp_ps_mask(simde_mm512_abs_ps(b), simde_mm512_abs_ps(a), SIMDE_CMP_GE_OS), b); - break; - default: - break; - } - - switch (imm8 & 12) { - case 0: - r = simde_x_mm512_copysign_ps(r, a); - break; - case 8: - r = simde_mm512_andnot_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C(-0.0)), r); - break; - case 12: - r = simde_mm512_or_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C(-0.0)), r); - break; - default: - break; - } - - return r; -} -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && 
defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm512_range_ps(a, b, imm8) _mm512_range_ps((a), (b), (imm8)) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_range_ps(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512_private \ - simde_mm512_range_ps_r_ = simde__m512_to_private(simde_mm512_setzero_ps()), \ - simde_mm512_range_ps_a_ = simde__m512_to_private(a), \ - simde_mm512_range_ps_b_ = simde__m512_to_private(b); \ - \ - for (size_t simde_mm512_range_ps_i = 0 ; simde_mm512_range_ps_i < (sizeof(simde_mm512_range_ps_r_.m128) / sizeof(simde_mm512_range_ps_r_.m128[0])) ; simde_mm512_range_ps_i++) { \ - simde_mm512_range_ps_r_.m128[simde_mm512_range_ps_i] = simde_mm_range_ps(simde_mm512_range_ps_a_.m128[simde_mm512_range_ps_i], simde_mm512_range_ps_b_.m128[simde_mm512_range_ps_i], imm8); \ - } \ - \ - simde__m512_from_private(simde_mm512_range_ps_r_); \ - })) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_range_ps(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512_private \ - simde_mm512_range_ps_r_ = simde__m512_to_private(simde_mm512_setzero_ps()), \ - simde_mm512_range_ps_a_ = simde__m512_to_private(a), \ - simde_mm512_range_ps_b_ = simde__m512_to_private(b); \ - \ - for (size_t simde_mm512_range_ps_i = 0 ; simde_mm512_range_ps_i < (sizeof(simde_mm512_range_ps_r_.m256) / sizeof(simde_mm512_range_ps_r_.m256[0])) ; simde_mm512_range_ps_i++) { \ - simde_mm512_range_ps_r_.m256[simde_mm512_range_ps_i] = simde_mm256_range_ps(simde_mm512_range_ps_a_.m256[simde_mm512_range_ps_i], simde_mm512_range_ps_b_.m256[simde_mm512_range_ps_i], imm8); \ - } \ - \ - simde__m512_from_private(simde_mm512_range_ps_r_); \ - })) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_range_ps - #define _mm512_range_ps(a, b, imm8) simde_mm512_range_ps(a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_mask_range_ps(src, k, a, b, imm8) _mm512_mask_range_ps(src, k, a, b, imm8) -#else - #define simde_mm512_mask_range_ps(src, k, a, b, imm8) simde_mm512_mask_mov_ps(src, k, simde_mm512_range_ps(a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_range_ps - #define _mm512_mask_range_ps(src, k, a, b, imm8) simde_mm512_mask_range_ps(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_maskz_range_ps(k, a, b, imm8) _mm512_maskz_range_ps(k, a, b, imm8) -#else - #define simde_mm512_maskz_range_ps(k, a, b, imm8) simde_mm512_maskz_mov_ps(k, simde_mm512_range_ps(a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_range_ps - #define _mm512_maskz_range_ps(k, a, b, imm8) simde_mm512_maskz_range_ps(k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_range_pd (simde__m128d a, simde__m128d b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m128d r; - - switch (imm8 & 3) { - case 0: - r = simde_mm_min_pd(a, b); - break; - case 1: - r = simde_mm_max_pd(a, b); - break; - case 2: - r = simde_x_mm_select_pd(b, a, simde_mm_cmple_pd(simde_x_mm_abs_pd(a), simde_x_mm_abs_pd(b))); - break; - case 3: - r = simde_x_mm_select_pd(b, a, simde_mm_cmpge_pd(simde_x_mm_abs_pd(a), simde_x_mm_abs_pd(b))); - break; - default: - break; - } - - switch (imm8 & 12) { - case 0: - r = simde_x_mm_copysign_pd(r, a); - break; - case 8: - r = simde_mm_andnot_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C(-0.0)), r); - break; - case 12: - r = 
simde_mm_or_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C(-0.0)), r); - break; - default: - break; - } - - return r; -} -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_range_pd(a, b, imm8) _mm_range_pd((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_range_pd - #define _mm_range_pd(a, b, imm8) simde_mm_range_pd(a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_mask_range_pd(src, k, a, b, imm8) _mm_mask_range_pd(src, k, a, b, imm8) -#else - #define simde_mm_mask_range_pd(src, k, a, b, imm8) simde_mm_mask_mov_pd(src, k, simde_mm_range_pd(a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_range_pd - #define _mm_mask_range_pd(src, k, a, b, imm8) simde_mm_mask_range_pd(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_maskz_range_pd(k, a, b, imm8) _mm_maskz_range_pd(k, a, b, imm8) -#else - #define simde_mm_maskz_range_pd(k, a, b, imm8) simde_mm_maskz_mov_pd(k, simde_mm_range_pd(a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_range_pd - #define _mm_maskz_range_pd(k, a, b, imm8) simde_mm_maskz_range_pd(k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_range_pd (simde__m256d a, simde__m256d b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m256d r; - - switch (imm8 & 3) { - case 0: - r = simde_mm256_min_pd(a, b); - break; - case 1: - r = simde_mm256_max_pd(a, b); - break; - case 2: - r = simde_x_mm256_select_pd(b, a, simde_mm256_cmp_pd(simde_x_mm256_abs_pd(a), simde_x_mm256_abs_pd(b), SIMDE_CMP_LE_OQ)); - break; - case 3: - r = simde_x_mm256_select_pd(b, a, simde_mm256_cmp_pd(simde_x_mm256_abs_pd(a), simde_x_mm256_abs_pd(b), SIMDE_CMP_GE_OQ)); - break; - default: - break; - } - - switch (imm8 & 12) { - case 0: - r = simde_x_mm256_copysign_pd(r, a); - break; - case 8: - r = simde_mm256_andnot_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(-0.0)), r); - break; - case 12: - r = simde_mm256_or_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(-0.0)), r); - break; - default: - break; - } - - return r; -} -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_range_pd(a, b, imm8) _mm256_range_pd((a), (b), (imm8)) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm256_range_pd(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m256d_private \ - simde_mm256_range_pd_r_ = simde__m256d_to_private(simde_mm256_setzero_pd()), \ - simde_mm256_range_pd_a_ = simde__m256d_to_private(a), \ - simde_mm256_range_pd_b_ = simde__m256d_to_private(b); \ - \ - for (size_t simde_mm256_range_pd_i = 0 ; simde_mm256_range_pd_i < (sizeof(simde_mm256_range_pd_r_.m128d) / sizeof(simde_mm256_range_pd_r_.m128d[0])) ; simde_mm256_range_pd_i++) { \ - simde_mm256_range_pd_r_.m128d[simde_mm256_range_pd_i] = simde_mm_range_pd(simde_mm256_range_pd_a_.m128d[simde_mm256_range_pd_i], simde_mm256_range_pd_b_.m128d[simde_mm256_range_pd_i], imm8); \ - } \ - \ - simde__m256d_from_private(simde_mm256_range_pd_r_); \ - })) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_range_pd - 
#define _mm256_range_pd(a, b, imm8) simde_mm256_range_pd(a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_mask_range_pd(src, k, a, b, imm8) _mm256_mask_range_pd(src, k, a, b, imm8) -#else - #define simde_mm256_mask_range_pd(src, k, a, b, imm8) simde_mm256_mask_mov_pd(src, k, simde_mm256_range_pd(a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_range_pd - #define _mm256_mask_range_pd(src, k, a, b, imm8) simde_mm256_mask_range_pd(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_maskz_range_pd(k, a, b, imm8) _mm256_maskz_range_pd(k, a, b, imm8) -#else - #define simde_mm256_maskz_range_pd(k, a, b, imm8) simde_mm256_maskz_mov_pd(k, simde_mm256_range_pd(a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_range_pd - #define _mm256_maskz_range_pd(k, a, b, imm8) simde_mm256_maskz_range_pd(k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_range_pd (simde__m512d a, simde__m512d b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m512d r; - - switch (imm8 & 3) { - case 0: - r = simde_mm512_min_pd(a, b); - break; - case 1: - r = simde_mm512_max_pd(a, b); - break; - case 2: - r = simde_mm512_mask_mov_pd(b, simde_mm512_cmp_pd_mask(simde_mm512_abs_pd(a), simde_mm512_abs_pd(b), SIMDE_CMP_LE_OS), a); - break; - case 3: - r = simde_mm512_mask_mov_pd(a, simde_mm512_cmp_pd_mask(simde_mm512_abs_pd(b), simde_mm512_abs_pd(a), SIMDE_CMP_GE_OS), b); - break; - default: - break; - } - - switch (imm8 & 12) { - case 0: - r = simde_x_mm512_copysign_pd(r, a); - break; - case 8: - r = simde_mm512_andnot_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C(-0.0)), r); - break; - case 12: - r = simde_mm512_or_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C(-0.0)), r); - break; - default: - break; - } - - return r; -} -#if defined(SIMDE_X86_AVX512DQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm512_range_pd(a, b, imm8) _mm512_range_pd((a), (b), (imm8)) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_range_pd(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512d_private \ - simde_mm512_range_pd_r_ = simde__m512d_to_private(simde_mm512_setzero_pd()), \ - simde_mm512_range_pd_a_ = simde__m512d_to_private(a), \ - simde_mm512_range_pd_b_ = simde__m512d_to_private(b); \ - \ - for (size_t simde_mm512_range_pd_i = 0 ; simde_mm512_range_pd_i < (sizeof(simde_mm512_range_pd_r_.m128d) / sizeof(simde_mm512_range_pd_r_.m128d[0])) ; simde_mm512_range_pd_i++) { \ - simde_mm512_range_pd_r_.m128d[simde_mm512_range_pd_i] = simde_mm_range_pd(simde_mm512_range_pd_a_.m128d[simde_mm512_range_pd_i], simde_mm512_range_pd_b_.m128d[simde_mm512_range_pd_i], imm8); \ - } \ - \ - simde__m512d_from_private(simde_mm512_range_pd_r_); \ - })) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_range_pd(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512d_private \ - simde_mm512_range_pd_r_ = simde__m512d_to_private(simde_mm512_setzero_pd()), \ - simde_mm512_range_pd_a_ = simde__m512d_to_private(a), \ - simde_mm512_range_pd_b_ = simde__m512d_to_private(b); \ - \ - for (size_t simde_mm512_range_pd_i = 0 ; simde_mm512_range_pd_i < (sizeof(simde_mm512_range_pd_r_.m256d) / 
sizeof(simde_mm512_range_pd_r_.m256d[0])) ; simde_mm512_range_pd_i++) { \ - simde_mm512_range_pd_r_.m256d[simde_mm512_range_pd_i] = simde_mm256_range_pd(simde_mm512_range_pd_a_.m256d[simde_mm512_range_pd_i], simde_mm512_range_pd_b_.m256d[simde_mm512_range_pd_i], imm8); \ - } \ - \ - simde__m512d_from_private(simde_mm512_range_pd_r_); \ - })) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_range_pd - #define _mm512_range_pd(a, b, imm8) simde_mm512_range_pd(a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_mask_range_pd(src, k, a, b, imm8) _mm512_mask_range_pd(src, k, a, b, imm8) -#else - #define simde_mm512_mask_range_pd(src, k, a, b, imm8) simde_mm512_mask_mov_pd(src, k, simde_mm512_range_pd(a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_range_pd - #define _mm512_mask_range_pd(src, k, a, b, imm8) simde_mm512_mask_range_pd(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_maskz_range_pd(k, a, b, imm8) _mm512_maskz_range_pd(k, a, b, imm8) -#else - #define simde_mm512_maskz_range_pd(k, a, b, imm8) simde_mm512_maskz_mov_pd(k, simde_mm512_range_pd(a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_range_pd - #define _mm512_maskz_range_pd(k, a, b, imm8) simde_mm512_maskz_range_pd(k, a, b, imm8) -#endif - -#if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - #define simde_x_mm_range_ss(a, b, imm8) simde_mm_move_ss(a, simde_mm_range_ps(a, b, imm8)) -#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - #define simde_x_mm_range_ss(a, b, imm8) simde_mm_move_ss(a, simde_mm_range_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b), imm8)) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128 - simde_x_mm_range_ss (simde__m128 a, simde__m128 b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m128_private - r_ = simde__m128_to_private(a), - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - simde_float32 abs_a = simde_uint32_as_float32(a_.u32[0] & UINT32_C(2147483647)); - simde_float32 abs_b = simde_uint32_as_float32(b_.u32[0] & UINT32_C(2147483647)); - - switch (imm8 & 3) { - case 0: - r_ = simde__m128_to_private(simde_mm_min_ss(a, b)); - break; - case 1: - r_ = simde__m128_to_private(simde_mm_max_ss(a, b)); - break; - case 2: - r_.f32[0] = abs_a <= abs_b ? a_.f32[0] : b_.f32[0]; - break; - case 3: - r_.f32[0] = abs_b >= abs_a ? 
b_.f32[0] : a_.f32[0]; - break; - default: - break; - } - - switch (imm8 & 12) { - case 0: - r_.f32[0] = simde_uint32_as_float32((a_.u32[0] & UINT32_C(2147483648)) ^ (r_.u32[0] & UINT32_C(2147483647))); - break; - case 8: - r_.f32[0] = simde_uint32_as_float32(r_.u32[0] & UINT32_C(2147483647)); - break; - case 12: - r_.f32[0] = simde_uint32_as_float32(r_.u32[0] | UINT32_C(2147483648)); - break; - default: - break; - } - - return simde__m128_from_private(r_); - } -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm_mask_range_ss(src, k, a, b, imm8) _mm_mask_range_ss(src, k, a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm_mask_range_ss(src, k, a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128_private \ - simde_mm_mask_range_ss_r_ = simde__m128_to_private(a), \ - simde_mm_mask_range_ss_src_ = simde__m128_to_private(src); \ - \ - if (k & 1) \ - simde_mm_mask_range_ss_r_ = simde__m128_to_private(simde_x_mm_range_ss(a, b, imm8)); \ - else \ - simde_mm_mask_range_ss_r_.f32[0] = simde_mm_mask_range_ss_src_.f32[0]; \ - \ - simde__m128_from_private(simde_mm_mask_range_ss_r_); \ - })) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128 - simde_mm_mask_range_ss (simde__m128 src, simde__mmask8 k, simde__m128 a, simde__m128 b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m128_private - r_ = simde__m128_to_private(a), - src_ = simde__m128_to_private(src); - - if (k & 1) - r_ = simde__m128_to_private(simde_x_mm_range_ss(a, b, imm8)); - else - r_.f32[0] = src_.f32[0]; - - return simde__m128_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_range_ss - #define _mm_mask_range_ss(src, k, a, b, imm8) simde_mm_mask_range_ss(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm_maskz_range_ss(k, a, b, imm8) _mm_maskz_range_ss(k, a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm_maskz_range_ss(k, a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128_private simde_mm_maskz_range_ss_r_ = simde__m128_to_private(a); \ - \ - if (k & 1) \ - simde_mm_maskz_range_ss_r_ = simde__m128_to_private(simde_x_mm_range_ss(a, b, imm8)); \ - else \ - simde_mm_maskz_range_ss_r_.f32[0] = SIMDE_FLOAT32_C(0.0); \ - \ - simde__m128_from_private(simde_mm_maskz_range_ss_r_); \ - })) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128 - simde_mm_maskz_range_ss (simde__mmask8 k, simde__m128 a, simde__m128 b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m128_private r_ = simde__m128_to_private(a); - - if (k & 1) - r_ = simde__m128_to_private(simde_x_mm_range_ss(a, b, imm8)); - else - r_.f32[0] = SIMDE_FLOAT32_C(0.0); - - return simde__m128_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_range_ss - #define _mm_maskz_range_ss(k, a, b, imm8) simde_mm_maskz_range_ss(k, a, b, imm8) -#endif - -#if (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - #define simde_x_mm_range_sd(a, b, imm8) simde_mm_move_sd(a, simde_mm_range_pd(a, b, imm8)) -#elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - #define simde_x_mm_range_sd(a, b, imm8) simde_mm_move_sd(a, simde_mm_range_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b), imm8)) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128d - simde_x_mm_range_sd (simde__m128d a, simde__m128d b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m128d_private - r_ = simde__m128d_to_private(a), - a_ = simde__m128d_to_private(a), - b_ = 
simde__m128d_to_private(b); - simde_float64 abs_a = simde_uint64_as_float64(a_.u64[0] & UINT64_C(9223372036854775807)); - simde_float64 abs_b = simde_uint64_as_float64(b_.u64[0] & UINT64_C(9223372036854775807)); - - switch (imm8 & 3) { - case 0: - r_ = simde__m128d_to_private(simde_mm_min_sd(a, b)); - break; - case 1: - r_ = simde__m128d_to_private(simde_mm_max_sd(a, b)); - break; - case 2: - r_.f64[0] = abs_a <= abs_b ? a_.f64[0] : b_.f64[0]; - break; - case 3: - r_.f64[0] = abs_b >= abs_a ? b_.f64[0] : a_.f64[0]; - break; - default: - break; - } - - switch (imm8 & 12) { - case 0: - r_.f64[0] = simde_uint64_as_float64((a_.u64[0] & UINT64_C(9223372036854775808)) ^ (r_.u64[0] & UINT64_C(9223372036854775807))); - break; - case 8: - r_.f64[0] = simde_uint64_as_float64(r_.u64[0] & UINT64_C(9223372036854775807)); - break; - case 12: - r_.f64[0] = simde_uint64_as_float64(r_.u64[0] | UINT64_C(9223372036854775808)); - break; - default: - break; - } - - return simde__m128d_from_private(r_); - } -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm_mask_range_sd(src, k, a, b, imm8) _mm_mask_range_sd(src, k, a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm_mask_range_sd(src, k, a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128d_private \ - simde_mm_mask_range_sd_r_ = simde__m128d_to_private(a), \ - simde_mm_mask_range_sd_src_ = simde__m128d_to_private(src); \ - \ - if (k & 1) \ - simde_mm_mask_range_sd_r_ = simde__m128d_to_private(simde_x_mm_range_sd(a, b, imm8)); \ - else \ - simde_mm_mask_range_sd_r_.f64[0] = simde_mm_mask_range_sd_src_.f64[0]; \ - \ - simde__m128d_from_private(simde_mm_mask_range_sd_r_); \ - })) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128d - simde_mm_mask_range_sd (simde__m128d src, simde__mmask8 k, simde__m128d a, simde__m128d b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m128d_private - r_ = simde__m128d_to_private(a), - src_ = simde__m128d_to_private(src); - - if (k & 1) - r_ = simde__m128d_to_private(simde_x_mm_range_sd(a, b, imm8)); - else - r_.f64[0] = src_.f64[0]; - - return simde__m128d_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_range_sd - #define _mm_mask_range_sd(src, k, a, b, imm8) simde_mm_mask_range_sd(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm_maskz_range_sd(k, a, b, imm8) _mm_maskz_range_sd(k, a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm_maskz_range_sd(k, a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128d_private simde_mm_maskz_range_sd_r_ = simde__m128d_to_private(a); \ - \ - if (k & 1) \ - simde_mm_maskz_range_sd_r_ = simde__m128d_to_private(simde_x_mm_range_sd(a, b, imm8)); \ - else \ - simde_mm_maskz_range_sd_r_.f64[0] = SIMDE_FLOAT64_C(0.0); \ - \ - simde__m128d_from_private(simde_mm_maskz_range_sd_r_); \ - })) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128d - simde_mm_maskz_range_sd (simde__mmask8 k, simde__m128d a, simde__m128d b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m128d_private r_ = simde__m128d_to_private(a); - - if (k & 1) - r_ = simde__m128d_to_private(simde_x_mm_range_sd(a, b, imm8)); - else - r_.f64[0] = SIMDE_FLOAT64_C(0.0); - - return simde__m128d_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_range_sd - #define _mm_maskz_range_sd(k, a, b, imm8) simde_mm_maskz_range_sd(k, a, b, imm8) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* 
!defined(SIMDE_X86_AVX512_RANGE_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/range_round.h b/ffi-deps/simde/simde/x86/avx512/range_round.h deleted file mode 100644 index 7bf1320..0000000 --- a/ffi-deps/simde/simde/x86/avx512/range_round.h +++ /dev/null @@ -1,686 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_RANGE_ROUND_H) -#define SIMDE_X86_AVX512_RANGE_ROUND_H - -#include "types.h" -#include "range.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_range_round_ps(a, b, imm8, sae) _mm512_range_round_ps(a, b, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm512_range_round_ps(a, b, imm8, sae) simde_mm512_range_ps(a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm512_range_round_ps(a, b, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512 simde_mm512_range_round_ps_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm512_range_round_ps_envp; \ - int simde_mm512_range_round_ps_x = feholdexcept(&simde_mm512_range_round_ps_envp); \ - simde_mm512_range_round_ps_r = simde_mm512_range_ps(a, b, imm8); \ - if (HEDLEY_LIKELY(simde_mm512_range_round_ps_x == 0)) \ - fesetenv(&simde_mm512_range_round_ps_envp); \ - } \ - else { \ - simde_mm512_range_round_ps_r = simde_mm512_range_ps(a, b, imm8); \ - } \ - \ - simde_mm512_range_round_ps_r; \ - })) - #else - #define simde_mm512_range_round_ps(a, b, imm8, sae) simde_mm512_range_ps(a, b, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512 - simde_mm512_range_round_ps (simde__m512 a, simde__m512 b, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m512 r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm512_range_ps(a, b, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm512_range_ps(a, b, imm8); - #endif - } - else { - r = simde_mm512_range_ps(a, b, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_range_round_ps - #define _mm512_range_round_ps(a, b, imm8, sae) simde_mm512_range_round_ps(a, b, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_mask_range_round_ps(src, k, a, b, imm8, sae) _mm512_mask_range_round_ps(src, k, a, b, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm512_mask_range_round_ps(src, k, a, b, imm8, sae) simde_mm512_mask_range_ps(src, k, a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm512_mask_range_round_ps(src, k, a, b, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512 simde_mm512_mask_range_round_ps_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm512_mask_range_round_ps_envp; \ - int simde_mm512_mask_range_round_ps_x = feholdexcept(&simde_mm512_mask_range_round_ps_envp); \ - simde_mm512_mask_range_round_ps_r = simde_mm512_mask_range_ps(src, k, a, b, imm8); \ - if (HEDLEY_LIKELY(simde_mm512_mask_range_round_ps_x == 0)) \ - fesetenv(&simde_mm512_mask_range_round_ps_envp); \ - } \ - else { \ - simde_mm512_mask_range_round_ps_r = simde_mm512_mask_range_ps(src, k, a, b, imm8); \ - } \ - \ - simde_mm512_mask_range_round_ps_r; \ - })) - #else - #define simde_mm512_mask_range_round_ps(src, k, a, b, imm8, sae) simde_mm512_mask_range_ps(src, k, a, b, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512 - 
simde_mm512_mask_range_round_ps (simde__m512 src, simde__mmask16 k, simde__m512 a, simde__m512 b, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m512 r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm512_mask_range_ps(src, k, a, b, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm512_mask_range_ps(src, k, a, b, imm8); - #endif - } - else { - r = simde_mm512_mask_range_ps(src, k, a, b, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_range_round_ps - #define _mm512_mask_range_round_ps(src, k, a, b, imm8, sae) simde_mm512_mask_range_round_ps(src, k, a, b, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_maskz_range_round_ps(k, a, b, imm8, sae) _mm512_maskz_range_round_ps(k, a, b, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm512_maskz_range_round_ps(k, a, b, imm8, sae) simde_mm512_maskz_range_ps(k, a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm512_maskz_range_round_ps(k, a, b, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512 simde_mm512_maskz_range_round_ps_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm512_maskz_range_round_ps_envp; \ - int simde_mm512_maskz_range_round_ps_x = feholdexcept(&simde_mm512_maskz_range_round_ps_envp); \ - simde_mm512_maskz_range_round_ps_r = simde_mm512_maskz_range_ps(k, a, b, imm8); \ - if (HEDLEY_LIKELY(simde_mm512_maskz_range_round_ps_x == 0)) \ - fesetenv(&simde_mm512_maskz_range_round_ps_envp); \ - } \ - else { \ - simde_mm512_maskz_range_round_ps_r = simde_mm512_maskz_range_ps(k, a, b, imm8); \ - } \ - \ - simde_mm512_maskz_range_round_ps_r; \ - })) - #else - #define simde_mm512_maskz_range_round_ps(k, a, b, imm8, sae) simde_mm512_maskz_range_ps(k, a, b, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512 - simde_mm512_maskz_range_round_ps (simde__mmask16 k, simde__m512 a, simde__m512 b, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m512 r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm512_maskz_range_ps(k, a, b, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm512_maskz_range_ps(k, a, b, imm8); - #endif - } - else { - r = simde_mm512_maskz_range_ps(k, a, b, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_range_round_ps - #define _mm512_maskz_range_round_ps(k, a, b, imm8, sae) simde_mm512_maskz_range_round_ps(k, a, b, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_range_round_pd(a, b, imm8, sae) _mm512_range_round_pd(a, b, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm512_range_round_pd(a, b, imm8, sae) simde_mm512_range_pd(a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm512_range_round_pd(a, b, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512d simde_mm512_range_round_pd_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm512_range_round_pd_envp; \ - int simde_mm512_range_round_pd_x = feholdexcept(&simde_mm512_range_round_pd_envp); \ - simde_mm512_range_round_pd_r = simde_mm512_range_pd(a, b, imm8); \ - if 
(HEDLEY_LIKELY(simde_mm512_range_round_pd_x == 0)) \ - fesetenv(&simde_mm512_range_round_pd_envp); \ - } \ - else { \ - simde_mm512_range_round_pd_r = simde_mm512_range_pd(a, b, imm8); \ - } \ - \ - simde_mm512_range_round_pd_r; \ - })) - #else - #define simde_mm512_range_round_pd(a, b, imm8, sae) simde_mm512_range_pd(a, b, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512d - simde_mm512_range_round_pd (simde__m512d a, simde__m512d b, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m512d r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm512_range_pd(a, b, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm512_range_pd(a, b, imm8); - #endif - } - else { - r = simde_mm512_range_pd(a, b, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_range_round_pd - #define _mm512_range_round_pd(a, b, imm8, sae) simde_mm512_range_round_pd(a, b, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_mask_range_round_pd(src, k, a, b, imm8, sae) _mm512_mask_range_round_pd(src, k, a, b, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm512_mask_range_round_pd(src, k, a, b, imm8, sae) simde_mm512_mask_range_pd(src, k, a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm512_mask_range_round_pd(src, k, a, b, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512d simde_mm512_mask_range_round_pd_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm512_mask_range_round_pd_envp; \ - int simde_mm512_mask_range_round_pd_x = feholdexcept(&simde_mm512_mask_range_round_pd_envp); \ - simde_mm512_mask_range_round_pd_r = simde_mm512_mask_range_pd(src, k, a, b, imm8); \ - if (HEDLEY_LIKELY(simde_mm512_mask_range_round_pd_x == 0)) \ - fesetenv(&simde_mm512_mask_range_round_pd_envp); \ - } \ - else { \ - simde_mm512_mask_range_round_pd_r = simde_mm512_mask_range_pd(src, k, a, b, imm8); \ - } \ - \ - simde_mm512_mask_range_round_pd_r; \ - })) - #else - #define simde_mm512_mask_range_round_pd(src, k, a, b, imm8, sae) simde_mm512_mask_range_pd(src, k, a, b, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512d - simde_mm512_mask_range_round_pd (simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m512d b, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m512d r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm512_mask_range_pd(src, k, a, b, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm512_mask_range_pd(src, k, a, b, imm8); - #endif - } - else { - r = simde_mm512_mask_range_pd(src, k, a, b, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_range_round_pd - #define _mm512_mask_range_round_pd(src, k, a, b, imm8, sae) simde_mm512_mask_range_round_pd(src, k, a, b, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm512_maskz_range_round_pd(k, a, b, imm8, sae) _mm512_maskz_range_round_pd(k, a, b, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm512_maskz_range_round_pd(k, a, b, imm8, sae) simde_mm512_maskz_range_pd(k, a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - 
#define simde_mm512_maskz_range_round_pd(k, a, b, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512d simde_mm512_maskz_range_round_pd_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm512_maskz_range_round_pd_envp; \ - int simde_mm512_maskz_range_round_pd_x = feholdexcept(&simde_mm512_maskz_range_round_pd_envp); \ - simde_mm512_maskz_range_round_pd_r = simde_mm512_maskz_range_pd(k, a, b, imm8); \ - if (HEDLEY_LIKELY(simde_mm512_maskz_range_round_pd_x == 0)) \ - fesetenv(&simde_mm512_maskz_range_round_pd_envp); \ - } \ - else { \ - simde_mm512_maskz_range_round_pd_r = simde_mm512_maskz_range_pd(k, a, b, imm8); \ - } \ - \ - simde_mm512_maskz_range_round_pd_r; \ - })) - #else - #define simde_mm512_maskz_range_round_pd(k, a, b, imm8, sae) simde_mm512_maskz_range_pd(k, a, b, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512d - simde_mm512_maskz_range_round_pd (simde__mmask8 k, simde__m512d a, simde__m512d b, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m512d r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm512_maskz_range_pd(k, a, b, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm512_maskz_range_pd(k, a, b, imm8); - #endif - } - else { - r = simde_mm512_maskz_range_pd(k, a, b, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_range_round_pd - #define _mm512_maskz_range_round_pd(k, a, b, imm8, sae) simde_mm512_maskz_range_round_pd(k, a, b, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm_range_round_ss(a, b, imm8, sae) _mm_range_round_ss(a, b, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_range_round_ss(a, b, imm8, sae) simde_x_mm_range_ss(a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm_range_round_ss(a, b, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128 simde_mm_range_round_ss_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm_range_round_ss_envp; \ - int simde_mm_range_round_ss_x = feholdexcept(&simde_mm_range_round_ss_envp); \ - simde_mm_range_round_ss_r = simde_x_mm_range_ss(a, b, imm8); \ - if (HEDLEY_LIKELY(simde_mm_range_round_ss_x == 0)) \ - fesetenv(&simde_mm_range_round_ss_envp); \ - } \ - else { \ - simde_mm_range_round_ss_r = simde_x_mm_range_ss(a, b, imm8); \ - } \ - \ - simde_mm_range_round_ss_r; \ - })) - #else - #define simde_mm_range_round_ss(a, b, imm8, sae) simde_x_mm_range_ss(a, b, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128 - simde_mm_range_round_ss (simde__m128 a, simde__m128 b, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m128 r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_x_mm_range_ss(a, b, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_x_mm_range_ss(a, b, imm8); - #endif - } - else { - r = simde_x_mm_range_ss(a, b, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm_range_round_ss - #define _mm_range_round_ss(a, b, imm8, sae) simde_mm_range_round_ss(a, b, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm_mask_range_round_ss(src, k, a, b, imm8, sae) _mm_mask_range_round_ss(src, k, a, b, imm8, sae) -#elif 
defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_mask_range_round_ss(src, k, a, b, imm8, sae) simde_mm_mask_range_ss(src, k, a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm_mask_range_round_ss(src, k, a, b, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128 simde_mm_mask_range_round_ss_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm_mask_range_round_ss_envp; \ - int simde_mm_mask_range_round_ss_x = feholdexcept(&simde_mm_mask_range_round_ss_envp); \ - simde_mm_mask_range_round_ss_r = simde_mm_mask_range_ss(src, k, a, b, imm8); \ - if (HEDLEY_LIKELY(simde_mm_mask_range_round_ss_x == 0)) \ - fesetenv(&simde_mm_mask_range_round_ss_envp); \ - } \ - else { \ - simde_mm_mask_range_round_ss_r = simde_mm_mask_range_ss(src, k, a, b, imm8); \ - } \ - \ - simde_mm_mask_range_round_ss_r; \ - })) - #else - #define simde_mm_mask_range_round_ss(src, k, a, b, imm8, sae) simde_mm_mask_range_ss(src, k, a, b, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128 - simde_mm_mask_range_round_ss (simde__m128 src, simde__mmask8 k, simde__m128 a, simde__m128 b, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m128 r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm_mask_range_ss(src, k, a, b, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm_mask_range_ss(src, k, a, b, imm8); - #endif - } - else { - r = simde_mm_mask_range_ss(src, k, a, b, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_range_round_ss - #define _mm_mask_range_round_ss(src, k, a, b, imm8, sae) simde_mm_mask_range_round_ss(src, k, a, b, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm_maskz_range_round_ss(k, a, b, imm8, sae) _mm_maskz_range_round_ss(k, a, b, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_maskz_range_round_ss(k, a, b, imm8, sae) simde_mm_maskz_range_ss(k, a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm_maskz_range_round_ss(k, a, b, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128 simde_mm_maskz_range_round_ss_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm_maskz_range_round_ss_envp; \ - int simde_mm_maskz_range_round_ss_x = feholdexcept(&simde_mm_maskz_range_round_ss_envp); \ - simde_mm_maskz_range_round_ss_r = simde_mm_maskz_range_ss(k, a, b, imm8); \ - if (HEDLEY_LIKELY(simde_mm_maskz_range_round_ss_x == 0)) \ - fesetenv(&simde_mm_maskz_range_round_ss_envp); \ - } \ - else { \ - simde_mm_maskz_range_round_ss_r = simde_mm_maskz_range_ss(k, a, b, imm8); \ - } \ - \ - simde_mm_maskz_range_round_ss_r; \ - })) - #else - #define simde_mm_maskz_range_round_ss(k, a, b, imm8, sae) simde_mm_maskz_range_ss(k, a, b, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128 - simde_mm_maskz_range_round_ss (simde__mmask8 k, simde__m128 a, simde__m128 b, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m128 r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm_maskz_range_ss(k, a, b, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm_maskz_range_ss(k, a, b, imm8); - #endif - } - else { - r = simde_mm_maskz_range_ss(k, a, b, imm8); - } - - 
return r; - } -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_range_round_ss - #define _mm_maskz_range_round_ss(k, a, b, imm8, sae) simde_mm_maskz_range_round_ss(k, a, b, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm_range_round_sd(a, b, imm8, sae) _mm_range_round_sd(a, b, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_range_round_sd(a, b, imm8, sae) simde_x_mm_range_sd(a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm_range_round_sd(a, b, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128d simde_mm_range_round_sd_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm_range_round_sd_envp; \ - int simde_mm_range_round_sd_x = feholdexcept(&simde_mm_range_round_sd_envp); \ - simde_mm_range_round_sd_r = simde_x_mm_range_sd(a, b, imm8); \ - if (HEDLEY_LIKELY(simde_mm_range_round_sd_x == 0)) \ - fesetenv(&simde_mm_range_round_sd_envp); \ - } \ - else { \ - simde_mm_range_round_sd_r = simde_x_mm_range_sd(a, b, imm8); \ - } \ - \ - simde_mm_range_round_sd_r; \ - })) - #else - #define simde_mm_range_round_sd(a, b, imm8, sae) simde_x_mm_range_sd(a, b, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128d - simde_mm_range_round_sd (simde__m128d a, simde__m128d b, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m128d r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_x_mm_range_sd(a, b, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_x_mm_range_sd(a, b, imm8); - #endif - } - else { - r = simde_x_mm_range_sd(a, b, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm_range_round_sd - #define _mm_range_round_sd(a, b, imm8, sae) simde_mm_range_round_sd(a, b, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm_mask_range_round_sd(src, k, a, b, imm8, sae) _mm_mask_range_round_sd(src, k, a, b, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_mask_range_round_sd(src, k, a, b, imm8, sae) simde_mm_mask_range_sd(src, k, a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm_mask_range_round_sd(src, k, a, b, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128d simde_mm_mask_range_round_sd_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm_mask_range_round_sd_envp; \ - int simde_mm_mask_range_round_sd_x = feholdexcept(&simde_mm_mask_range_round_sd_envp); \ - simde_mm_mask_range_round_sd_r = simde_mm_mask_range_sd(src, k, a, b, imm8); \ - if (HEDLEY_LIKELY(simde_mm_mask_range_round_sd_x == 0)) \ - fesetenv(&simde_mm_mask_range_round_sd_envp); \ - } \ - else { \ - simde_mm_mask_range_round_sd_r = simde_mm_mask_range_sd(src, k, a, b, imm8); \ - } \ - \ - simde_mm_mask_range_round_sd_r; \ - })) - #else - #define simde_mm_mask_range_round_sd(src, k, a, b, imm8, sae) simde_mm_mask_range_sd(src, k, a, b, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128d - simde_mm_mask_range_round_sd (simde__m128d src, simde__mmask8 k, simde__m128d a, simde__m128d b, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m128d r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm_mask_range_sd(src, k, a, b, 
imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm_mask_range_sd(src, k, a, b, imm8); - #endif - } - else { - r = simde_mm_mask_range_sd(src, k, a, b, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_range_round_sd - #define _mm_mask_range_round_sd(src, k, a, b, imm8, sae) simde_mm_mask_range_round_sd(src, k, a, b, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512DQ_NATIVE) - #define simde_mm_maskz_range_round_sd(k, a, b, imm8, sae) _mm_maskz_range_round_sd(k, a, b, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_maskz_range_round_sd(k, a, b, imm8, sae) simde_mm_maskz_range_sd(k, a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm_maskz_range_round_sd(k, a, b, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128d simde_mm_maskz_range_round_sd_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm_maskz_range_round_sd_envp; \ - int simde_mm_maskz_range_round_sd_x = feholdexcept(&simde_mm_maskz_range_round_sd_envp); \ - simde_mm_maskz_range_round_sd_r = simde_mm_maskz_range_sd(k, a, b, imm8); \ - if (HEDLEY_LIKELY(simde_mm_maskz_range_round_sd_x == 0)) \ - fesetenv(&simde_mm_maskz_range_round_sd_envp); \ - } \ - else { \ - simde_mm_maskz_range_round_sd_r = simde_mm_maskz_range_sd(k, a, b, imm8); \ - } \ - \ - simde_mm_maskz_range_round_sd_r; \ - })) - #else - #define simde_mm_maskz_range_round_sd(k, a, b, imm8, sae) simde_mm_maskz_range_sd(k, a, b, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128d - simde_mm_maskz_range_round_sd (simde__mmask8 k, simde__m128d a, simde__m128d b, int imm8, int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m128d r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm_maskz_range_sd(k, a, b, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm_maskz_range_sd(k, a, b, imm8); - #endif - } - else { - r = simde_mm_maskz_range_sd(k, a, b, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_range_round_sd - #define _mm_maskz_range_round_sd(k, a, b, imm8, sae) simde_mm_maskz_range_round_sd(k, a, b, imm8, sae) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_RANGE_ROUND_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/rcp.h b/ffi-deps/simde/simde/x86/avx512/rcp.h deleted file mode 100644 index b1b394c..0000000 --- a/ffi-deps/simde/simde/x86/avx512/rcp.h +++ /dev/null @@ -1,65 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2023 Michael R. Crusoe - */ - -#if !defined(SIMDE_X86_AVX512_RCP_H) -#define SIMDE_X86_AVX512_RCP_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -// TODO: "The maximum relative error for this approximation is less than 2^-14." -// vs 1.5*2^-12 for _mm{,256}_rcp_ps - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_rcp14_ps (simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_rcp14_ps(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = SIMDE_FLOAT32_C(1.0) / a_.f32[i]; - } - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_rcp14_ps - #define _mm512_rcp14_ps(a) simde_mm512_rcp14_ps(a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_RCP_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/reduce.h b/ffi-deps/simde/simde/x86/avx512/reduce.h deleted file mode 100644 index c007572..0000000 --- a/ffi-deps/simde/simde/x86/avx512/reduce.h +++ /dev/null @@ -1,355 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2023 Michael R. Crusoe - */ - -#if !defined(SIMDE_X86_AVX512_REDUCE_H) -#define SIMDE_X86_AVX512_REDUCE_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -#if defined(__clang__) && SIMDE_FLOAT16_API == SIMDE_FLOAT16_API_FP16 -SIMDE_DIAGNOSTIC_DISABLE_DOUBLE_PROMOTION_ -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde_float16 -simde_mm512_reduce_max_ph(simde__m512h a) { - #if defined(SIMDE_X86_AVX512FP16_NATIVE) - return _mm512_reduce_max_ph(a); - #else - simde__m512h_private a_; - simde_float16 r; - a_ = simde__m512h_to_private(a); - - r = SIMDE_NINFINITYHF; - #if defined(SIMDE_FLOAT16_VECTOR) - SIMDE_VECTORIZE_REDUCTION(max:r) - #endif - for (size_t i = 0 ; i < (sizeof(a_.f16) / sizeof(a_.f16[0])) ; i++) { - r = simde_float16_to_float32(a_.f16[i]) > simde_float16_to_float32(r) ? 
a_.f16[i] : r; - } - return r; - #endif -} -#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) -# define _mm512_reduce_max_ph(a) simde_mm512_reduce_max_ph((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde_float16 -simde_mm512_reduce_min_ph(simde__m512h a) { - #if defined(SIMDE_X86_AVX512FP16_NATIVE) - return _mm512_reduce_min_ph(a); - #else - simde__m512h_private a_; - simde_float16 r; - a_ = simde__m512h_to_private(a); - - r = SIMDE_INFINITYHF; - #if defined(SIMDE_FLOAT16_VECTOR) - SIMDE_VECTORIZE_REDUCTION(min:r) - #endif - for (size_t i = 0 ; i < (sizeof(a_.f16) / sizeof(a_.f16[0])) ; i++) { - r = simde_float16_to_float32(a_.f16[i]) < simde_float16_to_float32(r) ? a_.f16[i] : r; - } - return r; - #endif -} -#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) -# define _mm512_reduce_min_ph(a) simde_mm512_reduce_min_ph((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm512_reduce_max_epi32(simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_reduce_max_epi32(a); - #else - simde__m512i_private a_; - int32_t r; - a_ = simde__m512i_to_private(a); - - r = -INT32_MAX; - SIMDE_VECTORIZE_REDUCTION(max:r) - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r = a_.i32[i] > r ? a_.i32[i] : r; - } - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) -# define _mm512_reduce_max_epi32(a) simde_mm512_reduce_max_epi32((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int64_t -simde_mm512_reduce_max_epi64(simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_reduce_max_epi64(a); - #else - simde__m512i_private a_; - int64_t r; - a_ = simde__m512i_to_private(a); - - r = -INT64_MAX; - SIMDE_VECTORIZE_REDUCTION(max:r) - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r = a_.i64[i] > r ? a_.i64[i] : r; - } - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) -# define _mm512_reduce_max_epi64(a) simde_mm512_reduce_max_epi64((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -uint32_t -simde_mm512_reduce_max_epu32(simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_reduce_max_epu32(a); - #else - simde__m512i_private a_; - uint32_t r; - a_ = simde__m512i_to_private(a); - - r = 0; - SIMDE_VECTORIZE_REDUCTION(max:r) - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r = a_.u32[i] > r ? a_.u32[i] : r; - } - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) -# define _mm512_reduce_max_epu32(a) simde_mm512_reduce_max_epu32((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -uint64_t -simde_mm512_reduce_max_epu64(simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_reduce_max_epu64(a); - #else - simde__m512i_private a_; - uint64_t r; - a_ = simde__m512i_to_private(a); - - r = 0; - SIMDE_VECTORIZE_REDUCTION(max:r) - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r = a_.u64[i] > r ? a_.u64[i] : r; - } - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) -# define _mm512_reduce_max_epu64(a) simde_mm512_reduce_max_epu64((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde_float64 -simde_mm512_reduce_max_pd(simde__m512d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_reduce_max_pd(a); - #else - simde__m512d_private a_; - simde_float64 r; - a_ = simde__m512d_to_private(a); - - r = -SIMDE_MATH_INFINITY; - SIMDE_VECTORIZE_REDUCTION(max:r) - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - r = a_.f64[i] > r ? 
a_.f64[i] : r; - } - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) -# define _mm512_reduce_max_pd(a) simde_mm512_reduce_max_pd((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde_float32 -simde_mm512_reduce_max_ps(simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_reduce_max_ps(a); - #else - simde__m512_private a_; - simde_float32 r; - a_ = simde__m512_to_private(a); - - r = -SIMDE_MATH_INFINITYF; - SIMDE_VECTORIZE_REDUCTION(max:r) - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r = a_.f32[i] > r ? a_.f32[i] : r; - } - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) -# define _mm512_reduce_max_ps(a) simde_mm512_reduce_max_ps((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm512_reduce_min_epi32(simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_reduce_min_epi32(a); - #else - simde__m512i_private a_; - int32_t r; - a_ = simde__m512i_to_private(a); - - r = INT32_MAX; - SIMDE_VECTORIZE_REDUCTION(min:r) - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r = a_.i32[i] < r ? a_.i32[i] : r; - } - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) -# define _mm512_reduce_min_epi32(a) simde_mm512_reduce_min_epi32((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int64_t -simde_mm512_reduce_min_epi64(simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_reduce_min_epi64(a); - #else - simde__m512i_private a_; - int64_t r; - a_ = simde__m512i_to_private(a); - - r = INT64_MAX; - SIMDE_VECTORIZE_REDUCTION(min:r) - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r = a_.i64[i] < r ? a_.i64[i] : r; - } - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) -# define _mm512_reduce_min_epi64(a) simde_mm512_reduce_min_epi64((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -uint32_t -simde_mm512_reduce_min_epu32(simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_reduce_min_epu32(a); - #else - simde__m512i_private a_; - uint32_t r; - a_ = simde__m512i_to_private(a); - - r = UINT32_MAX; - SIMDE_VECTORIZE_REDUCTION(min:r) - for (size_t i = 0 ; i < (sizeof(a_.u32) / sizeof(a_.u32[0])) ; i++) { - r = a_.u32[i] < r ? a_.u32[i] : r; - } - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) -# define _mm512_reduce_min_epu32(a) simde_mm512_reduce_min_epu32((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -uint64_t -simde_mm512_reduce_min_epu64(simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_reduce_min_epu64(a); - #else - simde__m512i_private a_; - uint64_t r; - a_ = simde__m512i_to_private(a); - - r = UINT64_MAX; - SIMDE_VECTORIZE_REDUCTION(min:r) - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r = a_.u64[i] < r ? a_.u64[i] : r; - } - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) -# define _mm512_reduce_min_epu64(a) simde_mm512_reduce_min_epu64((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde_float64 -simde_mm512_reduce_min_pd(simde__m512d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_reduce_min_pd(a); - #else - simde__m512d_private a_; - simde_float64 r; - a_ = simde__m512d_to_private(a); - - r = SIMDE_MATH_INFINITY; - SIMDE_VECTORIZE_REDUCTION(min:r) - for (size_t i = 0 ; i < (sizeof(a_.f64) / sizeof(a_.f64[0])) ; i++) { - r = a_.f64[i] < r ? 
a_.f64[i] : r; - } - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) -# define _mm512_reduce_min_pd(a) simde_mm512_reduce_min_pd((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde_float32 -simde_mm512_reduce_min_ps(simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_reduce_min_ps(a); - #else - simde__m512_private a_; - simde_float32 r; - a_ = simde__m512_to_private(a); - - r = SIMDE_MATH_INFINITYF; - SIMDE_VECTORIZE_REDUCTION(min:r) - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r = a_.f32[i] < r ? a_.f32[i] : r; - } - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) -# define _mm512_reduce_min_ps(a) simde_mm512_reduce_min_ps((a)) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_REDUCE_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/rol.h b/ffi-deps/simde/simde/x86/avx512/rol.h deleted file mode 100644 index 5bdf98b..0000000 --- a/ffi-deps/simde/simde/x86/avx512/rol.h +++ /dev/null @@ -1,410 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_ROL_H) -#define SIMDE_X86_AVX512_ROL_H - -#include "types.h" -#include "mov.h" -#include "or.h" -#include "srli.h" -#include "slli.h" -#include "../avx2.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_rol_epi32(a, imm8) _mm_rol_epi32(a, imm8) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128i - simde_mm_rol_epi32 (simde__m128i a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE (imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_rl(a_.altivec_i32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8))); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - switch (imm8 & 31) { - case 0: - r_ = a_; - break; - default: - r_.u32 = (a_.u32 << (imm8 & 31)) | (a_.u32 >> (32 - (imm8 & 31))); - break; - } - #else - switch (imm8 & 31) { - case 0: - r_ = a_; - break; - default: - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] << (imm8 & 31)) | (a_.u32[i] >> (32 - (imm8 & 31))); - } - break; - } - #endif - - return simde__m128i_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_rol_epi32 - #define _mm_rol_epi32(a, imm8) simde_mm_rol_epi32(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_mask_rol_epi32(src, k, a, imm8) _mm_mask_rol_epi32(src, k, a, imm8) -#else - #define simde_mm_mask_rol_epi32(src, k, a, imm8) simde_mm_mask_mov_epi32(src, k, simde_mm_rol_epi32(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_rol_epi32 - #define _mm_mask_rol_epi32(src, k, a, imm8) simde_mm_mask_rol_epi32(src, k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_maskz_rol_epi32(k, a, imm8) _mm_maskz_rol_epi32(k, a, imm8) -#else - #define simde_mm_maskz_rol_epi32(k, a, imm8) simde_mm_maskz_mov_epi32(k, simde_mm_rol_epi32(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_rol_epi32 - #define _mm_maskz_rol_epi32(k, a, imm8) simde_mm_maskz_rol_epi32(k, a, imm8) -#endif 
- -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_rol_epi32(a, imm8) _mm256_rol_epi32(a, imm8) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m256i - simde_mm256_rol_epi32 (simde__m256i a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE (imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].altivec_i32 = vec_rl(a_.m128i_private[i].altivec_i32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8))); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - switch (imm8 & 31) { - case 0: - r_ = a_; - break; - default: - r_.u32 = (a_.u32 << (imm8 & 31)) | (a_.u32 >> (32 - (imm8 & 31))); - break; - } - #else - switch (imm8 & 31) { - case 0: - r_ = a_; - break; - default: - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] << (imm8 & 31)) | (a_.u32[i] >> (32 - (imm8 & 31))); - } - break; - } - #endif - - return simde__m256i_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_rol_epi32 - #define _mm256_rol_epi32(a, imm8) simde_mm256_rol_epi32(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_mask_rol_epi32(src, k, a, imm8) _mm256_mask_rol_epi32(src, k, a, imm8) -#else - #define simde_mm256_mask_rol_epi32(src, k, a, imm8) simde_mm256_mask_mov_epi32(src, k, simde_mm256_rol_epi32(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_rol_epi32 - #define _mm256_mask_rol_epi32(src, k, a, imm8) simde_mm256_mask_rol_epi32(src, k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_maskz_rol_epi32(k, a, imm8) _mm256_maskz_rol_epi32(k, a, imm8) -#else - #define simde_mm256_maskz_rol_epi32(k, a, imm8) simde_mm256_maskz_mov_epi32(k, simde_mm256_rol_epi32(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_rol_epi32 - #define _mm256_maskz_rol_epi32(k, a, imm8) simde_mm256_maskz_rol_epi32(k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_rol_epi32(a, imm8) _mm512_rol_epi32(a, imm8) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512i - simde_mm512_rol_epi32 (simde__m512i a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE (imm8, 0, 255) { - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].altivec_i32 = vec_rl(a_.m128i_private[i].altivec_i32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, imm8))); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - switch (imm8 & 31) { - case 0: - r_ = a_; - break; - default: - r_.u32 = (a_.u32 << (imm8 & 31)) | (a_.u32 >> (32 - (imm8 & 31))); - break; - } - #else - switch (imm8 & 31) { - case 0: - r_ = a_; - break; - default: - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] << (imm8 & 31)) | (a_.u32[i] >> (32 - (imm8 & 31))); - } - break; - } - #endif - - return simde__m512i_from_private(r_); - } -#endif -#if 
defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_rol_epi32 - #define _mm512_rol_epi32(a, imm8) simde_mm512_rol_epi32(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_rol_epi32(src, k, a, imm8) _mm512_mask_rol_epi32(src, k, a, imm8) -#else - #define simde_mm512_mask_rol_epi32(src, k, a, imm8) simde_mm512_mask_mov_epi32(src, k, simde_mm512_rol_epi32(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_rol_epi32 - #define _mm512_mask_rol_epi32(src, k, a, imm8) simde_mm512_mask_rol_epi32(src, k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_maskz_rol_epi32(k, a, imm8) _mm512_maskz_rol_epi32(k, a, imm8) -#else - #define simde_mm512_maskz_rol_epi32(k, a, imm8) simde_mm512_maskz_mov_epi32(k, simde_mm512_rol_epi32(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_rol_epi32 - #define _mm512_maskz_rol_epi32(k, a, imm8) simde_mm512_maskz_rol_epi32(k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_rol_epi64(a, imm8) _mm_rol_epi64(a, imm8) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128i - simde_mm_rol_epi64 (simde__m128i a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE (imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_i64 = vec_rl(a_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, imm8))); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - switch (imm8 & 63) { - case 0: - r_ = a_; - break; - default: - r_.u64 = (a_.u64 << (imm8 & 63)) | (a_.u64 >> (64 - (imm8 & 63))); - break; - } - #else - switch (imm8 & 63) { - case 0: - r_ = a_; - break; - default: - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] << (imm8 & 63)) | (a_.u64[i] >> (64 - (imm8 & 63))); - } - break; - } - #endif - - return simde__m128i_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_rol_epi64 - #define _mm_rol_epi64(a, imm8) simde_mm_rol_epi64(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_mask_rol_epi64(src, k, a, imm8) _mm_mask_rol_epi64(src, k, a, imm8) -#else - #define simde_mm_mask_rol_epi64(src, k, a, imm8) simde_mm_mask_mov_epi64(src, k, simde_mm_rol_epi64(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_rol_epi64 - #define _mm_mask_rol_epi64(src, k, a, imm8) simde_mm_mask_rol_epi64(src, k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_maskz_rol_epi64(k, a, imm8) _mm_maskz_rol_epi64(k, a, imm8) -#else - #define simde_mm_maskz_rol_epi64(k, a, imm8) simde_mm_maskz_mov_epi64(k, simde_mm_rol_epi64(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_rol_epi64 - #define _mm_maskz_rol_epi64(k, a, imm8) simde_mm_maskz_rol_epi64(k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_rol_epi64(a, imm8) _mm256_rol_epi64(a, imm8) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m256i - simde_mm256_rol_epi64 (simde__m256i a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE 
(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].altivec_i64 = vec_rl(a_.m128i_private[i].altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, imm8))); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - switch (imm8 & 63) { - case 0: - r_ = a_; - break; - default: - r_.u64 = (a_.u64 << (imm8 & 63)) | (a_.u64 >> (64 - (imm8 & 63))); - break; - } - #else - switch (imm8 & 63) { - case 0: - r_ = a_; - break; - default: - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] << (imm8 & 63)) | (a_.u64[i] >> (64 - (imm8 & 63))); - } - break; - } - #endif - - return simde__m256i_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_rol_epi64 - #define _mm256_rol_epi64(a, imm8) simde_mm256_rol_epi64(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_mask_rol_epi64(src, k, a, imm8) _mm256_mask_rol_epi64(src, k, a, imm8) -#else - #define simde_mm256_mask_rol_epi64(src, k, a, imm8) simde_mm256_mask_mov_epi64(src, k, simde_mm256_rol_epi64(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_rol_epi64 - #define _mm256_mask_rol_epi64(src, k, a, imm8) simde_mm256_mask_rol_epi64(src, k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_maskz_rol_epi64(k, a, imm8) _mm256_maskz_rol_epi64(k, a, imm8) -#else - #define simde_mm256_maskz_rol_epi64(k, a, imm8) simde_mm256_maskz_mov_epi64(k, simde_mm256_rol_epi64(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_rol_epi64 - #define _mm256_maskz_rol_epi64(k, a, imm8) simde_mm256_maskz_rol_epi64(k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_rol_epi64(a, imm8) _mm512_rol_epi64(a, imm8) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512i - simde_mm512_rol_epi64 (simde__m512i a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE (imm8, 0, 255) { - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].altivec_i64 = vec_rl(a_.m128i_private[i].altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, imm8))); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - switch (imm8 & 63) { - case 0: - r_ = a_; - break; - default: - r_.u64 = (a_.u64 << (imm8 & 63)) | (a_.u64 >> (64 - (imm8 & 63))); - break; - } - #else - switch (imm8 & 63) { - case 0: - r_ = a_; - break; - default: - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] << (imm8 & 63)) | (a_.u64[i] >> (64 - (imm8 & 63))); - } - break; - } - #endif - - return simde__m512i_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_rol_epi64 - #define _mm512_rol_epi64(a, imm8) simde_mm512_rol_epi64(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_rol_epi64(src, k, a, imm8) _mm512_mask_rol_epi64(src, k, a, imm8) 
-#else - #define simde_mm512_mask_rol_epi64(src, k, a, imm8) simde_mm512_mask_mov_epi64(src, k, simde_mm512_rol_epi64(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_rol_epi64 - #define _mm512_mask_rol_epi64(src, k, a, imm8) simde_mm512_mask_rol_epi64(src, k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_maskz_rol_epi64(k, a, imm8) _mm512_maskz_rol_epi64(k, a, imm8) -#else - #define simde_mm512_maskz_rol_epi64(k, a, imm8) simde_mm512_maskz_mov_epi64(k, simde_mm512_rol_epi64(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_rol_epi64 - #define _mm512_maskz_rol_epi64(k, a, imm8) simde_mm512_maskz_rol_epi64(k, a, imm8) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_ROL_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/rolv.h b/ffi-deps/simde/simde/x86/avx512/rolv.h deleted file mode 100644 index a14442f..0000000 --- a/ffi-deps/simde/simde/x86/avx512/rolv.h +++ /dev/null @@ -1,415 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_ROLV_H) -#define SIMDE_X86_AVX512_ROLV_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" -#include "srlv.h" -#include "sllv.h" -#include "or.h" -#include "and.h" -#include "sub.h" -#include "set1.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_rolv_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_rolv_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_u32 = vec_rl(a_.altivec_u32, b_.altivec_u32); - - return simde__m128i_from_private(r_); - #else - HEDLEY_STATIC_CAST(void, r_); - HEDLEY_STATIC_CAST(void, a_); - HEDLEY_STATIC_CAST(void, b_); - - simde__m128i - count1 = simde_mm_and_si128(b, simde_mm_set1_epi32(31)), - count2 = simde_mm_sub_epi32(simde_mm_set1_epi32(32), count1); - - return simde_mm_or_si128(simde_mm_sllv_epi32(a, count1), simde_mm_srlv_epi32(a, count2)); - #endif - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_rolv_epi32 - #define _mm_rolv_epi32(a, b) simde_mm_rolv_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_rolv_epi32 (simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_rolv_epi32(src, k, a, b); - #else - return simde_mm_mask_mov_epi32(src, k, simde_mm_rolv_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_rolv_epi32 - #define _mm_mask_rolv_epi32(src, k, a, b) simde_mm_mask_rolv_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_rolv_epi32 (simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_rolv_epi32(k, a, b); - #else - return simde_mm_maskz_mov_epi32(k, simde_mm_rolv_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_rolv_epi32 - #define _mm_maskz_rolv_epi32(k, a, b) simde_mm_maskz_rolv_epi32(k, a, b) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_rolv_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_rolv_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].altivec_u32 = vec_rl(a_.m128i_private[i].altivec_u32, b_.m128i_private[i].altivec_u32); - } - - return simde__m256i_from_private(r_); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_rolv_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_rolv_epi32(a_.m128i[1], b_.m128i[1]); - - return simde__m256i_from_private(r_); - #else - HEDLEY_STATIC_CAST(void, r_); - HEDLEY_STATIC_CAST(void, a_); - HEDLEY_STATIC_CAST(void, b_); - - simde__m256i - count1 = simde_mm256_and_si256(b, simde_mm256_set1_epi32(31)), - count2 = simde_mm256_sub_epi32(simde_mm256_set1_epi32(32), count1); - - return simde_mm256_or_si256(simde_mm256_sllv_epi32(a, count1), simde_mm256_srlv_epi32(a, count2)); - #endif - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_rolv_epi32 - #define _mm256_rolv_epi32(a, b) simde_mm256_rolv_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_rolv_epi32 (simde__m256i src, simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_rolv_epi32(src, k, a, b); - #else - return simde_mm256_mask_mov_epi32(src, k, simde_mm256_rolv_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_rolv_epi32 - #define _mm256_mask_rolv_epi32(src, k, a, b) simde_mm256_mask_rolv_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_rolv_epi32 (simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_rolv_epi32(k, a, b); - #else - return simde_mm256_maskz_mov_epi32(k, simde_mm256_rolv_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_rolv_epi32 - #define _mm256_maskz_rolv_epi32(k, a, b) simde_mm256_maskz_rolv_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_rolv_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_rolv_epi32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].altivec_u32 = vec_rl(a_.m128i_private[i].altivec_u32, b_.m128i_private[i].altivec_u32); - } - - return simde__m512i_from_private(r_); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256i[0] = simde_mm256_rolv_epi32(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_rolv_epi32(a_.m256i[1], b_.m256i[1]); - - return simde__m512i_from_private(r_); - #else - HEDLEY_STATIC_CAST(void, r_); - HEDLEY_STATIC_CAST(void, a_); - HEDLEY_STATIC_CAST(void, b_); - - simde__m512i - count1 = simde_mm512_and_si512(b, simde_mm512_set1_epi32(31)), - 
count2 = simde_mm512_sub_epi32(simde_mm512_set1_epi32(32), count1); - - return simde_mm512_or_si512(simde_mm512_sllv_epi32(a, count1), simde_mm512_srlv_epi32(a, count2)); - #endif - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_rolv_epi32 - #define _mm512_rolv_epi32(a, b) simde_mm512_rolv_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_rolv_epi32 (simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_rolv_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_rolv_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_rolv_epi32 - #define _mm512_mask_rolv_epi32(src, k, a, b) simde_mm512_mask_rolv_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_rolv_epi32 (simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_rolv_epi32(k, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_rolv_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_rolv_epi32 - #define _mm512_maskz_rolv_epi32(k, a, b) simde_mm512_maskz_rolv_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_rolv_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_rolv_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_u64 = vec_rl(a_.altivec_u64, b_.altivec_u64); - - return simde__m128i_from_private(r_); - #else - HEDLEY_STATIC_CAST(void, r_); - HEDLEY_STATIC_CAST(void, a_); - HEDLEY_STATIC_CAST(void, b_); - - simde__m128i - count1 = simde_mm_and_si128(b, simde_mm_set1_epi64x(63)), - count2 = simde_mm_sub_epi64(simde_mm_set1_epi64x(64), count1); - - return simde_mm_or_si128(simde_mm_sllv_epi64(a, count1), simde_mm_srlv_epi64(a, count2)); - #endif - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_rolv_epi64 - #define _mm_rolv_epi64(a, b) simde_mm_rolv_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_rolv_epi64 (simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_rolv_epi64(src, k, a, b); - #else - return simde_mm_mask_mov_epi64(src, k, simde_mm_rolv_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_rolv_epi64 - #define _mm_mask_rolv_epi64(src, k, a, b) simde_mm_mask_rolv_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_rolv_epi64 (simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_rolv_epi64(k, a, b); - #else - return simde_mm_maskz_mov_epi64(k, simde_mm_rolv_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_rolv_epi64 - #define _mm_maskz_rolv_epi64(k, a, b) simde_mm_maskz_rolv_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_rolv_epi64 (simde__m256i a, 
simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_rolv_epi64(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].altivec_u64 = vec_rl(a_.m128i_private[i].altivec_u64, b_.m128i_private[i].altivec_u64); - } - - return simde__m256i_from_private(r_); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) - r_.m128i[0] = simde_mm_rolv_epi64(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_rolv_epi64(a_.m128i[1], b_.m128i[1]); - - return simde__m256i_from_private(r_); - #else - HEDLEY_STATIC_CAST(void, r_); - HEDLEY_STATIC_CAST(void, a_); - HEDLEY_STATIC_CAST(void, b_); - - simde__m256i - count1 = simde_mm256_and_si256(b, simde_mm256_set1_epi64x(63)), - count2 = simde_mm256_sub_epi64(simde_mm256_set1_epi64x(64), count1); - - return simde_mm256_or_si256(simde_mm256_sllv_epi64(a, count1), simde_mm256_srlv_epi64(a, count2)); - #endif - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_rolv_epi64 - #define _mm256_rolv_epi64(a, b) simde_mm256_rolv_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_rolv_epi64 (simde__m256i src, simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_rolv_epi64(src, k, a, b); - #else - return simde_mm256_mask_mov_epi64(src, k, simde_mm256_rolv_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_rolv_epi64 - #define _mm256_mask_rolv_epi64(src, k, a, b) simde_mm256_mask_rolv_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_rolv_epi64 (simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_rolv_epi64(k, a, b); - #else - return simde_mm256_maskz_mov_epi64(k, simde_mm256_rolv_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_rolv_epi64 - #define _mm256_maskz_rolv_epi64(k, a, b) simde_mm256_maskz_rolv_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_rolv_epi64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_rolv_epi64(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].altivec_u64 = vec_rl(a_.m128i_private[i].altivec_u64, b_.m128i_private[i].altivec_u64); - } - - return simde__m512i_from_private(r_); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256i[0] = simde_mm256_rolv_epi64(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_rolv_epi64(a_.m256i[1], b_.m256i[1]); - - return simde__m512i_from_private(r_); - #else - HEDLEY_STATIC_CAST(void, r_); - HEDLEY_STATIC_CAST(void, a_); - HEDLEY_STATIC_CAST(void, b_); - - simde__m512i - count1 = simde_mm512_and_si512(b, simde_mm512_set1_epi64(63)), - count2 = simde_mm512_sub_epi64(simde_mm512_set1_epi64(64), count1); - - return 
simde_mm512_or_si512(simde_mm512_sllv_epi64(a, count1), simde_mm512_srlv_epi64(a, count2)); - #endif - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_rolv_epi64 - #define _mm512_rolv_epi64(a, b) simde_mm512_rolv_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_rolv_epi64 (simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_rolv_epi64(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_rolv_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_rolv_epi64 - #define _mm512_mask_rolv_epi64(src, k, a, b) simde_mm512_mask_rolv_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_rolv_epi64 (simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_rolv_epi64(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_rolv_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_rolv_epi64 - #define _mm512_maskz_rolv_epi64(k, a, b) simde_mm512_maskz_rolv_epi64(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_ROLV_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/ror.h b/ffi-deps/simde/simde/x86/avx512/ror.h deleted file mode 100644 index 7cac56c..0000000 --- a/ffi-deps/simde/simde/x86/avx512/ror.h +++ /dev/null @@ -1,410 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_ROR_H) -#define SIMDE_X86_AVX512_ROR_H - -#include "types.h" -#include "mov.h" -#include "or.h" -#include "srli.h" -#include "slli.h" -#include "../avx2.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_ror_epi32(a, imm8) _mm_ror_epi32(a, imm8) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128i - simde_mm_ror_epi32 (simde__m128i a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE (imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_rl(a_.altivec_i32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 32 - imm8))); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - switch (imm8 & 31) { - case 0: - r_ = a_; - break; - default: - r_.u32 = (a_.u32 >> (imm8 & 31)) | (a_.u32 << (32 - (imm8 & 31))); - break; - } - #else - switch (imm8 & 31) { - case 0: - r_ = a_; - break; - default: - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] >> (imm8 & 31)) | (a_.u32[i] << (32 - (imm8 & 31))); - } - break; - } - #endif - - return simde__m128i_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_ror_epi32 - #define _mm_ror_epi32(a, imm8) simde_mm_ror_epi32(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_mask_ror_epi32(src, k, a, imm8) _mm_mask_ror_epi32(src, k, a, imm8) -#else - #define simde_mm_mask_ror_epi32(src, k, a, imm8) simde_mm_mask_mov_epi32(src, k, simde_mm_ror_epi32(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_ror_epi32 - #define _mm_mask_ror_epi32(src, k, a, imm8) simde_mm_mask_ror_epi32(src, k, a, imm8) 
-#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_maskz_ror_epi32(k, a, imm8) _mm_maskz_ror_epi32(k, a, imm8) -#else - #define simde_mm_maskz_ror_epi32(k, a, imm8) simde_mm_maskz_mov_epi32(k, simde_mm_ror_epi32(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_ror_epi32 - #define _mm_maskz_ror_epi32(k, a, imm8) simde_mm_maskz_ror_epi32(k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_ror_epi32(a, imm8) _mm256_ror_epi32(a, imm8) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m256i - simde_mm256_ror_epi32 (simde__m256i a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE (imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].altivec_i32 = vec_rl(a_.m128i_private[i].altivec_i32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 32 - imm8))); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - switch (imm8 & 31) { - case 0: - r_ = a_; - break; - default: - r_.u32 = (a_.u32 >> (imm8 & 31)) | (a_.u32 << (32 - (imm8 & 31))); - break; - } - #else - switch (imm8 & 31) { - case 0: - r_ = a_; - break; - default: - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] >> (imm8 & 31)) | (a_.u32[i] << (32 - (imm8 & 31))); - } - break; - } - #endif - - return simde__m256i_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_ror_epi32 - #define _mm256_ror_epi32(a, imm8) simde_mm256_ror_epi32(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_mask_ror_epi32(src, k, a, imm8) _mm256_mask_ror_epi32(src, k, a, imm8) -#else - #define simde_mm256_mask_ror_epi32(src, k, a, imm8) simde_mm256_mask_mov_epi32(src, k, simde_mm256_ror_epi32(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_ror_epi32 - #define _mm256_mask_ror_epi32(src, k, a, imm8) simde_mm256_mask_ror_epi32(src, k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_maskz_ror_epi32(k, a, imm8) _mm256_maskz_ror_epi32(k, a, imm8) -#else - #define simde_mm256_maskz_ror_epi32(k, a, imm8) simde_mm256_maskz_mov_epi32(k, simde_mm256_ror_epi32(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_ror_epi32 - #define _mm256_maskz_ror_epi32(k, a, imm8) simde_mm256_maskz_ror_epi32(k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_ror_epi32(a, imm8) _mm512_ror_epi32(a, imm8) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512i - simde_mm512_ror_epi32 (simde__m512i a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE (imm8, 0, 255) { - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].altivec_i32 = vec_rl(a_.m128i_private[i].altivec_i32, vec_splats(HEDLEY_STATIC_CAST(unsigned int, 32 - imm8))); - } - #elif 
defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - switch (imm8 & 31) { - case 0: - r_ = a_; - break; - default: - r_.u32 = (a_.u32 >> (imm8 & 31)) | (a_.u32 << (32 - (imm8 & 31))); - break; - } - #else - switch (imm8 & 31) { - case 0: - r_ = a_; - break; - default: - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] >> (imm8 & 31)) | (a_.u32[i] << (32 - (imm8 & 31))); - } - break; - } - #endif - - return simde__m512i_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_ror_epi32 - #define _mm512_ror_epi32(a, imm8) simde_mm512_ror_epi32(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_ror_epi32(src, k, a, imm8) _mm512_mask_ror_epi32(src, k, a, imm8) -#else - #define simde_mm512_mask_ror_epi32(src, k, a, imm8) simde_mm512_mask_mov_epi32(src, k, simde_mm512_ror_epi32(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_ror_epi32 - #define _mm512_mask_ror_epi32(src, k, a, imm8) simde_mm512_mask_ror_epi32(src, k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_maskz_ror_epi32(k, a, imm8) _mm512_maskz_ror_epi32(k, a, imm8) -#else - #define simde_mm512_maskz_ror_epi32(k, a, imm8) simde_mm512_maskz_mov_epi32(k, simde_mm512_ror_epi32(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_ror_epi32 - #define _mm512_maskz_ror_epi32(k, a, imm8) simde_mm512_maskz_ror_epi32(k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_ror_epi64(a, imm8) _mm_ror_epi64(a, imm8) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128i - simde_mm_ror_epi64 (simde__m128i a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE (imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_i64 = vec_rl(a_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 64 - imm8))); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - switch (imm8 & 63) { - case 0: - r_ = a_; - break; - default: - r_.u64 = (a_.u64 >> (imm8 & 63)) | (a_.u64 << (64 - (imm8 & 63))); - break; - } - #else - switch (imm8 & 63) { - case 0: - r_ = a_; - break; - default: - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] >> (imm8 & 63)) | (a_.u64[i] << (64 - (imm8 & 63))); - } - break; - } - #endif - - return simde__m128i_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_ror_epi64 - #define _mm_ror_epi64(a, imm8) simde_mm_ror_epi64(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_mask_ror_epi64(src, k, a, imm8) _mm_mask_ror_epi64(src, k, a, imm8) -#else - #define simde_mm_mask_ror_epi64(src, k, a, imm8) simde_mm_mask_mov_epi64(src, k, simde_mm_ror_epi64(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_ror_epi64 - #define _mm_mask_ror_epi64(src, k, a, imm8) simde_mm_mask_ror_epi64(src, k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_maskz_ror_epi64(k, a, imm8) _mm_maskz_ror_epi64(k, a, imm8) -#else - #define simde_mm_maskz_ror_epi64(k, a, imm8) simde_mm_maskz_mov_epi64(k, simde_mm_ror_epi64(a, 
imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_ror_epi64 - #define _mm_maskz_ror_epi64(k, a, imm8) simde_mm_maskz_ror_epi64(k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_ror_epi64(a, imm8) _mm256_ror_epi64(a, imm8) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m256i - simde_mm256_ror_epi64 (simde__m256i a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE (imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].altivec_i64 = vec_rl(a_.m128i_private[i].altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 64 - imm8))); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - switch (imm8 & 63) { - case 0: - r_ = a_; - break; - default: - r_.u64 = (a_.u64 >> (imm8 & 63)) | (a_.u64 << (64 - (imm8 & 63))); - break; - } - #else - switch (imm8 & 63) { - case 0: - r_ = a_; - break; - default: - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] >> (imm8 & 63)) | (a_.u64[i] << (64 - (imm8 & 63))); - } - break; - } - #endif - - return simde__m256i_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_ror_epi64 - #define _mm256_ror_epi64(a, imm8) simde_mm256_ror_epi64(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_mask_ror_epi64(src, k, a, imm8) _mm256_mask_ror_epi64(src, k, a, imm8) -#else - #define simde_mm256_mask_ror_epi64(src, k, a, imm8) simde_mm256_mask_mov_epi64(src, k, simde_mm256_ror_epi64(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_ror_epi64 - #define _mm256_mask_ror_epi64(src, k, a, imm8) simde_mm256_mask_ror_epi64(src, k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_maskz_ror_epi64(k, a, imm8) _mm256_maskz_ror_epi64(k, a, imm8) -#else - #define simde_mm256_maskz_ror_epi64(k, a, imm8) simde_mm256_maskz_mov_epi64(k, simde_mm256_ror_epi64(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_ror_epi64 - #define _mm256_maskz_ror_epi64(k, a, imm8) simde_mm256_maskz_ror_epi64(k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_ror_epi64(a, imm8) _mm512_ror_epi64(a, imm8) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512i - simde_mm512_ror_epi64 (simde__m512i a, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE (imm8, 0, 255) { - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].altivec_i64 = vec_rl(a_.m128i_private[i].altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 64 - imm8))); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - switch (imm8 & 63) { - case 0: - r_ = a_; - break; - default: - r_.u64 = (a_.u64 >> (imm8 & 63)) | (a_.u64 << (64 - (imm8 & 63))); - break; - } - #else - switch (imm8 & 63) { - case 0: - r_ = a_; - break; - default: - 
SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] >> (imm8 & 63)) | (a_.u64[i] << (64 - (imm8 & 63))); - } - break; - } - #endif - - return simde__m512i_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_ror_epi64 - #define _mm512_ror_epi64(a, imm8) simde_mm512_ror_epi64(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_ror_epi64(src, k, a, imm8) _mm512_mask_ror_epi64(src, k, a, imm8) -#else - #define simde_mm512_mask_ror_epi64(src, k, a, imm8) simde_mm512_mask_mov_epi64(src, k, simde_mm512_ror_epi64(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_ror_epi64 - #define _mm512_mask_ror_epi64(src, k, a, imm8) simde_mm512_mask_ror_epi64(src, k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_maskz_ror_epi64(k, a, imm8) _mm512_maskz_ror_epi64(k, a, imm8) -#else - #define simde_mm512_maskz_ror_epi64(k, a, imm8) simde_mm512_maskz_mov_epi64(k, simde_mm512_ror_epi64(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_ror_epi64 - #define _mm512_maskz_ror_epi64(k, a, imm8) simde_mm512_maskz_ror_epi64(k, a, imm8) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_ROR_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/rorv.h b/ffi-deps/simde/simde/x86/avx512/rorv.h deleted file mode 100644 index ae87cec..0000000 --- a/ffi-deps/simde/simde/x86/avx512/rorv.h +++ /dev/null @@ -1,391 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_RORV_H) -#define SIMDE_X86_AVX512_RORV_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" -#include "srlv.h" -#include "sllv.h" -#include "or.h" -#include "and.h" -#include "sub.h" -#include "set1.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_rorv_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_rorv_epi32(a, b); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - r_.altivec_i32 = vec_rl(a_.altivec_i32, vec_sub(vec_splats(HEDLEY_STATIC_CAST(unsigned int, 32)), b_.altivec_u32)); - return simde__m128i_from_private(r_); - #else - simde__m128i - count1 = simde_mm_and_si128(b, simde_mm_set1_epi32(31)), - count2 = simde_mm_sub_epi32(simde_mm_set1_epi32(32), count1); - return simde_mm_or_si128(simde_mm_srlv_epi32(a, count1), simde_mm_sllv_epi32(a, count2)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_rorv_epi32 - #define _mm_rorv_epi32(a, b) simde_mm_rorv_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_rorv_epi32 (simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_rorv_epi32(src, k, a, b); - #else - return simde_mm_mask_mov_epi32(src, k, simde_mm_rorv_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_rorv_epi32 - #define _mm_mask_rorv_epi32(src, k, a, b) simde_mm_mask_rorv_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_rorv_epi32 (simde__mmask8 k, 
simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_rorv_epi32(k, a, b); - #else - return simde_mm_maskz_mov_epi32(k, simde_mm_rorv_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_rorv_epi32 - #define _mm_maskz_rorv_epi32(k, a, b) simde_mm_maskz_rorv_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_rorv_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_rorv_epi32(a, b); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].altivec_i32 = vec_rl(a_.m128i_private[i].altivec_i32, vec_sub(vec_splats(HEDLEY_STATIC_CAST(unsigned int, 32)), b_.m128i_private[i].altivec_u32)); - } - - return simde__m256i_from_private(r_); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - r_.m128i[0] = simde_mm_rorv_epi32(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_rorv_epi32(a_.m128i[1], b_.m128i[1]); - - return simde__m256i_from_private(r_); - #else - simde__m256i - count1 = simde_mm256_and_si256(b, simde_mm256_set1_epi32(31)), - count2 = simde_mm256_sub_epi32(simde_mm256_set1_epi32(32), count1); - return simde_mm256_or_si256(simde_mm256_srlv_epi32(a, count1), simde_mm256_sllv_epi32(a, count2)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_rorv_epi32 - #define _mm256_rorv_epi32(a, b) simde_mm256_rorv_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_rorv_epi32 (simde__m256i src, simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_rorv_epi32(src, k, a, b); - #else - return simde_mm256_mask_mov_epi32(src, k, simde_mm256_rorv_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_rorv_epi32 - #define _mm256_mask_rorv_epi32(src, k, a, b) simde_mm256_mask_rorv_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_rorv_epi32 (simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_rorv_epi32(k, a, b); - #else - return simde_mm256_maskz_mov_epi32(k, simde_mm256_rorv_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_rorv_epi32 - #define _mm256_maskz_rorv_epi32(k, a, b) simde_mm256_maskz_rorv_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_rorv_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_rorv_epi32(a, b); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].altivec_i32 = 
vec_rl(a_.m128i_private[i].altivec_i32, vec_sub(vec_splats(HEDLEY_STATIC_CAST(unsigned int, 32)), b_.m128i_private[i].altivec_u32)); - } - - return simde__m512i_from_private(r_); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - r_.m256i[0] = simde_mm256_rorv_epi32(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_rorv_epi32(a_.m256i[1], b_.m256i[1]); - - return simde__m512i_from_private(r_); - #else - simde__m512i - count1 = simde_mm512_and_si512(b, simde_mm512_set1_epi32(31)), - count2 = simde_mm512_sub_epi32(simde_mm512_set1_epi32(32), count1); - return simde_mm512_or_si512(simde_mm512_srlv_epi32(a, count1), simde_mm512_sllv_epi32(a, count2)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_rorv_epi32 - #define _mm512_rorv_epi32(a, b) simde_mm512_rorv_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_rorv_epi32 (simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_rorv_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_rorv_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_rorv_epi32 - #define _mm512_mask_rorv_epi32(src, k, a, b) simde_mm512_mask_rorv_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_rorv_epi32 (simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_rorv_epi32(k, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_rorv_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_rorv_epi32 - #define _mm512_maskz_rorv_epi32(k, a, b) simde_mm512_maskz_rorv_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_rorv_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_rorv_epi64(a, b); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - r_.altivec_i64 = vec_rl(a_.altivec_i64, vec_sub(vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 64)), b_.altivec_u64)); - return simde__m128i_from_private(r_); - #else - simde__m128i - count1 = simde_mm_and_si128(b, simde_mm_set1_epi64x(63)), - count2 = simde_mm_sub_epi64(simde_mm_set1_epi64x(64), count1); - return simde_mm_or_si128(simde_mm_srlv_epi64(a, count1), simde_mm_sllv_epi64(a, count2)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_rorv_epi64 - #define _mm_rorv_epi64(a, b) simde_mm_rorv_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_rorv_epi64 (simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_rorv_epi64(src, k, a, b); - #else - return simde_mm_mask_mov_epi64(src, k, simde_mm_rorv_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_rorv_epi64 - #define _mm_mask_rorv_epi64(src, k, a, b) simde_mm_mask_rorv_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_rorv_epi64 (simde__mmask8 k, simde__m128i a, 
simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_rorv_epi64(k, a, b); - #else - return simde_mm_maskz_mov_epi64(k, simde_mm_rorv_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_rorv_epi64 - #define _mm_maskz_rorv_epi64(k, a, b) simde_mm_maskz_rorv_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_rorv_epi64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_rorv_epi64(a, b); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].altivec_i64 = vec_rl(a_.m128i_private[i].altivec_i64, vec_sub(vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 64)), b_.m128i_private[i].altivec_u64)); - } - - return simde__m256i_from_private(r_); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - r_.m128i[0] = simde_mm_rorv_epi64(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_rorv_epi64(a_.m128i[1], b_.m128i[1]); - - return simde__m256i_from_private(r_); - #else - simde__m256i - count1 = simde_mm256_and_si256(b, simde_mm256_set1_epi64x(63)), - count2 = simde_mm256_sub_epi64(simde_mm256_set1_epi64x(64), count1); - return simde_mm256_or_si256(simde_mm256_srlv_epi64(a, count1), simde_mm256_sllv_epi64(a, count2)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_rorv_epi64 - #define _mm256_rorv_epi64(a, b) simde_mm256_rorv_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_rorv_epi64 (simde__m256i src, simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_rorv_epi64(src, k, a, b); - #else - return simde_mm256_mask_mov_epi64(src, k, simde_mm256_rorv_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_rorv_epi64 - #define _mm256_mask_rorv_epi64(src, k, a, b) simde_mm256_mask_rorv_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_rorv_epi64 (simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_rorv_epi64(k, a, b); - #else - return simde_mm256_maskz_mov_epi64(k, simde_mm256_rorv_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_rorv_epi64 - #define _mm256_maskz_rorv_epi64(k, a, b) simde_mm256_maskz_rorv_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_rorv_epi64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_rorv_epi64(a, b); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - for (size_t i = 0 ; i < (sizeof(r_.m128i_private) / sizeof(r_.m128i_private[0])) ; i++) { - r_.m128i_private[i].altivec_i64 = 
vec_rl(a_.m128i_private[i].altivec_i64, vec_sub(vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 64)), b_.m128i_private[i].altivec_u64)); - } - - return simde__m512i_from_private(r_); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - r_.m256i[0] = simde_mm256_rorv_epi64(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_rorv_epi64(a_.m256i[1], b_.m256i[1]); - - return simde__m512i_from_private(r_); - #else - simde__m512i - count1 = simde_mm512_and_si512(b, simde_mm512_set1_epi64(63)), - count2 = simde_mm512_sub_epi64(simde_mm512_set1_epi64(64), count1); - return simde_mm512_or_si512(simde_mm512_srlv_epi64(a, count1), simde_mm512_sllv_epi64(a, count2)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_rorv_epi64 - #define _mm512_rorv_epi64(a, b) simde_mm512_rorv_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_rorv_epi64 (simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_rorv_epi64(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_rorv_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_rorv_epi64 - #define _mm512_mask_rorv_epi64(src, k, a, b) simde_mm512_mask_rorv_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_rorv_epi64 (simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_rorv_epi64(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_rorv_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_rorv_epi64 - #define _mm512_maskz_rorv_epi64(k, a, b) simde_mm512_maskz_rorv_epi64(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_RORV_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/round.h b/ffi-deps/simde/simde/x86/avx512/round.h deleted file mode 100644 index 684dbe0..0000000 --- a/ffi-deps/simde/simde/x86/avx512/round.h +++ /dev/null @@ -1,282 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_ROUND_H) -#define SIMDE_X86_AVX512_ROUND_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -#if SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_x_mm512_round_ps(a, rounding) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512_private \ - simde_x_mm512_round_ps_r_ = simde__m512_to_private(simde_mm512_setzero_ps()), \ - simde_x_mm512_round_ps_a_ = simde__m512_to_private(a); \ - \ - for (size_t simde_x_mm512_round_ps_i = 0 ; simde_x_mm512_round_ps_i < (sizeof(simde_x_mm512_round_ps_r_.m256) / sizeof(simde_x_mm512_round_ps_r_.m256[0])) ; simde_x_mm512_round_ps_i++) { \ - simde_x_mm512_round_ps_r_.m256[simde_x_mm512_round_ps_i] = simde_mm256_round_ps(simde_x_mm512_round_ps_a_.m256[simde_x_mm512_round_ps_i], rounding); \ - } \ - \ - simde__m512_from_private(simde_x_mm512_round_ps_r_); \ - })) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512 - simde_x_mm512_round_ps (simde__m512 a, int rounding) - SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) { - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - /* For architectures which lack a current direction SIMD instruction. 
- * - * Note that NEON actually has a current rounding mode instruction, - * but in ARMv8+ the rounding mode is ignored and nearest is always - * used, so we treat ARMv7 as having a rounding mode but ARMv8 as - * not. */ - #if \ - defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ - defined(SIMDE_ARM_NEON_A32V8) - if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION) - rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) << 13; - #endif - - switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { - case SIMDE_MM_FROUND_CUR_DIRECTION: - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) { - r_.m128_private[i].altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_round(a_.m128_private[i].altivec_f32)); - } - #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399) - for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) { - r_.m128_private[i].neon_f32 = vrndiq_f32(a_.m128_private[i].neon_f32); - } - #elif defined(simde_math_nearbyintf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_nearbyintf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_ps()); - #endif - break; - - case SIMDE_MM_FROUND_TO_NEAREST_INT: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) { - r_.m128_private[i].altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_rint(a_.m128_private[i].altivec_f32)); - } - #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) { - r_.m128_private[i].neon_f32 = vrndnq_f32(a_.m128_private[i].neon_f32); - } - #elif defined(simde_math_roundevenf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_roundevenf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_ps()); - #endif - break; - - case SIMDE_MM_FROUND_TO_NEG_INF: - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) { - r_.m128_private[i].altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_floor(a_.m128_private[i].altivec_f32)); - } - #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) { - r_.m128_private[i].neon_f32 = vrndmq_f32(a_.m128_private[i].neon_f32); - } - #elif defined(simde_math_floorf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_floorf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_ps()); - #endif - break; - - case SIMDE_MM_FROUND_TO_POS_INF: - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) { - r_.m128_private[i].altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_ceil(a_.m128_private[i].altivec_f32)); - } - #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) { - r_.m128_private[i].neon_f32 = 
vrndpq_f32(a_.m128_private[i].neon_f32); - } - #elif defined(simde_math_ceilf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_ceilf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_ps()); - #endif - break; - - case SIMDE_MM_FROUND_TO_ZERO: - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) { - r_.m128_private[i].altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_trunc(a_.m128_private[i].altivec_f32)); - } - #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) { - r_.m128_private[i].neon_f32 = vrndq_f32(a_.m128_private[i].neon_f32); - } - #elif defined(simde_math_truncf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_truncf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_ps()); - #endif - break; - - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_ps()); - } - - return simde__m512_from_private(r_); - } -#endif - -#if SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_x_mm512_round_pd(a, rounding) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512d_private \ - simde_x_mm512_round_pd_r_ = simde__m512d_to_private(simde_mm512_setzero_pd()), \ - simde_x_mm512_round_pd_a_ = simde__m512d_to_private(a); \ - \ - for (size_t simde_x_mm512_round_pd_i = 0 ; simde_x_mm512_round_pd_i < (sizeof(simde_x_mm512_round_pd_r_.m256d) / sizeof(simde_x_mm512_round_pd_r_.m256d[0])) ; simde_x_mm512_round_pd_i++) { \ - simde_x_mm512_round_pd_r_.m256d[simde_x_mm512_round_pd_i] = simde_mm256_round_pd(simde_x_mm512_round_pd_a_.m256d[simde_x_mm512_round_pd_i], rounding); \ - } \ - \ - simde__m512d_from_private(simde_x_mm512_round_pd_r_); \ - })) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512d - simde_x_mm512_round_pd (simde__m512d a, int rounding) - SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) { - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - /* For architectures which lack a current direction SIMD instruction. 
*/ - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION) - rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) << 13; - #endif - - switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { - case SIMDE_MM_FROUND_CUR_DIRECTION: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) { - r_.m128d_private[i].altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.m128d_private[i].altivec_f64)); - } - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) { - r_.m128d_private[i].neon_f64 = vrndiq_f64(a_.m128d_private[i].neon_f64); - } - #elif defined(simde_math_nearbyint) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_nearbyint(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_NEAREST_INT: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) { - r_.m128d_private[i].altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.m128d_private[i].altivec_f64)); - } - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) { - r_.m128d_private[i].neon_f64 = vrndaq_f64(a_.m128d_private[i].neon_f64); - } - #elif defined(simde_math_roundeven) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_roundeven(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_NEG_INF: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) { - r_.m128d_private[i].altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_floor(a_.m128d_private[i].altivec_f64)); - } - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) { - r_.m128d_private[i].neon_f64 = vrndmq_f64(a_.m128d_private[i].neon_f64); - } - #elif defined(simde_math_floor) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_floor(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_POS_INF: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) { - r_.m128d_private[i].altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_ceil(a_.m128d_private[i].altivec_f64)); - } - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) { - r_.m128d_private[i].neon_f64 = vrndpq_f64(a_.m128d_private[i].neon_f64); - } - #elif defined(simde_math_ceil) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_ceil(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_pd()); - 
#endif - break; - - case SIMDE_MM_FROUND_TO_ZERO: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) { - r_.m128d_private[i].altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_trunc(a_.m128d_private[i].altivec_f64)); - } - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - for (size_t i = 0 ; i < (sizeof(r_.m128d_private) / sizeof(r_.m128d_private[0])) ; i++) { - r_.m128d_private[i].neon_f64 = vrndq_f64(a_.m128d_private[i].neon_f64); - } - #elif defined(simde_math_trunc) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_trunc(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_pd()); - #endif - break; - - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm512_setzero_pd()); - } - - return simde__m512d_from_private(r_); - } -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_ROUND_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/roundscale.h b/ffi-deps/simde/simde/x86/avx512/roundscale.h deleted file mode 100644 index 80c9abf..0000000 --- a/ffi-deps/simde/simde/x86/avx512/roundscale.h +++ /dev/null @@ -1,616 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_ROUNDSCALE_H) -#define SIMDE_X86_AVX512_ROUNDSCALE_H - -#include "types.h" -#include "andnot.h" -#include "set1.h" -#include "mul.h" -#include "round.h" -#include "cmpeq.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_roundscale_ps(a, imm8) _mm_roundscale_ps((a), (imm8)) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128 - simde_mm_roundscale_ps_internal_ (simde__m128 result, simde__m128 a, int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - HEDLEY_STATIC_CAST(void, imm8); - - simde__m128 r, clear_sign; - - clear_sign = simde_mm_andnot_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(-0.0)), result); - r = simde_x_mm_select_ps(result, a, simde_mm_cmpeq_ps(clear_sign, simde_mm_set1_ps(SIMDE_MATH_INFINITYF))); - - return r; - } - #define simde_mm_roundscale_ps(a, imm8) \ - simde_mm_roundscale_ps_internal_( \ - simde_mm_mul_ps( \ - simde_mm_round_ps( \ - simde_mm_mul_ps( \ - a, \ - simde_mm_set1_ps(simde_math_exp2f(((imm8 >> 4) & 15)))), \ - ((imm8) & 15) \ - ), \ - simde_mm_set1_ps(simde_math_exp2f(-((imm8 >> 4) & 15))) \ - ), \ - (a), \ - (imm8) \ - ) -#endif -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_roundscale_ps - #define _mm_roundscale_ps(a, imm8) simde_mm_roundscale_ps(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_mask_roundscale_ps(src, k, a, imm8) _mm_mask_roundscale_ps(src, k, a, imm8) -#else - #define simde_mm_mask_roundscale_ps(src, k, a, imm8) simde_mm_mask_mov_ps(src, k, simde_mm_roundscale_ps(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_roundscale_ps - #define _mm_mask_roundscale_ps(src, k, a, imm8) simde_mm_mask_roundscale_ps(src, k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_maskz_roundscale_ps(k, a, imm8) _mm_maskz_roundscale_ps(k, a, imm8) -#else - #define simde_mm_maskz_roundscale_ps(k, a, imm8) simde_mm_maskz_mov_ps(k, simde_mm_roundscale_ps(a, 
imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_roundscale_ps - #define _mm_maskz_roundscale_ps(k, a, imm8) simde_mm_maskz_roundscale_ps(k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm256_roundscale_ps(a, imm8) _mm256_roundscale_ps((a), (imm8)) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm256_roundscale_ps(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m256_private \ - simde_mm256_roundscale_ps_r_ = simde__m256_to_private(simde_mm256_setzero_ps()), \ - simde_mm256_roundscale_ps_a_ = simde__m256_to_private(a); \ - \ - for (size_t simde_mm256_roundscale_ps_i = 0 ; simde_mm256_roundscale_ps_i < (sizeof(simde_mm256_roundscale_ps_r_.m128) / sizeof(simde_mm256_roundscale_ps_r_.m128[0])) ; simde_mm256_roundscale_ps_i++) { \ - simde_mm256_roundscale_ps_r_.m128[simde_mm256_roundscale_ps_i] = simde_mm_roundscale_ps(simde_mm256_roundscale_ps_a_.m128[simde_mm256_roundscale_ps_i], imm8); \ - } \ - \ - simde__m256_from_private(simde_mm256_roundscale_ps_r_); \ - })) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m256 - simde_mm256_roundscale_ps_internal_ (simde__m256 result, simde__m256 a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - HEDLEY_STATIC_CAST(void, imm8); - - simde__m256 r, clear_sign; - - clear_sign = simde_mm256_andnot_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C(-0.0)), result); - r = simde_x_mm256_select_ps(result, a, simde_mm256_castsi256_ps(simde_mm256_cmpeq_epi32(simde_mm256_castps_si256(clear_sign), simde_mm256_castps_si256(simde_mm256_set1_ps(SIMDE_MATH_INFINITYF))))); - - return r; - } - #define simde_mm256_roundscale_ps(a, imm8) \ - simde_mm256_roundscale_ps_internal_( \ - simde_mm256_mul_ps( \ - simde_mm256_round_ps( \ - simde_mm256_mul_ps( \ - a, \ - simde_mm256_set1_ps(simde_math_exp2f(((imm8 >> 4) & 15)))), \ - ((imm8) & 15) \ - ), \ - simde_mm256_set1_ps(simde_math_exp2f(-((imm8 >> 4) & 15))) \ - ), \ - (a), \ - (imm8) \ - ) -#endif -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_roundscale_ps - #define _mm256_roundscale_ps(a, imm8) simde_mm256_roundscale_ps(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_mask_roundscale_ps(src, k, a, imm8) _mm256_mask_roundscale_ps(src, k, a, imm8) -#else - #define simde_mm256_mask_roundscale_ps(src, k, a, imm8) simde_mm256_mask_mov_ps(src, k, simde_mm256_roundscale_ps(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_roundscale_ps - #define _mm256_mask_roundscale_ps(src, k, a, imm8) simde_mm256_mask_roundscale_ps(src, k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_maskz_roundscale_ps(k, a, imm8) _mm256_maskz_roundscale_ps(k, a, imm8) -#else - #define simde_mm256_maskz_roundscale_ps(k, a, imm8) simde_mm256_maskz_mov_ps(k, simde_mm256_roundscale_ps(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_roundscale_ps - #define _mm256_maskz_roundscale_ps(k, a, imm8) simde_mm256_maskz_roundscale_ps(k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_roundscale_ps(a, imm8) _mm512_roundscale_ps((a), 
(imm8)) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_roundscale_ps(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512_private \ - simde_mm512_roundscale_ps_r_ = simde__m512_to_private(simde_mm512_setzero_ps()), \ - simde_mm512_roundscale_ps_a_ = simde__m512_to_private(a); \ - \ - for (size_t simde_mm512_roundscale_ps_i = 0 ; simde_mm512_roundscale_ps_i < (sizeof(simde_mm512_roundscale_ps_r_.m256) / sizeof(simde_mm512_roundscale_ps_r_.m256[0])) ; simde_mm512_roundscale_ps_i++) { \ - simde_mm512_roundscale_ps_r_.m256[simde_mm512_roundscale_ps_i] = simde_mm256_roundscale_ps(simde_mm512_roundscale_ps_a_.m256[simde_mm512_roundscale_ps_i], imm8); \ - } \ - \ - simde__m512_from_private(simde_mm512_roundscale_ps_r_); \ - })) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512 - simde_mm512_roundscale_ps_internal_ (simde__m512 result, simde__m512 a, int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - HEDLEY_STATIC_CAST(void, imm8); - - simde__m512 r, clear_sign; - - clear_sign = simde_mm512_andnot_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C(-0.0)), result); - r = simde_mm512_mask_mov_ps(result, simde_mm512_cmpeq_epi32_mask(simde_mm512_castps_si512(clear_sign), simde_mm512_castps_si512(simde_mm512_set1_ps(SIMDE_MATH_INFINITYF))), a); - - return r; - } - #define simde_mm512_roundscale_ps(a, imm8) \ - simde_mm512_roundscale_ps_internal_( \ - simde_mm512_mul_ps( \ - simde_x_mm512_round_ps( \ - simde_mm512_mul_ps( \ - a, \ - simde_mm512_set1_ps(simde_math_exp2f(((imm8 >> 4) & 15)))), \ - ((imm8) & 15) \ - ), \ - simde_mm512_set1_ps(simde_math_exp2f(-((imm8 >> 4) & 15))) \ - ), \ - (a), \ - (imm8) \ - ) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_roundscale_ps - #define _mm512_roundscale_ps(a, imm8) simde_mm512_roundscale_ps(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_roundscale_ps(src, k, a, imm8) _mm512_mask_roundscale_ps(src, k, a, imm8) -#else - #define simde_mm512_mask_roundscale_ps(src, k, a, imm8) simde_mm512_mask_mov_ps(src, k, simde_mm512_roundscale_ps(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_roundscale_ps - #define _mm512_mask_roundscale_ps(src, k, a, imm8) simde_mm512_mask_roundscale_ps(src, k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_maskz_roundscale_ps(k, a, imm8) _mm512_maskz_roundscale_ps(k, a, imm8) -#else - #define simde_mm512_maskz_roundscale_ps(k, a, imm8) simde_mm512_maskz_mov_ps(k, simde_mm512_roundscale_ps(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_roundscale_ps - #define _mm512_maskz_roundscale_ps(k, a, imm8) simde_mm512_maskz_roundscale_ps(k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_roundscale_pd(a, imm8) _mm_roundscale_pd((a), (imm8)) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128d - simde_mm_roundscale_pd_internal_ (simde__m128d result, simde__m128d a, int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - HEDLEY_STATIC_CAST(void, imm8); - - simde__m128d r, clear_sign; - - clear_sign = simde_mm_andnot_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C(-0.0)), result); - r = simde_x_mm_select_pd(result, a, simde_mm_cmpeq_pd(clear_sign, simde_mm_set1_pd(SIMDE_MATH_INFINITY))); - - return r; - } - #define simde_mm_roundscale_pd(a, imm8) \ - simde_mm_roundscale_pd_internal_( \ - simde_mm_mul_pd( \ - simde_mm_round_pd( \ - simde_mm_mul_pd( \ - a, \ - 
simde_mm_set1_pd(simde_math_exp2(((imm8 >> 4) & 15)))), \ - ((imm8) & 15) \ - ), \ - simde_mm_set1_pd(simde_math_exp2(-((imm8 >> 4) & 15))) \ - ), \ - (a), \ - (imm8) \ - ) -#endif -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_roundscale_pd - #define _mm_roundscale_pd(a, imm8) simde_mm_roundscale_pd(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_mask_roundscale_pd(src, k, a, imm8) _mm_mask_roundscale_pd(src, k, a, imm8) -#else - #define simde_mm_mask_roundscale_pd(src, k, a, imm8) simde_mm_mask_mov_pd(src, k, simde_mm_roundscale_pd(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_roundscale_pd - #define _mm_mask_roundscale_pd(src, k, a, imm8) simde_mm_mask_roundscale_pd(src, k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_maskz_roundscale_pd(k, a, imm8) _mm_maskz_roundscale_pd(k, a, imm8) -#else - #define simde_mm_maskz_roundscale_pd(k, a, imm8) simde_mm_maskz_mov_pd(k, simde_mm_roundscale_pd(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_roundscale_pd - #define _mm_maskz_roundscale_pd(k, a, imm8) simde_mm_maskz_roundscale_pd(k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm256_roundscale_pd(a, imm8) _mm256_roundscale_pd((a), (imm8)) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm256_roundscale_pd(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m256d_private \ - simde_mm256_roundscale_pd_r_ = simde__m256d_to_private(simde_mm256_setzero_pd()), \ - simde_mm256_roundscale_pd_a_ = simde__m256d_to_private(a); \ - \ - for (size_t simde_mm256_roundscale_pd_i = 0 ; simde_mm256_roundscale_pd_i < (sizeof(simde_mm256_roundscale_pd_r_.m128d) / sizeof(simde_mm256_roundscale_pd_r_.m128d[0])) ; simde_mm256_roundscale_pd_i++) { \ - simde_mm256_roundscale_pd_r_.m128d[simde_mm256_roundscale_pd_i] = simde_mm_roundscale_pd(simde_mm256_roundscale_pd_a_.m128d[simde_mm256_roundscale_pd_i], imm8); \ - } \ - \ - simde__m256d_from_private(simde_mm256_roundscale_pd_r_); \ - })) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m256d - simde_mm256_roundscale_pd_internal_ (simde__m256d result, simde__m256d a, int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - HEDLEY_STATIC_CAST(void, imm8); - - simde__m256d r, clear_sign; - - clear_sign = simde_mm256_andnot_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(-0.0)), result); - r = simde_x_mm256_select_pd(result, a, simde_mm256_castsi256_pd(simde_mm256_cmpeq_epi64(simde_mm256_castpd_si256(clear_sign), simde_mm256_castpd_si256(simde_mm256_set1_pd(SIMDE_MATH_INFINITY))))); - - return r; - } - #define simde_mm256_roundscale_pd(a, imm8) \ - simde_mm256_roundscale_pd_internal_( \ - simde_mm256_mul_pd( \ - simde_mm256_round_pd( \ - simde_mm256_mul_pd( \ - a, \ - simde_mm256_set1_pd(simde_math_exp2(((imm8 >> 4) & 15)))), \ - ((imm8) & 15) \ - ), \ - simde_mm256_set1_pd(simde_math_exp2(-((imm8 >> 4) & 15))) \ - ), \ - (a), \ - (imm8) \ - ) -#endif -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_roundscale_pd - #define _mm256_roundscale_pd(a, imm8) simde_mm256_roundscale_pd(a, imm8) -#endif - -#if 
defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_mask_roundscale_pd(src, k, a, imm8) _mm256_mask_roundscale_pd(src, k, a, imm8) -#else - #define simde_mm256_mask_roundscale_pd(src, k, a, imm8) simde_mm256_mask_mov_pd(src, k, simde_mm256_roundscale_pd(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_roundscale_pd - #define _mm256_mask_roundscale_pd(src, k, a, imm8) simde_mm256_mask_roundscale_pd(src, k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_maskz_roundscale_pd(k, a, imm8) _mm256_maskz_roundscale_pd(k, a, imm8) -#else - #define simde_mm256_maskz_roundscale_pd(k, a, imm8) simde_mm256_maskz_mov_pd(k, simde_mm256_roundscale_pd(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_roundscale_pd - #define _mm256_maskz_roundscale_pd(k, a, imm8) simde_mm256_maskz_roundscale_pd(k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_roundscale_pd(a, imm8) _mm512_roundscale_pd((a), (imm8)) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_roundscale_pd(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512d_private \ - simde_mm512_roundscale_pd_r_ = simde__m512d_to_private(simde_mm512_setzero_pd()), \ - simde_mm512_roundscale_pd_a_ = simde__m512d_to_private(a); \ - \ - for (size_t simde_mm512_roundscale_pd_i = 0 ; simde_mm512_roundscale_pd_i < (sizeof(simde_mm512_roundscale_pd_r_.m256d) / sizeof(simde_mm512_roundscale_pd_r_.m256d[0])) ; simde_mm512_roundscale_pd_i++) { \ - simde_mm512_roundscale_pd_r_.m256d[simde_mm512_roundscale_pd_i] = simde_mm256_roundscale_pd(simde_mm512_roundscale_pd_a_.m256d[simde_mm512_roundscale_pd_i], imm8); \ - } \ - \ - simde__m512d_from_private(simde_mm512_roundscale_pd_r_); \ - })) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512d - simde_mm512_roundscale_pd_internal_ (simde__m512d result, simde__m512d a, int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - HEDLEY_STATIC_CAST(void, imm8); - - simde__m512d r, clear_sign; - - clear_sign = simde_mm512_andnot_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C(-0.0)), result); - r = simde_mm512_mask_mov_pd(result, simde_mm512_cmpeq_epi64_mask(simde_mm512_castpd_si512(clear_sign), simde_mm512_castpd_si512(simde_mm512_set1_pd(SIMDE_MATH_INFINITY))), a); - - return r; - } - #define simde_mm512_roundscale_pd(a, imm8) \ - simde_mm512_roundscale_pd_internal_( \ - simde_mm512_mul_pd( \ - simde_x_mm512_round_pd( \ - simde_mm512_mul_pd( \ - a, \ - simde_mm512_set1_pd(simde_math_exp2(((imm8 >> 4) & 15)))), \ - ((imm8) & 15) \ - ), \ - simde_mm512_set1_pd(simde_math_exp2(-((imm8 >> 4) & 15))) \ - ), \ - (a), \ - (imm8) \ - ) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_roundscale_pd - #define _mm512_roundscale_pd(a, imm8) simde_mm512_roundscale_pd(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_roundscale_pd(src, k, a, imm8) _mm512_mask_roundscale_pd(src, k, a, imm8) -#else - #define simde_mm512_mask_roundscale_pd(src, k, a, imm8) simde_mm512_mask_mov_pd(src, k, simde_mm512_roundscale_pd(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_roundscale_pd - #define _mm512_mask_roundscale_pd(src, k, a, imm8) simde_mm512_mask_roundscale_pd(src, k, a, imm8) -#endif - -#if 
defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_maskz_roundscale_pd(k, a, imm8) _mm512_maskz_roundscale_pd(k, a, imm8) -#else - #define simde_mm512_maskz_roundscale_pd(k, a, imm8) simde_mm512_maskz_mov_pd(k, simde_mm512_roundscale_pd(a, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_roundscale_pd - #define _mm512_maskz_roundscale_pd(k, a, imm8) simde_mm512_maskz_roundscale_pd(k, a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_roundscale_ss(a, b, imm8) _mm_roundscale_ss((a), (b), (imm8)) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128 - simde_mm_roundscale_ss_internal_ (simde__m128 result, simde__m128 b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - HEDLEY_STATIC_CAST(void, imm8); - - simde__m128_private - r_ = simde__m128_to_private(result), - b_ = simde__m128_to_private(b); - - if(simde_math_isinff(r_.f32[0])) - r_.f32[0] = b_.f32[0]; - - return simde__m128_from_private(r_); - } - #define simde_mm_roundscale_ss(a, b, imm8) \ - simde_mm_roundscale_ss_internal_( \ - simde_mm_mul_ss( \ - simde_mm_round_ss( \ - a, \ - simde_mm_mul_ss( \ - b, \ - simde_mm_set1_ps(simde_math_exp2f(((imm8 >> 4) & 15)))), \ - ((imm8) & 15) \ - ), \ - simde_mm_set1_ps(simde_math_exp2f(-((imm8 >> 4) & 15))) \ - ), \ - (b), \ - (imm8) \ - ) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_roundscale_ss - #define _mm_roundscale_ss(a, b, imm8) simde_mm_roundscale_ss(a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_92035) - #define simde_mm_mask_roundscale_ss(src, k, a, b, imm8) _mm_mask_roundscale_ss((src), (k), (a), (b), (imm8)) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128 - simde_mm_mask_roundscale_ss_internal_ (simde__m128 a, simde__m128 b, simde__mmask8 k) { - simde__m128 r; - - if(k & 1) - r = a; - else - r = b; - - return r; - } - #define simde_mm_mask_roundscale_ss(src, k, a, b, imm8) \ - simde_mm_mask_roundscale_ss_internal_( \ - simde_mm_roundscale_ss( \ - a, \ - b, \ - imm8 \ - ), \ - simde_mm_move_ss( \ - (a), \ - (src) \ - ), \ - (k) \ - ) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_roundscale_ss - #define _mm_mask_roundscale_ss(src, k, a, b, imm8) simde_mm_mask_roundscale_ss(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_92035) - #define simde_mm_maskz_roundscale_ss(k, a, b, imm8) _mm_maskz_roundscale_ss((k), (a), (b), (imm8)) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128 - simde_mm_maskz_roundscale_ss_internal_ (simde__m128 a, simde__m128 b, simde__mmask8 k) { - simde__m128 r; - - if(k & 1) - r = a; - else - r = b; - - return r; - } - #define simde_mm_maskz_roundscale_ss(k, a, b, imm8) \ - simde_mm_maskz_roundscale_ss_internal_( \ - simde_mm_roundscale_ss( \ - a, \ - b, \ - imm8 \ - ), \ - simde_mm_move_ss( \ - (a), \ - simde_mm_setzero_ps() \ - ), \ - (k) \ - ) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_roundscale_ss - #define _mm_maskz_roundscale_ss(k, a, b, imm8) simde_mm_maskz_roundscale_ss(k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_roundscale_sd(a, b, imm8) _mm_roundscale_sd((a), (b), (imm8)) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128d - simde_mm_roundscale_sd_internal_ (simde__m128d result, simde__m128d b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - HEDLEY_STATIC_CAST(void, imm8); - - simde__m128d_private - r_ = simde__m128d_to_private(result), 
- b_ = simde__m128d_to_private(b); - - if(simde_math_isinf(r_.f64[0])) - r_.f64[0] = b_.f64[0]; - - return simde__m128d_from_private(r_); - } - #define simde_mm_roundscale_sd(a, b, imm8) \ - simde_mm_roundscale_sd_internal_( \ - simde_mm_mul_sd( \ - simde_mm_round_sd( \ - a, \ - simde_mm_mul_sd( \ - b, \ - simde_mm_set1_pd(simde_math_exp2(((imm8 >> 4) & 15)))), \ - ((imm8) & 15) \ - ), \ - simde_mm_set1_pd(simde_math_exp2(-((imm8 >> 4) & 15))) \ - ), \ - (b), \ - (imm8) \ - ) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_roundscale_sd - #define _mm_roundscale_sd(a, b, imm8) simde_mm_roundscale_sd(a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_92035) - #define simde_mm_mask_roundscale_sd(src, k, a, b, imm8) _mm_mask_roundscale_sd((src), (k), (a), (b), (imm8)) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128d - simde_mm_mask_roundscale_sd_internal_ (simde__m128d a, simde__m128d b, simde__mmask8 k) { - simde__m128d r; - - if(k & 1) - r = a; - else - r = b; - - return r; - } - #define simde_mm_mask_roundscale_sd(src, k, a, b, imm8) \ - simde_mm_mask_roundscale_sd_internal_( \ - simde_mm_roundscale_sd( \ - a, \ - b, \ - imm8 \ - ), \ - simde_mm_move_sd( \ - (a), \ - (src) \ - ), \ - (k) \ - ) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_roundscale_sd - #define _mm_mask_roundscale_sd(src, k, a, b, imm8) simde_mm_mask_roundscale_sd(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_92035) - #define simde_mm_maskz_roundscale_sd(k, a, b, imm8) _mm_maskz_roundscale_sd((k), (a), (b), (imm8)) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m128d - simde_mm_maskz_roundscale_sd_internal_ (simde__m128d a, simde__m128d b, simde__mmask8 k) { - simde__m128d r; - - if(k & 1) - r = a; - else - r = b; - - return r; - } - #define simde_mm_maskz_roundscale_sd(k, a, b, imm8) \ - simde_mm_maskz_roundscale_sd_internal_( \ - simde_mm_roundscale_sd( \ - a, \ - b, \ - imm8 \ - ), \ - simde_mm_move_sd( \ - (a), \ - simde_mm_setzero_pd() \ - ), \ - (k) \ - ) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_roundscale_sd - #define _mm_maskz_roundscale_sd(k, a, b, imm8) simde_mm_maskz_roundscale_sd(k, a, b, imm8) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_ROUNDSCALE_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/roundscale_round.h b/ffi-deps/simde/simde/x86/avx512/roundscale_round.h deleted file mode 100644 index f941e48..0000000 --- a/ffi-deps/simde/simde/x86/avx512/roundscale_round.h +++ /dev/null @@ -1,690 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_ROUNDSCALE_ROUND_H) -#define SIMDE_X86_AVX512_ROUNDSCALE_ROUND_H - -#include "types.h" -#include "roundscale.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -#if defined(HEDLEY_MSVC_VERSION) -#pragma warning( push ) -#pragma warning( disable : 4244 ) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_roundscale_round_ps(a, imm8, sae) _mm512_roundscale_round_ps(a, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm512_roundscale_round_ps(a, imm8, sae) simde_mm512_roundscale_ps(a, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm512_roundscale_round_ps(a,imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512 simde_mm512_roundscale_round_ps_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm512_roundscale_round_ps_envp; \ - int 
simde_mm512_roundscale_round_ps_x = feholdexcept(&simde_mm512_roundscale_round_ps_envp); \ - simde_mm512_roundscale_round_ps_r = simde_mm512_roundscale_ps(a, imm8); \ - if (HEDLEY_LIKELY(simde_mm512_roundscale_round_ps_x == 0)) \ - fesetenv(&simde_mm512_roundscale_round_ps_envp); \ - } \ - else { \ - simde_mm512_roundscale_round_ps_r = simde_mm512_roundscale_ps(a, imm8); \ - } \ - \ - simde_mm512_roundscale_round_ps_r; \ - })) - #else - #define simde_mm512_roundscale_round_ps(a, imm8, sae) simde_mm512_roundscale_ps(a, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512 - simde_mm512_roundscale_round_ps (simde__m512 a, int imm8, int sae) - SIMDE_REQUIRE_RANGE(imm8, 0, 15) { - simde__m512 r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm512_roundscale_ps(a, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm512_roundscale_ps(a, imm8); - #endif - } - else { - r = simde_mm512_roundscale_ps(a, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_roundscale_round_ps - #define _mm512_roundscale_round_ps(a, imm8, sae) simde_mm512_roundscale_round_ps(a, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_92035) - #define simde_mm512_mask_roundscale_round_ps(src, k, a, imm8, sae) _mm512_mask_roundscale_round_ps(src, k, a, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm512_mask_roundscale_round_ps(src, k, a, imm8, sae) simde_mm512_mask_roundscale_ps(src, k, a, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm512_mask_roundscale_round_ps(src, k, a, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512 simde_mm512_mask_roundscale_round_ps_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm512_mask_roundscale_round_ps_envp; \ - int simde_mm512_mask_roundscale_round_ps_x = feholdexcept(&simde_mm512_mask_roundscale_round_ps_envp); \ - simde_mm512_mask_roundscale_round_ps_r = simde_mm512_mask_roundscale_ps(src, k, a, imm8); \ - if (HEDLEY_LIKELY(simde_mm512_mask_roundscale_round_ps_x == 0)) \ - fesetenv(&simde_mm512_mask_roundscale_round_ps_envp); \ - } \ - else { \ - simde_mm512_mask_roundscale_round_ps_r = simde_mm512_mask_roundscale_ps(src, k, a, imm8); \ - } \ - \ - simde_mm512_mask_roundscale_round_ps_r; \ - })) - #else - #define simde_mm512_mask_roundscale_round_ps(src, k, a, imm8, sae) simde_mm512_mask_roundscale_ps(src, k, a, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512 - simde_mm512_mask_roundscale_round_ps (simde__m512 src, simde__mmask8 k, simde__m512 a, int imm8, int sae) - SIMDE_REQUIRE_RANGE(imm8, 0, 15) { - simde__m512 r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm512_mask_roundscale_ps(src, k, a, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm512_mask_roundscale_ps(src, k, a, imm8); - #endif - } - else { - r = simde_mm512_mask_roundscale_ps(src, k, a, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_roundscale_round_ps - #define _mm512_mask_roundscale_round_ps(src, k, a, imm8, sae) simde_mm512_mask_roundscale_round_ps(src, k, a, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_92035) - #define simde_mm512_maskz_roundscale_round_ps(k, a, imm8, sae) 
_mm512_maskz_roundscale_round_ps(k, a, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm512_maskz_roundscale_round_ps(k, a, imm8, sae) simde_mm512_maskz_roundscale_ps(k, a, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm512_maskz_roundscale_round_ps(k, a, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512 simde_mm512_maskz_roundscale_round_ps_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm512_maskz_roundscale_round_ps_envp; \ - int simde_mm512_maskz_roundscale_round_ps_x = feholdexcept(&simde_mm512_maskz_roundscale_round_ps_envp); \ - simde_mm512_maskz_roundscale_round_ps_r = simde_mm512_maskz_roundscale_ps(k, a, imm8); \ - if (HEDLEY_LIKELY(simde_mm512_maskz_roundscale_round_ps_x == 0)) \ - fesetenv(&simde_mm512_maskz_roundscale_round_ps_envp); \ - } \ - else { \ - simde_mm512_maskz_roundscale_round_ps_r = simde_mm512_maskz_roundscale_ps(k, a, imm8); \ - } \ - \ - simde_mm512_maskz_roundscale_round_ps_r; \ - })) - #else - #define simde_mm512_maskz_roundscale_round_ps(src, k, a, imm8, sae) simde_mm512_maskz_roundscale_ps(k, a, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512 - simde_mm512_maskz_roundscale_round_ps (simde__mmask8 k, simde__m512 a, int imm8, int sae) - SIMDE_REQUIRE_RANGE(imm8, 0, 15) { - simde__m512 r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm512_maskz_roundscale_ps(k, a, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm512_maskz_roundscale_ps(k, a, imm8); - #endif - } - else { - r = simde_mm512_maskz_roundscale_ps(k, a, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_roundscale_round_ps - #define _mm512_maskz_roundscale_round_ps(k, a, imm8, sae) simde_mm512_maskz_roundscale_round_ps(k, a, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_roundscale_round_pd(a, imm8, sae) _mm512_roundscale_round_pd(a, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm512_roundscale_round_pd(a, imm8, sae) simde_mm512_roundscale_pd(a, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm512_roundscale_round_pd(a, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512d simde_mm512_roundscale_round_pd_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm512_roundscale_round_pd_envp; \ - int simde_mm512_roundscale_round_pd_x = feholdexcept(&simde_mm512_roundscale_round_pd_envp); \ - simde_mm512_roundscale_round_pd_r = simde_mm512_roundscale_pd(a, imm8); \ - if (HEDLEY_LIKELY(simde_mm512_roundscale_round_pd_x == 0)) \ - fesetenv(&simde_mm512_roundscale_round_pd_envp); \ - } \ - else { \ - simde_mm512_roundscale_round_pd_r = simde_mm512_roundscale_pd(a, imm8); \ - } \ - \ - simde_mm512_roundscale_round_pd_r; \ - })) - #else - #define simde_mm512_roundscale_round_pd(a, imm8, sae) simde_mm512_roundscale_pd(a, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512d - simde_mm512_roundscale_round_pd (simde__m512d a, int imm8, int sae) - SIMDE_REQUIRE_RANGE(imm8, 0, 15) { - simde__m512d r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm512_roundscale_pd(a, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm512_roundscale_pd(a, imm8); - #endif - } - else { - r = simde_mm512_roundscale_pd(a, imm8); - } - - 
return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_roundscale_round_pd - #define _mm512_roundscale_round_pd(a, imm8, sae) simde_mm512_roundscale_round_pd(a, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_92035) - #define simde_mm512_mask_roundscale_round_pd(src, k, a, imm8, sae) _mm512_mask_roundscale_round_pd(src, k, a, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm512_mask_roundscale_round_pd(src, k, a, imm8, sae) simde_mm512_mask_roundscale_pd(src, k, a, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm512_mask_roundscale_round_pd(src, k, a, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512d simde_mm512_mask_roundscale_round_pd_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm512_mask_roundscale_round_pd_envp; \ - int simde_mm512_mask_roundscale_round_pd_x = feholdexcept(&simde_mm512_mask_roundscale_round_pd_envp); \ - simde_mm512_mask_roundscale_round_pd_r = simde_mm512_mask_roundscale_pd(src, k, a, imm8); \ - if (HEDLEY_LIKELY(simde_mm512_mask_roundscale_round_pd_x == 0)) \ - fesetenv(&simde_mm512_mask_roundscale_round_pd_envp); \ - } \ - else { \ - simde_mm512_mask_roundscale_round_pd_r = simde_mm512_mask_roundscale_pd(src, k, a, imm8); \ - } \ - \ - simde_mm512_mask_roundscale_round_pd_r; \ - })) - #else - #define simde_mm512_mask_roundscale_round_pd(src, k, a, imm8, sae) simde_mm512_mask_roundscale_pd(src, k, a, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512d - simde_mm512_mask_roundscale_round_pd (simde__m512d src, simde__mmask8 k, simde__m512d a, int imm8, int sae) - SIMDE_REQUIRE_RANGE(imm8, 0, 15) { - simde__m512d r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm512_mask_roundscale_pd(src, k, a, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm512_mask_roundscale_pd(src, k, a, imm8); - #endif - } - else { - r = simde_mm512_mask_roundscale_pd(src, k, a, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_roundscale_round_pd - #define _mm512_mask_roundscale_round_pd(src, k, a, imm8, sae) simde_mm512_mask_roundscale_round_pd(src, k, a, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_92035) - #define simde_mm512_maskz_roundscale_round_pd(k, a, imm8, sae) _mm512_maskz_roundscale_round_pd(k, a, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm512_maskz_roundscale_round_pd(k, a, imm8, sae) simde_mm512_maskz_roundscale_pd(k, a, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm512_maskz_roundscale_round_pd(k, a, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512d simde_mm512_maskz_roundscale_round_pd_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm512_maskz_roundscale_round_pd_envp; \ - int simde_mm512_maskz_roundscale_round_pd_x = feholdexcept(&simde_mm512_maskz_roundscale_round_pd_envp); \ - simde_mm512_maskz_roundscale_round_pd_r = simde_mm512_maskz_roundscale_pd(k, a, imm8); \ - if (HEDLEY_LIKELY(simde_mm512_maskz_roundscale_round_pd_x == 0)) \ - fesetenv(&simde_mm512_maskz_roundscale_round_pd_envp); \ - } \ - else { \ - simde_mm512_maskz_roundscale_round_pd_r = simde_mm512_maskz_roundscale_pd(k, a, imm8); \ - } \ - \ - simde_mm512_maskz_roundscale_round_pd_r; \ - })) - #else - #define 
simde_mm512_maskz_roundscale_round_pd(src, k, a, imm8, sae) simde_mm512_maskz_roundscale_pd(k, a, imm8) - #endif -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512d - simde_mm512_maskz_roundscale_round_pd (simde__mmask8 k, simde__m512d a, int imm8, int sae) - SIMDE_REQUIRE_RANGE(imm8, 0, 15) { - simde__m512d r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm512_maskz_roundscale_pd(k, a, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm512_maskz_roundscale_pd(k, a, imm8); - #endif - } - else { - r = simde_mm512_maskz_roundscale_pd(k, a, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_roundscale_round_pd - #define _mm512_maskz_roundscale_round_pd(k, a, imm8, sae) simde_mm512_maskz_roundscale_round_pd(k, a, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_roundscale_round_ss(a, b, imm8, sae) _mm_roundscale_round_ss(a, b, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_roundscale_round_ss(a, b, imm8, sae) simde_mm_roundscale_ss(a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm_roundscale_round_ss(a, b, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128 simde_mm_roundscale_round_ss_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm_roundscale_round_ss_envp; \ - int simde_mm_roundscale_round_ss_x = feholdexcept(&simde_mm_roundscale_round_ss_envp); \ - simde_mm_roundscale_round_ss_r = simde_mm_roundscale_ss(a, b, imm8); \ - if (HEDLEY_LIKELY(simde_mm_roundscale_round_ss_x == 0)) \ - fesetenv(&simde_mm_roundscale_round_ss_envp); \ - } \ - else { \ - simde_mm_roundscale_round_ss_r = simde_mm_roundscale_ss(a, b, imm8); \ - } \ - \ - simde_mm_roundscale_round_ss_r; \ - })) - #else - #define simde_mm_roundscale_round_ss(a, b, imm8, sae) simde_mm_roundscale_ss(a, b, imm8) - #endif -#elif !(defined(HEDLEY_MSVC_VERSION) && defined(SIMDE_X86_AVX_NATIVE)) - SIMDE_FUNCTION_ATTRIBUTES - simde__m128 - simde_mm_roundscale_round_ss (simde__m128 a, simde__m128 b, const int imm8, const int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m128 r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm_roundscale_ss(a, b, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm_roundscale_ss(a, b, imm8); - #endif - } - else { - r = simde_mm_roundscale_ss(a, b, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_roundscale_round_ss - #define _mm_roundscale_round_ss(a, b, imm8, sae) simde_mm_roundscale_round_ss(a, b, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_92035) - #define simde_mm_mask_roundscale_round_ss(src, k, a, b, imm8, sae) _mm_mask_roundscale_round_ss(src, k, a, b, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_mask_roundscale_round_ss(src, k, a, b, imm8, sae) simde_mm_mask_roundscale_ss(src, k, a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm_mask_roundscale_round_ss(src, k, a, b, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128 simde_mm_mask_roundscale_round_ss_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm_mask_roundscale_round_ss_envp; \ - int simde_mm_mask_roundscale_round_ss_x = 
feholdexcept(&simde_mm_mask_roundscale_round_ss_envp); \ - simde_mm_mask_roundscale_round_ss_r = simde_mm_mask_roundscale_ss(src, k, a, b, imm8); \ - if (HEDLEY_LIKELY(simde_mm_mask_roundscale_round_ss_x == 0)) \ - fesetenv(&simde_mm_mask_roundscale_round_ss_envp); \ - } \ - else { \ - simde_mm_mask_roundscale_round_ss_r = simde_mm_mask_roundscale_ss(src, k, a, b, imm8); \ - } \ - \ - simde_mm_mask_roundscale_round_ss_r; \ - })) - #else - #define simde_mm_mask_roundscale_round_ss(src, k, a, b, imm8, sae) simde_mm_mask_roundscale_ss(src, k, a, b, imm8) - #endif -#elif !(defined(HEDLEY_MSVC_VERSION) && defined(SIMDE_X86_AVX_NATIVE)) - SIMDE_FUNCTION_ATTRIBUTES - simde__m128 - simde_mm_mask_roundscale_round_ss (simde__m128 src, simde__mmask8 k, simde__m128 a, simde__m128 b, const int imm8, const int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m128 r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm_mask_roundscale_ss(src, k, a, b, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm_mask_roundscale_ss(src, k, a, b, imm8); - #endif - } - else { - r = simde_mm_mask_roundscale_ss(src, k, a, b, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_roundscale_round_ss - #define _mm_mask_roundscale_round_ss(src, k, a, b, imm8, sae) simde_mm_mask_roundscale_round_ss(src, k, a, b, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_92035) - #define simde_mm_maskz_roundscale_round_ss(k, a, b, imm8, sae) _mm_maskz_roundscale_round_ss(k, a, b, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_maskz_roundscale_round_ss(k, a, b, imm8, sae) simde_mm_maskz_roundscale_ss(k, a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm_maskz_roundscale_round_ss(k, a, b, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128 simde_mm_maskz_roundscale_round_ss_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm_maskz_roundscale_round_ss_envp; \ - int simde_mm_maskz_roundscale_round_ss_x = feholdexcept(&simde_mm_maskz_roundscale_round_ss_envp); \ - simde_mm_maskz_roundscale_round_ss_r = simde_mm_maskz_roundscale_ss(k, a, b, imm8); \ - if (HEDLEY_LIKELY(simde_mm_maskz_roundscale_round_ss_x == 0)) \ - fesetenv(&simde_mm_maskz_roundscale_round_ss_envp); \ - } \ - else { \ - simde_mm_maskz_roundscale_round_ss_r = simde_mm_maskz_roundscale_ss(k, a, b, imm8); \ - } \ - \ - simde_mm_maskz_roundscale_round_ss_r; \ - })) - #else - #define simde_mm_maskz_roundscale_round_ss(k, a, b, imm8, sae) simde_mm_maskz_roundscale_ss(k, a, b, imm8) - #endif -#elif !(defined(HEDLEY_MSVC_VERSION) && defined(SIMDE_X86_AVX_NATIVE)) - SIMDE_FUNCTION_ATTRIBUTES - simde__m128 - simde_mm_maskz_roundscale_round_ss (simde__mmask8 k, simde__m128 a, simde__m128 b, const int imm8, const int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m128 r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm_maskz_roundscale_ss(k, a, b, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm_maskz_roundscale_ss(k, a, b, imm8); - #endif - } - else { - r = simde_mm_maskz_roundscale_ss(k, a, b, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef 
_mm_maskz_roundscale_round_ss - #define _mm_maskz_roundscale_round_ss(k, a, b, imm8, sae) simde_mm_maskz_roundscale_round_ss(k, a, b, imm8, sae) -#endif - -#if defined(HEDLEY_MSVC_VERSION) -#pragma warning( pop ) -#endif - - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_roundscale_round_sd(a, b, imm8, sae) _mm_roundscale_round_sd(a, b, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_roundscale_round_sd(a, b, imm8, sae) simde_mm_roundscale_sd(a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm_roundscale_round_sd(a, b, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128d simde_mm_roundscale_round_sd_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm_roundscale_round_sd_envp; \ - int simde_mm_roundscale_round_sd_x = feholdexcept(&simde_mm_roundscale_round_sd_envp); \ - simde_mm_roundscale_round_sd_r = simde_mm_roundscale_sd(a, b, imm8); \ - if (HEDLEY_LIKELY(simde_mm_roundscale_round_sd_x == 0)) \ - fesetenv(&simde_mm_roundscale_round_sd_envp); \ - } \ - else { \ - simde_mm_roundscale_round_sd_r = simde_mm_roundscale_sd(a, b, imm8); \ - } \ - \ - simde_mm_roundscale_round_sd_r; \ - })) - #else - #define simde_mm_roundscale_round_sd(a, b, imm8, sae) simde_mm_roundscale_sd(a, b, imm8) - #endif -#elif !(defined(HEDLEY_MSVC_VERSION) && defined(SIMDE_X86_AVX_NATIVE)) - SIMDE_FUNCTION_ATTRIBUTES - simde__m128d - simde_mm_roundscale_round_sd (simde__m128d a, simde__m128d b, const int imm8, const int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m128d r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm_roundscale_sd(a, b, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm_roundscale_sd(a, b, imm8); - #endif - } - else { - r = simde_mm_roundscale_sd(a, b, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_roundscale_round_sd - #define _mm_roundscale_round_sd(a, b, imm8, sae) simde_mm_roundscale_round_sd(a, b, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_92035) - #define simde_mm_mask_roundscale_round_sd(src, k, a, b, imm8, sae) _mm_mask_roundscale_round_sd(src, k, a, b, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_mask_roundscale_round_sd(src, k, a, b, imm8, sae) simde_mm_mask_roundscale_sd(src, k, a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm_mask_roundscale_round_sd(src, k, a, b, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128d simde_mm_mask_roundscale_round_sd_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm_mask_roundscale_round_sd_envp; \ - int simde_mm_mask_roundscale_round_sd_x = feholdexcept(&simde_mm_mask_roundscale_round_sd_envp); \ - simde_mm_mask_roundscale_round_sd_r = simde_mm_mask_roundscale_sd(src, k, a, b, imm8); \ - if (HEDLEY_LIKELY(simde_mm_mask_roundscale_round_sd_x == 0)) \ - fesetenv(&simde_mm_mask_roundscale_round_sd_envp); \ - } \ - else { \ - simde_mm_mask_roundscale_round_sd_r = simde_mm_mask_roundscale_sd(src, k, a, b, imm8); \ - } \ - \ - simde_mm_mask_roundscale_round_sd_r; \ - })) - #else - #define simde_mm_mask_roundscale_round_sd(src, k, a, b, imm8, sae) simde_mm_mask_roundscale_sd(src, k, a, b, imm8) - #endif -#elif !(defined(HEDLEY_MSVC_VERSION) && defined(SIMDE_X86_AVX_NATIVE)) - SIMDE_FUNCTION_ATTRIBUTES - simde__m128d - 
simde_mm_mask_roundscale_round_sd (simde__m128d src, simde__mmask8 k, simde__m128d a, simde__m128d b, const int imm8, const int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m128d r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm_mask_roundscale_sd(src, k, a, b, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm_mask_roundscale_sd(src, k, a, b, imm8); - #endif - } - else { - r = simde_mm_mask_roundscale_sd(src, k, a, b, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_roundscale_round_sd - #define _mm_mask_roundscale_round_sd(src, k, a, b, imm8, sae) simde_mm_mask_roundscale_round_sd(src, k, a, b, imm8, sae) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_92035) - #define simde_mm_maskz_roundscale_round_sd(k, a, b, imm8, sae) _mm_maskz_roundscale_round_sd(k, a, b, imm8, sae) -#elif defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_maskz_roundscale_round_sd(k, a, b, imm8, sae) simde_mm_maskz_roundscale_sd(k, a, b, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) - #if defined(SIMDE_HAVE_FENV_H) - #define simde_mm_maskz_roundscale_round_sd(k, a, b, imm8, sae) SIMDE_STATEMENT_EXPR_(({ \ - simde__m128d simde_mm_maskz_roundscale_round_sd_r; \ - \ - if (sae & SIMDE_MM_FROUND_NO_EXC) { \ - fenv_t simde_mm_maskz_roundscale_round_sd_envp; \ - int simde_mm_maskz_roundscale_round_sd_x = feholdexcept(&simde_mm_maskz_roundscale_round_sd_envp); \ - simde_mm_maskz_roundscale_round_sd_r = simde_mm_maskz_roundscale_sd(k, a, b, imm8); \ - if (HEDLEY_LIKELY(simde_mm_maskz_roundscale_round_sd_x == 0)) \ - fesetenv(&simde_mm_maskz_roundscale_round_sd_envp); \ - } \ - else { \ - simde_mm_maskz_roundscale_round_sd_r = simde_mm_maskz_roundscale_sd(k, a, b, imm8); \ - } \ - \ - simde_mm_maskz_roundscale_round_sd_r; \ - })) - #else - #define simde_mm_maskz_roundscale_round_sd(k, a, b, imm8, sae) simde_mm_maskz_roundscale_sd(k, a, b, imm8) - #endif -#elif !(defined(HEDLEY_MSVC_VERSION) && defined(SIMDE_X86_AVX_NATIVE)) - SIMDE_FUNCTION_ATTRIBUTES - simde__m128d - simde_mm_maskz_roundscale_round_sd (simde__mmask8 k, simde__m128d a, simde__m128d b, const int imm8, const int sae) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) - SIMDE_REQUIRE_CONSTANT(sae) { - simde__m128d r; - - if (sae & SIMDE_MM_FROUND_NO_EXC) { - #if defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = simde_mm_maskz_roundscale_sd(k, a, b, imm8); - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = simde_mm_maskz_roundscale_sd(k, a, b, imm8); - #endif - } - else { - r = simde_mm_maskz_roundscale_sd(k, a, b, imm8); - } - - return r; - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_roundscale_round_sd - #define _mm_maskz_roundscale_round_sd(k, a, b, imm8, sae) simde_mm_maskz_roundscale_round_sd(k, a, b, imm8, sae) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_ROUNDSCALE_ROUND_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/sad.h b/ffi-deps/simde/simde/x86/avx512/sad.h deleted file mode 100644 index 60623e6..0000000 --- a/ffi-deps/simde/simde/x86/avx512/sad.h +++ /dev/null @@ -1,77 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the 
Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_SAD_H) -#define SIMDE_X86_AVX512_SAD_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_sad_epu8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_sad_epu8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_sad_epu8(a_.m256i[i], b_.m256i[i]); - } - #else - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - uint16_t tmp = 0; - SIMDE_VECTORIZE_REDUCTION(+:tmp) - for (size_t j = 0 ; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 8) ; j++) { - const size_t e = j + (i * 8); - tmp += (a_.u8[e] > b_.u8[e]) ? 
(a_.u8[e] - b_.u8[e]) : (b_.u8[e] - a_.u8[e]); - } - r_.i64[i] = tmp; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_sad_epu8 - #define _mm512_sad_epu8(a, b) simde_mm512_sad_epu8(a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SAD_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/scalef.h b/ffi-deps/simde/simde/x86/avx512/scalef.h deleted file mode 100644 index 1167331..0000000 --- a/ffi-deps/simde/simde/x86/avx512/scalef.h +++ /dev/null @@ -1,389 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_SCALEF_H) -#define SIMDE_X86_AVX512_SCALEF_H - -#include "types.h" -#include "flushsubnormal.h" -#include "../svml.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_scalef_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_scalef_ps(a, b); - #else - return simde_mm_mul_ps(simde_x_mm_flushsubnormal_ps(a), simde_mm_exp2_ps(simde_mm_floor_ps(simde_x_mm_flushsubnormal_ps(b)))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_scalef_ps - #define _mm_scalef_ps(a, b) simde_mm_scalef_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mask_scalef_ps (simde__m128 src, simde__mmask8 k, simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_scalef_ps(src, k, a, b); - #else - return simde_mm_mask_mov_ps(src, k, simde_mm_scalef_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_scalef_ps - #define _mm_mask_scalef_ps(src, k, a, b) simde_mm_mask_scalef_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_maskz_scalef_ps (simde__mmask8 k, simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_scalef_ps(k, a, b); - #else - return simde_mm_maskz_mov_ps(k, simde_mm_scalef_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_scalef_ps - #define _mm_maskz_scalef_ps(k, a, b) simde_mm_maskz_scalef_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_scalef_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_scalef_ps(a, b); - #else - return simde_mm256_mul_ps(simde_x_mm256_flushsubnormal_ps(a), simde_mm256_exp2_ps(simde_mm256_floor_ps(simde_x_mm256_flushsubnormal_ps(b)))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_scalef_ps - #define _mm256_scalef_ps(a, b) simde_mm256_scalef_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_mask_scalef_ps (simde__m256 src, simde__mmask8 k, simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_scalef_ps(src, k, a, b); - #else - return simde_mm256_mask_mov_ps(src, k, simde_mm256_scalef_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_scalef_ps - #define 
_mm256_mask_scalef_ps(src, k, a, b) simde_mm256_mask_scalef_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_maskz_scalef_ps (simde__mmask8 k, simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_scalef_ps(k, a, b); - #else - return simde_mm256_maskz_mov_ps(k, simde_mm256_scalef_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_scalef_ps - #define _mm256_maskz_scalef_ps(k, a, b) simde_mm256_maskz_scalef_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_scalef_ps (simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_scalef_ps(a, b); - #else - return simde_mm512_mul_ps(simde_x_mm512_flushsubnormal_ps(a), simde_mm512_exp2_ps(simde_mm512_floor_ps(simde_x_mm512_flushsubnormal_ps(b)))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_scalef_ps - #define _mm512_scalef_ps(a, b) simde_mm512_scalef_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_scalef_ps (simde__m512 src, simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_scalef_ps(src, k, a, b); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_scalef_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_scalef_ps - #define _mm512_mask_scalef_ps(src, k, a, b) simde_mm512_mask_scalef_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_scalef_ps (simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_scalef_ps(k, a, b); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_scalef_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_scalef_ps - #define _mm512_maskz_scalef_ps(k, a, b) simde_mm512_maskz_scalef_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_scalef_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_scalef_pd(a, b); - #else - return simde_mm_mul_pd(simde_x_mm_flushsubnormal_pd(a), simde_mm_exp2_pd(simde_mm_floor_pd(simde_x_mm_flushsubnormal_pd(b)))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_scalef_pd - #define _mm_scalef_pd(a, b) simde_mm_scalef_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mask_scalef_pd (simde__m128d src, simde__mmask8 k, simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_scalef_pd(src, k, a, b); - #else - return simde_mm_mask_mov_pd(src, k, simde_mm_scalef_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_scalef_pd - #define _mm_mask_scalef_pd(src, k, a, b) simde_mm_mask_scalef_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_maskz_scalef_pd (simde__mmask8 k, simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_scalef_pd(k, a, b); - #else - return simde_mm_maskz_mov_pd(k, simde_mm_scalef_pd(a, b)); - #endif -} -#if 
defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_scalef_pd - #define _mm_maskz_scalef_pd(k, a, b) simde_mm_maskz_scalef_pd(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_scalef_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_scalef_pd(a, b); - #else - return simde_mm256_mul_pd(simde_x_mm256_flushsubnormal_pd(a), simde_mm256_exp2_pd(simde_mm256_floor_pd(simde_x_mm256_flushsubnormal_pd(b)))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_scalef_pd - #define _mm256_scalef_pd(a, b) simde_mm256_scalef_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_mask_scalef_pd (simde__m256d src, simde__mmask8 k, simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_scalef_pd(src, k, a, b); - #else - return simde_mm256_mask_mov_pd(src, k, simde_mm256_scalef_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_scalef_pd - #define _mm256_mask_scalef_pd(src, k, a, b) simde_mm256_mask_scalef_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_maskz_scalef_pd (simde__mmask8 k, simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_scalef_pd(k, a, b); - #else - return simde_mm256_maskz_mov_pd(k, simde_mm256_scalef_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_scalef_pd - #define _mm256_maskz_scalef_pd(k, a, b) simde_mm256_maskz_scalef_pd(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_scalef_pd (simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_scalef_pd(a, b); - #else - return simde_mm512_mul_pd(simde_x_mm512_flushsubnormal_pd(a), simde_mm512_exp2_pd(simde_mm512_floor_pd(simde_x_mm512_flushsubnormal_pd(b)))); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_scalef_pd - #define _mm512_scalef_pd(a, b) simde_mm512_scalef_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_scalef_pd (simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_scalef_pd(src, k, a, b); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_scalef_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_scalef_pd - #define _mm512_mask_scalef_pd(src, k, a, b) simde_mm512_mask_scalef_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_scalef_pd (simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_scalef_pd(k, a, b); - #else - return simde_mm512_maskz_mov_pd(k, simde_mm512_scalef_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_scalef_pd - #define _mm512_maskz_scalef_pd(k, a, b) simde_mm512_maskz_scalef_pd(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_scalef_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm_scalef_ss(a, b); - 
#else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - a_.f32[0] = (simde_math_issubnormalf(a_.f32[0]) ? 0 : a_.f32[0]) * simde_math_exp2f(simde_math_floorf((simde_math_issubnormalf(b_.f32[0]) ? 0 : b_.f32[0]))); - - return simde__m128_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_scalef_ss - #define _mm_scalef_ss(a, b) simde_mm_scalef_ss(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mask_scalef_ss (simde__m128 src, simde__mmask8 k, simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(HEDLEY_GCC_VERSION) - return _mm_mask_scalef_round_ss(src, k, a, b, _MM_FROUND_CUR_DIRECTION); - #else - simde__m128_private - src_ = simde__m128_to_private(src), - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - a_.f32[0] = ((k & 1) ? ((simde_math_issubnormalf(a_.f32[0]) ? 0 : a_.f32[0]) * simde_math_exp2f(simde_math_floorf((simde_math_issubnormalf(b_.f32[0]) ? 0 : b_.f32[0])))) : src_.f32[0]); - - return simde__m128_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_scalef_ss - #define _mm_mask_scalef_ss(src, k, a, b) simde_mm_mask_scalef_ss(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_maskz_scalef_ss (simde__mmask8 k, simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_GCC_105339) - return _mm_maskz_scalef_ss(k, a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - a_.f32[0] = ((k & 1) ? ((simde_math_issubnormalf(a_.f32[0]) ? 0 : a_.f32[0]) * simde_math_exp2f(simde_math_floorf((simde_math_issubnormalf(b_.f32[0]) ? 0 : b_.f32[0])))) : SIMDE_FLOAT32_C(0.0)); - - return simde__m128_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_scalef_ss - #define _mm_maskz_scalef_ss(k, a, b) simde_mm_maskz_scalef_ss(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_scalef_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm_scalef_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - a_.f64[0] = (simde_math_issubnormal(a_.f64[0]) ? 0 : a_.f64[0]) * simde_math_exp2(simde_math_floor((simde_math_issubnormal(b_.f64[0]) ? 0 : b_.f64[0]))); - - return simde__m128d_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_scalef_sd - #define _mm_scalef_sd(a, b) simde_mm_scalef_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mask_scalef_sd (simde__m128d src, simde__mmask8 k, simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_GCC_105339) - return _mm_mask_scalef_sd(src, k, a, b); - #else - simde__m128d_private - src_ = simde__m128d_to_private(src), - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - a_.f64[0] = ((k & 1) ? ((simde_math_issubnormal(a_.f64[0]) ? 0 : a_.f64[0]) * simde_math_exp2(simde_math_floor((simde_math_issubnormal(b_.f64[0]) ? 
0 : b_.f64[0])))) : src_.f64[0]); - - return simde__m128d_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_scalef_sd - #define _mm_mask_scalef_sd(src, k, a, b) simde_mm_mask_scalef_sd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_maskz_scalef_sd (simde__mmask8 k, simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_GCC_105339) - return _mm_maskz_scalef_sd(k, a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - a_.f64[0] = ((k & 1) ? ((simde_math_issubnormal(a_.f64[0]) ? 0 : a_.f64[0]) * simde_math_exp2(simde_math_floor(simde_math_issubnormal(b_.f64[0]) ? 0 : b_.f64[0]))) : SIMDE_FLOAT64_C(0.0)); - - return simde__m128d_from_private(a_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_scalef_sd - #define _mm_maskz_scalef_sd(k, a, b) simde_mm_maskz_scalef_sd(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SCALEF_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/set.h b/ffi-deps/simde/simde/x86/avx512/set.h deleted file mode 100644 index d87a72c..0000000 --- a/ffi-deps/simde/simde/x86/avx512/set.h +++ /dev/null @@ -1,572 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - */ - -#if !defined(SIMDE_X86_AVX512_SET_H) -#define SIMDE_X86_AVX512_SET_H - -#include "types.h" -#include "load.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_set_epi16 (int16_t e31, int16_t e30, int16_t e29, int16_t e28, int16_t e27, int16_t e26, int16_t e25, int16_t e24, - int16_t e23, int16_t e22, int16_t e21, int16_t e20, int16_t e19, int16_t e18, int16_t e17, int16_t e16, - int16_t e15, int16_t e14, int16_t e13, int16_t e12, int16_t e11, int16_t e10, int16_t e9, int16_t e8, - int16_t e7, int16_t e6, int16_t e5, int16_t e4, int16_t e3, int16_t e2, int16_t e1, int16_t e0) { - simde__m512i_private r_; - - r_.i16[ 0] = e0; - r_.i16[ 1] = e1; - r_.i16[ 2] = e2; - r_.i16[ 3] = e3; - r_.i16[ 4] = e4; - r_.i16[ 5] = e5; - r_.i16[ 6] = e6; - r_.i16[ 7] = e7; - r_.i16[ 8] = e8; - r_.i16[ 9] = e9; - r_.i16[10] = e10; - r_.i16[11] = e11; - r_.i16[12] = e12; - r_.i16[13] = e13; - r_.i16[14] = e14; - r_.i16[15] = e15; - r_.i16[16] = e16; - r_.i16[17] = e17; - r_.i16[18] = e18; - r_.i16[19] = e19; - r_.i16[20] = e20; - r_.i16[21] = e21; - r_.i16[22] = e22; - r_.i16[23] = e23; - r_.i16[24] = e24; - r_.i16[25] = e25; - r_.i16[26] = e26; - r_.i16[27] = e27; - r_.i16[28] = e28; - r_.i16[29] = e29; - r_.i16[30] = e30; - r_.i16[31] = e31; - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_set_epi16 - #define _mm512_set_epi16(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_set_epi16(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_set_epi32 (int32_t e15, int32_t e14, int32_t e13, int32_t e12, int32_t e11, int32_t e10, int32_t e9, int32_t e8, - int32_t e7, int32_t e6, int32_t e5, int32_t e4, int32_t e3, int32_t e2, int32_t e1, int32_t e0) { - simde__m512i_private r_; - - r_.i32[ 0] = e0; - r_.i32[ 1] = e1; - r_.i32[ 2] = e2; - r_.i32[ 3] = e3; - r_.i32[ 4] = e4; - r_.i32[ 5] = e5; - r_.i32[ 6] = e6; - r_.i32[ 7] = e7; - r_.i32[ 8] = e8; - r_.i32[ 9] = e9; - r_.i32[10] = e10; - r_.i32[11] = e11; - r_.i32[12] = e12; - r_.i32[13] = e13; - r_.i32[14] = e14; - r_.i32[15] = e15; - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_set_epi32 - #define _mm512_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_set_epi64 (int64_t e7, int64_t e6, int64_t e5, int64_t e4, int64_t e3, int64_t e2, int64_t e1, int64_t e0) { - simde__m512i_private r_; - - r_.i64[0] = e0; - r_.i64[1] = e1; - r_.i64[2] = e2; - r_.i64[3] = e3; - r_.i64[4] = e4; - r_.i64[5] = e5; - r_.i64[6] = e6; - r_.i64[7] = e7; - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_set_epi64 - #define _mm512_set_epi64(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_set_epi64(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_set_epu8 (uint8_t e63, uint8_t e62, uint8_t e61, uint8_t e60, uint8_t e59, uint8_t e58, uint8_t e57, uint8_t e56, - 
uint8_t e55, uint8_t e54, uint8_t e53, uint8_t e52, uint8_t e51, uint8_t e50, uint8_t e49, uint8_t e48, - uint8_t e47, uint8_t e46, uint8_t e45, uint8_t e44, uint8_t e43, uint8_t e42, uint8_t e41, uint8_t e40, - uint8_t e39, uint8_t e38, uint8_t e37, uint8_t e36, uint8_t e35, uint8_t e34, uint8_t e33, uint8_t e32, - uint8_t e31, uint8_t e30, uint8_t e29, uint8_t e28, uint8_t e27, uint8_t e26, uint8_t e25, uint8_t e24, - uint8_t e23, uint8_t e22, uint8_t e21, uint8_t e20, uint8_t e19, uint8_t e18, uint8_t e17, uint8_t e16, - uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12, uint8_t e11, uint8_t e10, uint8_t e9, uint8_t e8, - uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0) { - simde__m512i_private r_; - - r_.u8[ 0] = e0; - r_.u8[ 1] = e1; - r_.u8[ 2] = e2; - r_.u8[ 3] = e3; - r_.u8[ 4] = e4; - r_.u8[ 5] = e5; - r_.u8[ 6] = e6; - r_.u8[ 7] = e7; - r_.u8[ 8] = e8; - r_.u8[ 9] = e9; - r_.u8[10] = e10; - r_.u8[11] = e11; - r_.u8[12] = e12; - r_.u8[13] = e13; - r_.u8[14] = e14; - r_.u8[15] = e15; - r_.u8[16] = e16; - r_.u8[17] = e17; - r_.u8[18] = e18; - r_.u8[19] = e19; - r_.u8[20] = e20; - r_.u8[21] = e21; - r_.u8[22] = e22; - r_.u8[23] = e23; - r_.u8[24] = e24; - r_.u8[25] = e25; - r_.u8[26] = e26; - r_.u8[27] = e27; - r_.u8[28] = e28; - r_.u8[29] = e29; - r_.u8[30] = e30; - r_.u8[31] = e31; - r_.u8[32] = e32; - r_.u8[33] = e33; - r_.u8[34] = e34; - r_.u8[35] = e35; - r_.u8[36] = e36; - r_.u8[37] = e37; - r_.u8[38] = e38; - r_.u8[39] = e39; - r_.u8[40] = e40; - r_.u8[41] = e41; - r_.u8[42] = e42; - r_.u8[43] = e43; - r_.u8[44] = e44; - r_.u8[45] = e45; - r_.u8[46] = e46; - r_.u8[47] = e47; - r_.u8[48] = e48; - r_.u8[49] = e49; - r_.u8[50] = e50; - r_.u8[51] = e51; - r_.u8[52] = e52; - r_.u8[53] = e53; - r_.u8[54] = e54; - r_.u8[55] = e55; - r_.u8[56] = e56; - r_.u8[57] = e57; - r_.u8[58] = e58; - r_.u8[59] = e59; - r_.u8[60] = e60; - r_.u8[61] = e61; - r_.u8[62] = e62; - r_.u8[63] = e63; - - return simde__m512i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_set_epu16 (uint16_t e31, uint16_t e30, uint16_t e29, uint16_t e28, uint16_t e27, uint16_t e26, uint16_t e25, uint16_t e24, - uint16_t e23, uint16_t e22, uint16_t e21, uint16_t e20, uint16_t e19, uint16_t e18, uint16_t e17, uint16_t e16, - uint16_t e15, uint16_t e14, uint16_t e13, uint16_t e12, uint16_t e11, uint16_t e10, uint16_t e9, uint16_t e8, - uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4, uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) { - simde__m512i_private r_; - - r_.u16[ 0] = e0; - r_.u16[ 1] = e1; - r_.u16[ 2] = e2; - r_.u16[ 3] = e3; - r_.u16[ 4] = e4; - r_.u16[ 5] = e5; - r_.u16[ 6] = e6; - r_.u16[ 7] = e7; - r_.u16[ 8] = e8; - r_.u16[ 9] = e9; - r_.u16[10] = e10; - r_.u16[11] = e11; - r_.u16[12] = e12; - r_.u16[13] = e13; - r_.u16[14] = e14; - r_.u16[15] = e15; - r_.u16[16] = e16; - r_.u16[17] = e17; - r_.u16[18] = e18; - r_.u16[19] = e19; - r_.u16[20] = e20; - r_.u16[21] = e21; - r_.u16[22] = e22; - r_.u16[23] = e23; - r_.u16[24] = e24; - r_.u16[25] = e25; - r_.u16[26] = e26; - r_.u16[27] = e27; - r_.u16[28] = e28; - r_.u16[29] = e29; - r_.u16[30] = e30; - r_.u16[31] = e31; - - return simde__m512i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_set_epu32 (uint32_t e15, uint32_t e14, uint32_t e13, uint32_t e12, uint32_t e11, uint32_t e10, uint32_t e9, uint32_t e8, - uint32_t e7, uint32_t e6, uint32_t e5, uint32_t e4, uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) { - simde__m512i_private r_; - - 
r_.u32[ 0] = e0; - r_.u32[ 1] = e1; - r_.u32[ 2] = e2; - r_.u32[ 3] = e3; - r_.u32[ 4] = e4; - r_.u32[ 5] = e5; - r_.u32[ 6] = e6; - r_.u32[ 7] = e7; - r_.u32[ 8] = e8; - r_.u32[ 9] = e9; - r_.u32[10] = e10; - r_.u32[11] = e11; - r_.u32[12] = e12; - r_.u32[13] = e13; - r_.u32[14] = e14; - r_.u32[15] = e15; - - return simde__m512i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_set_epu64 (uint64_t e7, uint64_t e6, uint64_t e5, uint64_t e4, uint64_t e3, uint64_t e2, uint64_t e1, uint64_t e0) { - simde__m512i_private r_; - - r_.u64[ 0] = e0; - r_.u64[ 1] = e1; - r_.u64[ 2] = e2; - r_.u64[ 3] = e3; - r_.u64[ 4] = e4; - r_.u64[ 5] = e5; - r_.u64[ 6] = e6; - r_.u64[ 7] = e7; - - return simde__m512i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_set_epi8 (int8_t e63, int8_t e62, int8_t e61, int8_t e60, int8_t e59, int8_t e58, int8_t e57, int8_t e56, - int8_t e55, int8_t e54, int8_t e53, int8_t e52, int8_t e51, int8_t e50, int8_t e49, int8_t e48, - int8_t e47, int8_t e46, int8_t e45, int8_t e44, int8_t e43, int8_t e42, int8_t e41, int8_t e40, - int8_t e39, int8_t e38, int8_t e37, int8_t e36, int8_t e35, int8_t e34, int8_t e33, int8_t e32, - int8_t e31, int8_t e30, int8_t e29, int8_t e28, int8_t e27, int8_t e26, int8_t e25, int8_t e24, - int8_t e23, int8_t e22, int8_t e21, int8_t e20, int8_t e19, int8_t e18, int8_t e17, int8_t e16, - int8_t e15, int8_t e14, int8_t e13, int8_t e12, int8_t e11, int8_t e10, int8_t e9, int8_t e8, - int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && (HEDLEY_GCC_VERSION_CHECK(10,0,0) || SIMDE_DETECT_CLANG_VERSION_CHECK(5,0,0)) - return _mm512_set_epi8( - e63, e62, e61, e60, e59, e58, e57, e56, - e55, e54, e53, e52, e51, e50, e49, e48, - e47, e46, e45, e44, e43, e42, e41, e40, - e39, e38, e37, e36, e35, e34, e33, e32, - e31, e30, e29, e28, e27, e26, e25, e24, - e23, e22, e21, e20, e19, e18, e17, e16, - e15, e14, e13, e12, e11, e10, e9, e8, - e7, e6, e5, e4, e3, e2, e1, e0 - ); - #else - simde__m512i_private r_; - - r_.i8[ 0] = e0; - r_.i8[ 1] = e1; - r_.i8[ 2] = e2; - r_.i8[ 3] = e3; - r_.i8[ 4] = e4; - r_.i8[ 5] = e5; - r_.i8[ 6] = e6; - r_.i8[ 7] = e7; - r_.i8[ 8] = e8; - r_.i8[ 9] = e9; - r_.i8[10] = e10; - r_.i8[11] = e11; - r_.i8[12] = e12; - r_.i8[13] = e13; - r_.i8[14] = e14; - r_.i8[15] = e15; - r_.i8[16] = e16; - r_.i8[17] = e17; - r_.i8[18] = e18; - r_.i8[19] = e19; - r_.i8[20] = e20; - r_.i8[21] = e21; - r_.i8[22] = e22; - r_.i8[23] = e23; - r_.i8[24] = e24; - r_.i8[25] = e25; - r_.i8[26] = e26; - r_.i8[27] = e27; - r_.i8[28] = e28; - r_.i8[29] = e29; - r_.i8[30] = e30; - r_.i8[31] = e31; - r_.i8[32] = e32; - r_.i8[33] = e33; - r_.i8[34] = e34; - r_.i8[35] = e35; - r_.i8[36] = e36; - r_.i8[37] = e37; - r_.i8[38] = e38; - r_.i8[39] = e39; - r_.i8[40] = e40; - r_.i8[41] = e41; - r_.i8[42] = e42; - r_.i8[43] = e43; - r_.i8[44] = e44; - r_.i8[45] = e45; - r_.i8[46] = e46; - r_.i8[47] = e47; - r_.i8[48] = e48; - r_.i8[49] = e49; - r_.i8[50] = e50; - r_.i8[51] = e51; - r_.i8[52] = e52; - r_.i8[53] = e53; - r_.i8[54] = e54; - r_.i8[55] = e55; - r_.i8[56] = e56; - r_.i8[57] = e57; - r_.i8[58] = e58; - r_.i8[59] = e59; - r_.i8[60] = e60; - r_.i8[61] = e61; - r_.i8[62] = e62; - r_.i8[63] = e63; - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_set_epi8 - #define _mm512_set_epi8(e63, e62, e61, e60, e59, e58, e57, e56, e55, e54, e53, e52, e51, e50, e49, e48, 
e47, e46, e45, e44, e43, e42, e41, e40, e39, e38, e37, e36, e35, e34, e33, e32, e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_set_epi8(e63, e62, e61, e60, e59, e58, e57, e56, e55, e54, e53, e52, e51, e50, e49, e48, e47, e46, e45, e44, e43, e42, e41, e40, e39, e38, e37, e36, e35, e34, e33, e32, e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_set_m128i (simde__m128i a, simde__m128i b, simde__m128i c, simde__m128i d) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - SIMDE_ALIGN_TO_64 simde__m128i v[] = { d, c, b, a }; - return simde_mm512_load_si512(HEDLEY_STATIC_CAST(__m512i *, HEDLEY_STATIC_CAST(void *, v))); - #else - simde__m512i_private r_; - - r_.m128i[0] = d; - r_.m128i[1] = c; - r_.m128i[2] = b; - r_.m128i[3] = a; - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_x_mm512_set_m256 (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - SIMDE_ALIGN_TO_64 simde__m256 v[] = { b, a }; - return simde_mm512_load_ps(HEDLEY_STATIC_CAST(__m512 *, HEDLEY_STATIC_CAST(void *, v))); - #else - simde__m512_private r_; - - r_.m256[0] = b; - r_.m256[1] = a; - - return simde__m512_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_set_m256i (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - SIMDE_ALIGN_TO_64 simde__m256i v[] = { b, a }; - return simde_mm512_load_si512(HEDLEY_STATIC_CAST(__m512i *, HEDLEY_STATIC_CAST(void *, v))); - #else - simde__m512i_private r_; - - r_.m256i[0] = b; - r_.m256i[1] = a; - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_x_mm512_set_m256d (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - SIMDE_ALIGN_TO_64 simde__m256d v[] = { b, a }; - return simde_mm512_load_pd(HEDLEY_STATIC_CAST(__m512d *, HEDLEY_STATIC_CAST(void *, v))); - #else - simde__m512d_private r_; - - r_.m256d[0] = b; - r_.m256d[1] = a; - - return simde__m512d_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_set_ps (simde_float32 e15, simde_float32 e14, simde_float32 e13, simde_float32 e12, - simde_float32 e11, simde_float32 e10, simde_float32 e9, simde_float32 e8, - simde_float32 e7, simde_float32 e6, simde_float32 e5, simde_float32 e4, - simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) { - simde__m512_private r_; - - r_.f32[ 0] = e0; - r_.f32[ 1] = e1; - r_.f32[ 2] = e2; - r_.f32[ 3] = e3; - r_.f32[ 4] = e4; - r_.f32[ 5] = e5; - r_.f32[ 6] = e6; - r_.f32[ 7] = e7; - r_.f32[ 8] = e8; - r_.f32[ 9] = e9; - r_.f32[10] = e10; - r_.f32[11] = e11; - r_.f32[12] = e12; - r_.f32[13] = e13; - r_.f32[14] = e14; - r_.f32[15] = e15; - - return simde__m512_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_set_ps - #define _mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_set_pd (simde_float64 e7, simde_float64 e6, simde_float64 e5, simde_float64 e4, simde_float64 e3, simde_float64 e2, simde_float64 e1, simde_float64 e0) { - simde__m512d_private r_; - - 
r_.f64[0] = e0; - r_.f64[1] = e1; - r_.f64[2] = e2; - r_.f64[3] = e3; - r_.f64[4] = e4; - r_.f64[5] = e5; - r_.f64[6] = e6; - r_.f64[7] = e7; - - return simde__m512d_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_set_pd - #define _mm512_set_pd(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_set_pd(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512h -simde_mm512_set_ph (simde_float16 e31, simde_float16 e30, simde_float16 e29, simde_float16 e28, simde_float16 e27, simde_float16 e26, simde_float16 e25, simde_float16 e24, - simde_float16 e23, simde_float16 e22, simde_float16 e21, simde_float16 e20, simde_float16 e19, simde_float16 e18, simde_float16 e17, simde_float16 e16, - simde_float16 e15, simde_float16 e14, simde_float16 e13, simde_float16 e12, simde_float16 e11, simde_float16 e10, simde_float16 e9, simde_float16 e8, - simde_float16 e7, simde_float16 e6, simde_float16 e5, simde_float16 e4, simde_float16 e3, simde_float16 e2, simde_float16 e1, simde_float16 e0) { - simde__m512h_private r_; - - r_.f16[0] = e0; - r_.f16[1] = e1; - r_.f16[2] = e2; - r_.f16[3] = e3; - r_.f16[4] = e4; - r_.f16[5] = e5; - r_.f16[6] = e6; - r_.f16[7] = e7; - r_.f16[8] = e8; - r_.f16[9] = e9; - r_.f16[10] = e10; - r_.f16[11] = e11; - r_.f16[12] = e12; - r_.f16[13] = e13; - r_.f16[14] = e14; - r_.f16[15] = e15; - r_.f16[16] = e16; - r_.f16[17] = e17; - r_.f16[18] = e18; - r_.f16[19] = e19; - r_.f16[20] = e20; - r_.f16[21] = e21; - r_.f16[22] = e22; - r_.f16[23] = e23; - r_.f16[24] = e24; - r_.f16[25] = e25; - r_.f16[26] = e26; - r_.f16[27] = e27; - r_.f16[28] = e28; - r_.f16[29] = e29; - r_.f16[30] = e30; - r_.f16[31] = e31; - - return simde__m512h_from_private(r_); -} -#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) - #undef _mm512_set_ph - #define _mm512_set_ph(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) \ - simde_mm512_set_ph(e31, e30, e29, e28, e27, e26, e25, e24, e23, e22, e21, e20, e19, e18, e17, e16, e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SET_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/set1.h b/ffi-deps/simde/simde/x86/avx512/set1.h deleted file mode 100644 index 33ae841..0000000 --- a/ffi-deps/simde/simde/x86/avx512/set1.h +++ /dev/null @@ -1,352 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - */ - -#if !defined(SIMDE_X86_AVX512_SET1_H) -#define SIMDE_X86_AVX512_SET1_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_set1_epi8 (int8_t a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_set1_epi8(a); - #else - simde__m512i_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_set1_epi8 - #define _mm512_set1_epi8(a) simde_mm512_set1_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_set1_epi8(simde__m512i src, simde__mmask64 k, int8_t a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_set1_epi8(src, k, a); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_set1_epi8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_set1_epi8 - #define _mm512_mask_set1_epi8(src, k, a) simde_mm512_mask_set1_epi8(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_set1_epi8(simde__mmask64 k, int8_t a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_set1_epi8(k, a); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_set1_epi8(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_set1_epi8 - #define _mm512_maskz_set1_epi8(k, a) simde_mm512_maskz_set1_epi8(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_set1_epi16 (int16_t a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_set1_epi16(a); - #else - simde__m512i_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_set1_epi16 - #define _mm512_set1_epi16(a) simde_mm512_set1_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_set1_epi16(simde__m512i src, simde__mmask32 k, int16_t a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_set1_epi16(src, k, a); - #else - return simde_mm512_mask_mov_epi16(src, k, simde_mm512_set1_epi16(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_set1_epi16 - #define _mm512_mask_set1_epi16(src, k, a) simde_mm512_mask_set1_epi16(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_set1_epi16(simde__mmask32 k, int16_t a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_set1_epi16(k, a); - #else - return simde_mm512_maskz_mov_epi16(k, simde_mm512_set1_epi16(a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_set1_epi16 - #define _mm512_maskz_set1_epi16(k, a) simde_mm512_maskz_set1_epi16(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_set1_epi32 (int32_t a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_set1_epi32(a); - #else - simde__m512i_private r_; - - 
SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_set1_epi32 - #define _mm512_set1_epi32(a) simde_mm512_set1_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_set1_epi32(simde__m512i src, simde__mmask16 k, int32_t a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_set1_epi32(src, k, a); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_set1_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_set1_epi32 - #define _mm512_mask_set1_epi32(src, k, a) simde_mm512_mask_set1_epi32(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_set1_epi32(simde__mmask16 k, int32_t a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_set1_epi32(k, a); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_set1_epi32(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_set1_epi32 - #define _mm512_maskz_set1_epi32(k, a) simde_mm512_maskz_set1_epi32(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_set1_epi64 (int64_t a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_set1_epi64(a); - #else - simde__m512i_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_set1_epi64 - #define _mm512_set1_epi64(a) simde_mm512_set1_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_set1_epi64(simde__m512i src, simde__mmask8 k, int64_t a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_set1_epi64(src, k, a); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_set1_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_set1_epi64 - #define _mm512_mask_set1_epi64(src, k, a) simde_mm512_mask_set1_epi64(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_set1_epi64(simde__mmask8 k, int64_t a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_set1_epi64(k, a); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_set1_epi64(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_set1_epi64 - #define _mm512_maskz_set1_epi64(k, a) simde_mm512_maskz_set1_epi64(k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_set1_epu8 (uint8_t a) { - simde__m512i_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = a; - } - - return simde__m512i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_set1_epu16 (uint16_t a) { - simde__m512i_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = a; - } - - return simde__m512i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_set1_epu32 (uint32_t a) { - simde__m512i_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a; - } - - return simde__m512i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_set1_epu64 (uint64_t a) { - simde__m512i_private r_; - 
- SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a; - } - - return simde__m512i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_set1_ps (simde_float32 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_set1_ps(a); - #else - simde__m512_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a; - } - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_set1_ps - #define _mm512_set1_ps(a) simde_mm512_set1_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_set1_pd (simde_float64 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_set1_pd(a); - #else - simde__m512d_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a; - } - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_set1_pd - #define _mm512_set1_pd(a) simde_mm512_set1_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512h -simde_mm512_set1_ph (simde_float16 a) { - #if defined(SIMDE_X86_AVX512FP16_NATIVE) - return _mm512_set1_ph(a); - #else - simde__m512h_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f16) / sizeof(r_.f16[0])) ; i++) { - r_.f16[i] = a; - } - - return simde__m512h_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) - #undef _mm512_set1_ph - #define _mm512_set1_ph(a) simde_mm512_set1_ph(a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SET1_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/set4.h b/ffi-deps/simde/simde/x86/avx512/set4.h deleted file mode 100644 index 379f829..0000000 --- a/ffi-deps/simde/simde/x86/avx512/set4.h +++ /dev/null @@ -1,140 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_SET4_H) -#define SIMDE_X86_AVX512_SET4_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_set4_epi32 (int32_t d, int32_t c, int32_t b, int32_t a) { - simde__m512i_private r_; - - r_.i32[ 0] = a; - r_.i32[ 1] = b; - r_.i32[ 2] = c; - r_.i32[ 3] = d; - r_.i32[ 4] = a; - r_.i32[ 5] = b; - r_.i32[ 6] = c; - r_.i32[ 7] = d; - r_.i32[ 8] = a; - r_.i32[ 9] = b; - r_.i32[10] = c; - r_.i32[11] = d; - r_.i32[12] = a; - r_.i32[13] = b; - r_.i32[14] = c; - r_.i32[15] = d; - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_set4_epi32 - #define _mm512_set4_epi32(d,c,b,a) simde_mm512_set4_epi32(d,c,b,a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_set4_epi64 (int64_t d, int64_t c, int64_t b, int64_t a) { - simde__m512i_private r_; - - r_.i64[0] = a; - r_.i64[1] = b; - r_.i64[2] = c; - r_.i64[3] = d; - r_.i64[4] = a; - r_.i64[5] = b; - r_.i64[6] = c; - r_.i64[7] = d; - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_set4_epi64 - #define _mm512_set4_epi64(d,c,b,a) simde_mm512_set4_epi64(d,c,b,a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_set4_ps (simde_float32 d, simde_float32 c, simde_float32 b, simde_float32 a) { - simde__m512_private r_; - - r_.f32[ 0] = a; - r_.f32[ 1] = b; - r_.f32[ 2] = c; - r_.f32[ 3] = d; - r_.f32[ 4] = a; - r_.f32[ 5] = b; - r_.f32[ 6] = c; - r_.f32[ 7] = d; - r_.f32[ 8] = a; - r_.f32[ 9] = b; - r_.f32[10] = c; - r_.f32[11] = d; - r_.f32[12] = a; - r_.f32[13] = b; - r_.f32[14] = c; - r_.f32[15] = d; - - return simde__m512_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_set4_ps - #define _mm512_set4_ps(d,c,b,a) simde_mm512_set4_ps(d,c,b,a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_set4_pd (simde_float64 d, simde_float64 c, simde_float64 b, simde_float64 a) { - simde__m512d_private r_; - - r_.f64[0] = a; - r_.f64[1] = b; - r_.f64[2] = c; - r_.f64[3] = d; - r_.f64[4] = a; - r_.f64[5] = b; - r_.f64[6] = c; - r_.f64[7] = d; - - return simde__m512d_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_set4_pd - #define _mm512_set4_pd(d,c,b,a) simde_mm512_set4_pd(d,c,b,a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SET4_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/setone.h b/ffi-deps/simde/simde/x86/avx512/setone.h deleted file mode 100644 index df2f6e8..0000000 --- a/ffi-deps/simde/simde/x86/avx512/setone.h +++ /dev/null @@ -1,72 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - */ - -#if !defined(SIMDE_X86_AVX512_SETONE_H) -#define SIMDE_X86_AVX512_SETONE_H - -#include "types.h" -#include "cast.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_setone_si512(void) { - simde__m512i_private r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = ~HEDLEY_STATIC_CAST(int_fast32_t, 0); - } - - return simde__m512i_from_private(r_); -} -#define simde_x_mm512_setone_epi32() simde_x_mm512_setone_si512() - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_x_mm512_setone_ps(void) { - return simde_mm512_castsi512_ps(simde_x_mm512_setone_si512()); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_x_mm512_setone_pd(void) { - return simde_mm512_castsi512_pd(simde_x_mm512_setone_si512()); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512h -simde_x_mm512_setone_ph(void) { - return simde_mm512_castsi512_ph(simde_x_mm512_setone_si512()); -} - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SETONE_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/setr.h b/ffi-deps/simde/simde/x86/avx512/setr.h deleted file mode 100644 index c44bed4..0000000 --- a/ffi-deps/simde/simde/x86/avx512/setr.h +++ /dev/null @@ -1,144 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - */ - -#if !defined(SIMDE_X86_AVX512_SETR_H) -#define SIMDE_X86_AVX512_SETR_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_setr_epi32 (int32_t e15, int32_t e14, int32_t e13, int32_t e12, int32_t e11, int32_t e10, int32_t e9, int32_t e8, - int32_t e7, int32_t e6, int32_t e5, int32_t e4, int32_t e3, int32_t e2, int32_t e1, int32_t e0) { - simde__m512i_private r_; - - r_.i32[ 0] = e15; - r_.i32[ 1] = e14; - r_.i32[ 2] = e13; - r_.i32[ 3] = e12; - r_.i32[ 4] = e11; - r_.i32[ 5] = e10; - r_.i32[ 6] = e9; - r_.i32[ 7] = e8; - r_.i32[ 8] = e7; - r_.i32[ 9] = e6; - r_.i32[10] = e5; - r_.i32[11] = e4; - r_.i32[12] = e3; - r_.i32[13] = e2; - r_.i32[14] = e1; - r_.i32[15] = e0; - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_setr_epi32 - #define _mm512_setr_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_setr_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_setr_epi64 (int64_t e7, int64_t e6, int64_t e5, int64_t e4, int64_t e3, int64_t e2, int64_t e1, int64_t e0) { - simde__m512i_private r_; - - r_.i64[0] = e7; - r_.i64[1] = e6; - r_.i64[2] = e5; - r_.i64[3] = e4; - r_.i64[4] = e3; - r_.i64[5] = e2; - r_.i64[6] = e1; - r_.i64[7] = e0; - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_setr_epi64 - #define _mm512_setr_epi64(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_setr_epi64(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_setr_ps (simde_float32 e15, simde_float32 e14, simde_float32 e13, simde_float32 e12, - simde_float32 e11, simde_float32 e10, simde_float32 e9, simde_float32 e8, - simde_float32 e7, simde_float32 e6, simde_float32 e5, simde_float32 e4, - simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) { - simde__m512_private r_; - - r_.f32[ 0] = e15; - r_.f32[ 1] = e14; - r_.f32[ 2] = e13; - r_.f32[ 3] = e12; - r_.f32[ 4] = e11; - r_.f32[ 5] = e10; - r_.f32[ 6] = e9; - r_.f32[ 7] = e8; - r_.f32[ 8] = e7; - r_.f32[ 9] = e6; - r_.f32[10] = e5; - r_.f32[11] = e4; - r_.f32[12] = e3; - r_.f32[13] = e2; - r_.f32[14] = e1; - r_.f32[15] = e0; - - return simde__m512_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_setr_ps - #define _mm512_setr_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_setr_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_setr_pd (simde_float64 e7, simde_float64 e6, simde_float64 e5, simde_float64 e4, simde_float64 e3, simde_float64 e2, simde_float64 e1, simde_float64 e0) { - simde__m512d_private r_; - - r_.f64[0] = e7; - r_.f64[1] = e6; - r_.f64[2] = e5; - r_.f64[3] = e4; - r_.f64[4] = e3; - r_.f64[5] = e2; - r_.f64[6] = e1; - r_.f64[7] = e0; - - return simde__m512d_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_setr_pd - #define _mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm512_setr_pd(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SETR_H) */ diff --git 
a/ffi-deps/simde/simde/x86/avx512/setr4.h b/ffi-deps/simde/simde/x86/avx512/setr4.h deleted file mode 100644 index 7ee5921..0000000 --- a/ffi-deps/simde/simde/x86/avx512/setr4.h +++ /dev/null @@ -1,140 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - */ - -#if !defined(SIMDE_X86_AVX512_SETR4_H) -#define SIMDE_X86_AVX512_SETR4_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_setr4_epi32 (int32_t d, int32_t c, int32_t b, int32_t a) { - simde__m512i_private r_; - - r_.i32[ 0] = d; - r_.i32[ 1] = c; - r_.i32[ 2] = b; - r_.i32[ 3] = a; - r_.i32[ 4] = d; - r_.i32[ 5] = c; - r_.i32[ 6] = b; - r_.i32[ 7] = a; - r_.i32[ 8] = d; - r_.i32[ 9] = c; - r_.i32[10] = b; - r_.i32[11] = a; - r_.i32[12] = d; - r_.i32[13] = c; - r_.i32[14] = b; - r_.i32[15] = a; - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_setr4_epi32 - #define _mm512_setr4_epi32(d,c,b,a) simde_mm512_setr4_epi32(d,c,b,a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_setr4_epi64 (int64_t d, int64_t c, int64_t b, int64_t a) { - simde__m512i_private r_; - - r_.i64[0] = d; - r_.i64[1] = c; - r_.i64[2] = b; - r_.i64[3] = a; - r_.i64[4] = d; - r_.i64[5] = c; - r_.i64[6] = b; - r_.i64[7] = a; - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_setr4_epi64 - #define _mm512_setr4_epi64(d,c,b,a) simde_mm512_setr4_epi64(d,c,b,a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_setr4_ps (simde_float32 d, simde_float32 c, simde_float32 b, simde_float32 a) { - simde__m512_private r_; - - r_.f32[ 0] = d; - r_.f32[ 1] = c; - r_.f32[ 2] = b; - r_.f32[ 3] = a; - r_.f32[ 4] = d; - r_.f32[ 5] = c; - r_.f32[ 6] = b; - r_.f32[ 7] = a; - r_.f32[ 8] = d; - r_.f32[ 9] = c; - r_.f32[10] = b; - r_.f32[11] = a; - r_.f32[12] = d; - r_.f32[13] = c; - r_.f32[14] = b; - r_.f32[15] = a; - - return simde__m512_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_setr4_ps - #define _mm512_setr4_ps(d,c,b,a) simde_mm512_setr4_ps(d,c,b,a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_setr4_pd (simde_float64 d, simde_float64 c, simde_float64 b, simde_float64 a) { - simde__m512d_private r_; - - r_.f64[0] = d; - 
r_.f64[1] = c; - r_.f64[2] = b; - r_.f64[3] = a; - r_.f64[4] = d; - r_.f64[5] = c; - r_.f64[6] = b; - r_.f64[7] = a; - - return simde__m512d_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_setr4_pd - #define _mm512_setr4_pd(d,c,b,a) simde_mm512_setr4_pd(d,c,b,a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SETR4_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/setzero.h b/ffi-deps/simde/simde/x86/avx512/setzero.h deleted file mode 100644 index c5bfdc4..0000000 --- a/ffi-deps/simde/simde/x86/avx512/setzero.h +++ /dev/null @@ -1,105 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - * 2020 Hidayat Khan - * 2020 Christopher Moore - */ - -#if !defined(SIMDE_X86_AVX512_SETZERO_H) -#define SIMDE_X86_AVX512_SETZERO_H - -#include "types.h" -#include "cast.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_setzero_si512(void) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_setzero_si512(); - #else - simde__m512i r; - simde_memset(&r, 0, sizeof(r)); - return r; - #endif -} -#define simde_mm512_setzero_epi32() simde_mm512_setzero_si512() -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_setzero_si512 - #define _mm512_setzero_si512() simde_mm512_setzero_si512() - #undef _mm512_setzero_epi32 - #define _mm512_setzero_epi32() simde_mm512_setzero_si512() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_setzero_ps(void) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_setzero_ps(); - #else - return simde_mm512_castsi512_ps(simde_mm512_setzero_si512()); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_setzero_ps - #define _mm512_setzero_ps() simde_mm512_setzero_ps() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_setzero_pd(void) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_setzero_pd(); - #else - return simde_mm512_castsi512_pd(simde_mm512_setzero_si512()); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_setzero_pd - #define _mm512_setzero_pd() simde_mm512_setzero_pd() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512h -simde_mm512_setzero_ph(void) { - #if defined(SIMDE_X86_AVX512FP16_NATIVE) - return _mm512_setzero_ph(); - #else - return 
simde_mm512_castsi512_ph(simde_mm512_setzero_si512()); - #endif -} -#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) - #undef _mm512_setzero_ph - #define _mm512_setzero_ph() simde_mm512_setzero_ph() -#endif - - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SETZERO_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/shldv.h b/ffi-deps/simde/simde/x86/avx512/shldv.h deleted file mode 100644 index 1cd38f1..0000000 --- a/ffi-deps/simde/simde/x86/avx512/shldv.h +++ /dev/null @@ -1,157 +0,0 @@ -#if !defined(SIMDE_X86_AVX512_SHLDV_H) -#define SIMDE_X86_AVX512_SHLDV_H - -#include "types.h" -#include "../avx2.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_shldv_epi32(simde__m128i a, simde__m128i b, simde__m128i c) { - #if defined(SIMDE_X86_AVX512VBMI2_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_shldv_epi32(a, b, c); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - c_ = simde__m128i_to_private(c); - - uint64x2_t - values_lo = vreinterpretq_u64_u32(vzip1q_u32(b_.neon_u32, a_.neon_u32)), - values_hi = vreinterpretq_u64_u32(vzip2q_u32(b_.neon_u32, a_.neon_u32)); - - int32x4_t count = vandq_s32(c_.neon_i32, vdupq_n_s32(31)); - - values_lo = vshlq_u64(values_lo, vmovl_s32(vget_low_s32(count))); - values_hi = vshlq_u64(values_hi, vmovl_high_s32(count)); - - r_.neon_u32 = - vuzp2q_u32( - vreinterpretq_u32_u64(values_lo), - vreinterpretq_u32_u64(values_hi) - ); - #elif defined(SIMDE_X86_AVX2_NATIVE) - simde__m256i - tmp1, - lo = - simde_mm256_castps_si256( - simde_mm256_unpacklo_ps( - simde_mm256_castsi256_ps(simde_mm256_castsi128_si256(b)), - simde_mm256_castsi256_ps(simde_mm256_castsi128_si256(a)) - ) - ), - hi = - simde_mm256_castps_si256( - simde_mm256_unpackhi_ps( - simde_mm256_castsi256_ps(simde_mm256_castsi128_si256(b)), - simde_mm256_castsi256_ps(simde_mm256_castsi128_si256(a)) - ) - ), - tmp2 = - simde_mm256_castpd_si256( - simde_mm256_permute2f128_pd( - simde_mm256_castsi256_pd(lo), - simde_mm256_castsi256_pd(hi), - 32 - ) - ); - - tmp2 = - simde_mm256_sllv_epi64( - tmp2, - simde_mm256_cvtepi32_epi64( - simde_mm_and_si128( - c, - simde_mm_set1_epi32(31) - ) - ) - ); - - tmp1 = - simde_mm256_castpd_si256( - simde_mm256_permute2f128_pd( - simde_mm256_castsi256_pd(tmp2), - simde_mm256_castsi256_pd(tmp2), - 1 - ) - ); - - r_ = - simde__m128i_to_private( - simde_mm256_castsi256_si128( - simde_mm256_castps_si256( - simde_mm256_shuffle_ps( - simde_mm256_castsi256_ps(tmp2), - simde_mm256_castsi256_ps(tmp1), - 221 - ) - ) - ) - ); - #elif defined(SIMDE_X86_SSE2_NATIVE) - simde__m128i_private - c_ = simde__m128i_to_private(c), - lo = simde__m128i_to_private(simde_mm_unpacklo_epi32(b, a)), - hi = simde__m128i_to_private(simde_mm_unpackhi_epi32(b, a)); - - size_t halfway = (sizeof(r_.u32) / sizeof(r_.u32[0]) / 2); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < halfway ; i++) { - lo.u64[i] <<= (c_.u32[i] & 31); - hi.u64[i] <<= (c_.u32[halfway + i] & 31); - } - - r_ = - simde__m128i_to_private( - simde_mm_castps_si128( - simde_mm_shuffle_ps( - simde_mm_castsi128_ps(simde__m128i_from_private(lo)), - simde_mm_castsi128_ps(simde__m128i_from_private(hi)), - 221) - ) - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_CONVERT_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - 
simde__m128i_private - c_ = simde__m128i_to_private(c); - simde__m256i_private - a_ = simde__m256i_to_private(simde_mm256_castsi128_si256(a)), - b_ = simde__m256i_to_private(simde_mm256_castsi128_si256(b)), - tmp1, - tmp2; - - tmp1.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(tmp1.u64), SIMDE_SHUFFLE_VECTOR_(32, 32, b_.i32, a_.i32, 0, 8, 1, 9, 2, 10, 3, 11)); - SIMDE_CONVERT_VECTOR_(tmp2.u64, c_.u32); - - tmp1.u64 <<= (tmp2.u64 & 31); - - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, tmp1.m128i_private[0].i32, tmp1.m128i_private[1].i32, 1, 3, 5, 7); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - c_ = simde__m128i_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (((HEDLEY_STATIC_CAST(uint64_t, a_.u32[i]) << 32) | b_.u32[i]) << (c_.u32[i] & 31)) >> 32); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512VBMI2_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_shldv_epi32 - #define _mm_shldv_epi32(a, b, c) simde_mm_shldv_epi32(a, b, c) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SHLDV_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/shuffle.h b/ffi-deps/simde/simde/x86/avx512/shuffle.h deleted file mode 100644 index d1c537f..0000000 --- a/ffi-deps/simde/simde/x86/avx512/shuffle.h +++ /dev/null @@ -1,417 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Christopher Moore - * 2023 Michael R. 
Crusoe - */ - -#if !defined(SIMDE_X86_AVX512_SHUFFLE_H) -#define SIMDE_X86_AVX512_SHUFFLE_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" -#include "extract.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_shuffle_epi8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_shuffle_epi8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(a_.m256i) / sizeof(a_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_shuffle_epi8(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (b_.i8[i] & 0x80) ? 0 : a_.i8[(b_.i8[i] & 0x0f) + (i & 0x30)]; - } - #endif - - return simde__m512i_from_private(r_); -#endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_shuffle_epi8 - #define _mm512_shuffle_epi8(a, b) simde_mm512_shuffle_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_shuffle_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_shuffle_epi8(src, k, a, b); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_shuffle_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_shuffle_epi8 - #define _mm512_mask_shuffle_epi8(src, k, a, b) simde_mm512_mask_shuffle_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_shuffle_epi8 (simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_shuffle_epi8(k, a, b); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_shuffle_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_shuffle_epi8 - #define _mm512_maskz_shuffle_epi8(k, a, b) simde_mm512_maskz_shuffle_epi8(k, a, b) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) -# define simde_mm512_shuffle_epi32(a, imm8) _mm512_shuffle_epi32((a), (imm8)) -#elif defined(SIMDE_STATEMENT_EXPR_) -# define simde_mm512_shuffle_epi32(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512i_private simde_mm512_shuffle_epi32_r_, \ - simde_mm512_shuffle_epi32_a_ = simde__m512i_to_private((a)); \ - simde_mm512_shuffle_epi32_r_.m128i[0] = simde_mm_shuffle_epi32( \ - simde_mm512_shuffle_epi32_a_.m128i[0], (imm8)); \ - simde_mm512_shuffle_epi32_r_.m128i[1] = simde_mm_shuffle_epi32( \ - simde_mm512_shuffle_epi32_a_.m128i[1], (imm8)); \ - simde_mm512_shuffle_epi32_r_.m128i[2] = simde_mm_shuffle_epi32( \ - simde_mm512_shuffle_epi32_a_.m128i[2], (imm8)); \ - simde_mm512_shuffle_epi32_r_.m128i[3] = simde_mm_shuffle_epi32( \ - simde_mm512_shuffle_epi32_a_.m128i[3], (imm8)); \ - simde__m512i_from_private(simde_mm512_shuffle_epi32_r_); \ - })) -#else -# define simde_mm512_shuffle_epi32(a, imm8) \ - simde_x_mm512_set_m128i( \ - simde_mm_shuffle_epi32(simde_mm512_extracti32x4_epi32(a, 3), (imm8)), \ - simde_mm_shuffle_epi32(simde_mm512_extracti32x4_epi32(a, 2), (imm8)), \ - simde_mm_shuffle_epi32(simde_mm512_extracti32x4_epi32(a, 1), (imm8)), \ - simde_mm_shuffle_epi32(simde_mm512_extracti32x4_epi32(a, 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_shuffle_epi32 - #define _mm512_shuffle_epi32(a, imm8) 
simde_mm512_shuffle_epi32((a), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_shuffle_i32x4 (simde__m256i a, simde__m256i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - r_.m128i[0] = a_.m128i[ imm8 & 1]; - r_.m128i[1] = b_.m128i[(imm8 >> 1) & 1]; - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_shuffle_i32x4(a, b, imm8) _mm256_shuffle_i32x4(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_shuffle_i32x4 - #define _mm256_shuffle_i32x4(a, b, imm8) simde_mm256_shuffle_i32x4(a, b, imm8) -#endif - -#define simde_mm256_maskz_shuffle_i32x4(k, a, b, imm8) simde_mm256_maskz_mov_epi32(k, simde_mm256_shuffle_i32x4(a, b, imm8)) -#define simde_mm256_mask_shuffle_i32x4(src, k, a, b, imm8) simde_mm256_mask_mov_epi32(src, k, simde_mm256_shuffle_i32x4(a, b, imm8)) - -#define simde_mm256_shuffle_f32x4(a, b, imm8) simde_mm256_castsi256_ps(simde_mm256_shuffle_i32x4(simde_mm256_castps_si256(a), simde_mm256_castps_si256(b), imm8)) -#define simde_mm256_maskz_shuffle_f32x4(k, a, b, imm8) simde_mm256_maskz_mov_ps(k, simde_mm256_shuffle_f32x4(a, b, imm8)) -#define simde_mm256_mask_shuffle_f32x4(src, k, a, b, imm8) simde_mm256_mask_mov_ps(src, k, simde_mm256_shuffle_f32x4(a, b, imm8)) - -#define simde_mm256_shuffle_i64x2(a, b, imm8) simde_mm256_shuffle_i32x4(a, b, imm8) -#define simde_mm256_maskz_shuffle_i64x2(k, a, b, imm8) simde_mm256_maskz_mov_epi64(k, simde_mm256_shuffle_i64x2(a, b, imm8)) -#define simde_mm256_mask_shuffle_i64x2(src, k, a, b, imm8) simde_mm256_mask_mov_epi64(src, k, simde_mm256_shuffle_i64x2(a, b, imm8)) - -#define simde_mm256_shuffle_f64x2(a, b, imm8) simde_mm256_castsi256_pd(simde_mm256_shuffle_i64x2(simde_mm256_castpd_si256(a), simde_mm256_castpd_si256(b), imm8)) -#define simde_mm256_maskz_shuffle_f64x2(k, a, b, imm8) simde_mm256_maskz_mov_pd(k, simde_mm256_shuffle_f64x2(a, b, imm8)) -#define simde_mm256_mask_shuffle_f64x2(src, k, a, b, imm8) simde_mm256_mask_mov_pd(src, k, simde_mm256_shuffle_f64x2(a, b, imm8)) - -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_shuffle_i32x4 - #undef _mm256_mask_shuffle_i32x4 - #define _mm256_maskz_shuffle_i32x4(k, a, b, imm8) simde_mm256_maskz_shuffle_i32x4(k, a, b, imm8) - #define _mm256_mask_shuffle_i32x4(src, k, a, b, imm8) simde_mm256_mask_shuffle_i32x4(src, k, a, b, imm8) - - #undef _mm256_shuffle_f32x4 - #undef _mm256_maskz_shuffle_f32x4 - #undef _mm256_mask_shuffle_f32x4 - #define _mm256_shuffle_f32x4(a, b, imm8) simde_mm256_shuffle_f32x4(a, b, imm8) - #define _mm256_maskz_shuffle_f32x4(k, a, b, imm8) simde_mm256_maskz_shuffle_f32x4(k, a, b, imm8) - #define _mm256_mask_shuffle_f32x4(src, k, a, b, imm8) simde_mm256_mask_shuffle_f32x4(src, k, a, b, imm8) - - #undef _mm256_shuffle_i64x2 - #undef _mm256_maskz_shuffle_i64x2 - #undef _mm256_mask_shuffle_i64x2 - #define _mm256_shuffle_i64x2(a, b, imm8) simde_mm256_shuffle_i64x2(a, b, imm8) - #define _mm256_maskz_shuffle_i64x2(k, a, b, imm8) simde_mm256_maskz_shuffle_i64x2(k, a, b, imm8) - #define _mm256_mask_shuffle_i64x2(src, k, a, b, imm8) simde_mm256_mask_shuffle_i64x2(src, k, a, b, imm8) - - #undef _mm256_shuffle_f64x2 - #undef _mm256_maskz_shuffle_f64x2 - #undef _mm256_mask_shuffle_f64x2 - #define 
_mm256_shuffle_f64x2(a, b, imm8) simde_mm256_shuffle_f64x2(a, b, imm8) - #define _mm256_maskz_shuffle_f64x2(k, a, b, imm8) simde_mm256_maskz_shuffle_f64x2(k, a, b, imm8) - #define _mm256_mask_shuffle_f64x2(src, k, a, b, imm8) simde_mm256_mask_shuffle_f64x2(src, k, a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_shuffle_i32x4 (simde__m512i a, simde__m512i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - r_.m128i[0] = a_.m128i[ imm8 & 3]; - r_.m128i[1] = a_.m128i[(imm8 >> 2) & 3]; - r_.m128i[2] = b_.m128i[(imm8 >> 4) & 3]; - r_.m128i[3] = b_.m128i[(imm8 >> 6) & 3]; - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_shuffle_i32x4(a, b, imm8) _mm512_shuffle_i32x4(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_shuffle_i32x4 - #define _mm512_shuffle_i32x4(a, b, imm8) simde_mm512_shuffle_i32x4(a, b, imm8) -#endif - -#define simde_mm512_maskz_shuffle_i32x4(k, a, b, imm8) simde_mm512_maskz_mov_epi32(k, simde_mm512_shuffle_i32x4(a, b, imm8)) -#define simde_mm512_mask_shuffle_i32x4(src, k, a, b, imm8) simde_mm512_mask_mov_epi32(src, k, simde_mm512_shuffle_i32x4(a, b, imm8)) - -#define simde_mm512_shuffle_f32x4(a, b, imm8) simde_mm512_castsi512_ps(simde_mm512_shuffle_i32x4(simde_mm512_castps_si512(a), simde_mm512_castps_si512(b), imm8)) -#define simde_mm512_maskz_shuffle_f32x4(k, a, b, imm8) simde_mm512_maskz_mov_ps(k, simde_mm512_shuffle_f32x4(a, b, imm8)) -#define simde_mm512_mask_shuffle_f32x4(src, k, a, b, imm8) simde_mm512_mask_mov_ps(src, k, simde_mm512_shuffle_f32x4(a, b, imm8)) - -#define simde_mm512_shuffle_i64x2(a, b, imm8) simde_mm512_shuffle_i32x4(a, b, imm8) -#define simde_mm512_maskz_shuffle_i64x2(k, a, b, imm8) simde_mm512_maskz_mov_epi64(k, simde_mm512_shuffle_i64x2(a, b, imm8)) -#define simde_mm512_mask_shuffle_i64x2(src, k, a, b, imm8) simde_mm512_mask_mov_epi64(src, k, simde_mm512_shuffle_i64x2(a, b, imm8)) - -#define simde_mm512_shuffle_f64x2(a, b, imm8) simde_mm512_castsi512_pd(simde_mm512_shuffle_i64x2(simde_mm512_castpd_si512(a), simde_mm512_castpd_si512(b), imm8)) -#define simde_mm512_maskz_shuffle_f64x2(k, a, b, imm8) simde_mm512_maskz_mov_pd(k, simde_mm512_shuffle_f64x2(a, b, imm8)) -#define simde_mm512_mask_shuffle_f64x2(src, k, a, b, imm8) simde_mm512_mask_mov_pd(src, k, simde_mm512_shuffle_f64x2(a, b, imm8)) - -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_shuffle_i32x4 - #undef _mm512_mask_shuffle_i32x4 - #define _mm512_maskz_shuffle_i32x4(k, a, b, imm8) simde_mm512_maskz_shuffle_i32x4(k, a, b, imm8) - #define _mm512_mask_shuffle_i32x4(src, k, a, b, imm8) simde_mm512_mask_shuffle_i32x4(src, k, a, b, imm8) - - #undef _mm512_shuffle_f32x4 - #undef _mm512_maskz_shuffle_f32x4 - #undef _mm512_mask_shuffle_f32x4 - #define _mm512_shuffle_f32x4(a, b, imm8) simde_mm512_shuffle_f32x4(a, b, imm8) - #define _mm512_maskz_shuffle_f32x4(k, a, b, imm8) simde_mm512_maskz_shuffle_f32x4(k, a, b, imm8) - #define _mm512_mask_shuffle_f32x4(src, k, a, b, imm8) simde_mm512_mask_shuffle_f32x4(src, k, a, b, imm8) - - #undef _mm512_shuffle_i64x2 - #undef _mm512_maskz_shuffle_i64x2 - #undef _mm512_mask_shuffle_i64x2 - #define _mm512_shuffle_i64x2(a, b, imm8) simde_mm512_shuffle_i64x2(a, b, imm8) - #define _mm512_maskz_shuffle_i64x2(k, a, b, imm8) simde_mm512_maskz_shuffle_i64x2(k, a, b, imm8) - #define 
_mm512_mask_shuffle_i64x2(src, k, a, b, imm8) simde_mm512_mask_shuffle_i64x2(src, k, a, b, imm8) - - #undef _mm512_shuffle_f64x2 - #undef _mm512_maskz_shuffle_f64x2 - #undef _mm512_mask_shuffle_f64x2 - #define _mm512_shuffle_f64x2(a, b, imm8) simde_mm512_shuffle_f64x2(a, b, imm8) - #define _mm512_maskz_shuffle_f64x2(k, a, b, imm8) simde_mm512_maskz_shuffle_f64x2(k, a, b, imm8) - #define _mm512_mask_shuffle_f64x2(src, k, a, b, imm8) simde_mm512_mask_shuffle_f64x2(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_shuffle_ps(a, b, imm8) _mm512_shuffle_ps(a, b, imm8) -#elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_shuffle_ps(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512_private \ - simde_mm512_shuffle_ps_a_ = simde__m512_to_private(a), \ - simde_mm512_shuffle_ps_b_ = simde__m512_to_private(b); \ - \ - simde_mm512_shuffle_ps_a_.m256[0] = simde_mm256_shuffle_ps(simde_mm512_shuffle_ps_a_.m256[0], simde_mm512_shuffle_ps_b_.m256[0], imm8); \ - simde_mm512_shuffle_ps_a_.m256[1] = simde_mm256_shuffle_ps(simde_mm512_shuffle_ps_a_.m256[1], simde_mm512_shuffle_ps_b_.m256[1], imm8); \ - \ - simde__m512_from_private(simde_mm512_shuffle_ps_a_); \ - })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm512_shuffle_ps(a, b, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512_private \ - simde_mm512_shuffle_ps_a_ = simde__m512_to_private(a), \ - simde_mm512_shuffle_ps_b_ = simde__m512_to_private(b); \ - \ - simde_mm512_shuffle_ps_a_.f32 = \ - SIMDE_SHUFFLE_VECTOR_( \ - 32, 64, \ - simde_mm512_shuffle_ps_a_.f32, \ - simde_mm512_shuffle_ps_b_.f32, \ - (((imm8) ) & 3), \ - (((imm8) >> 2) & 3), \ - (((imm8) >> 4) & 3) + 16, \ - (((imm8) >> 6) & 3) + 16, \ - (((imm8) ) & 3) + 4, \ - (((imm8) >> 2) & 3) + 4, \ - (((imm8) >> 4) & 3) + 20, \ - (((imm8) >> 6) & 3) + 20, \ - (((imm8) ) & 3) + 8, \ - (((imm8) >> 2) & 3) + 8, \ - (((imm8) >> 4) & 3) + 24, \ - (((imm8) >> 6) & 3) + 24, \ - (((imm8) ) & 3) + 12, \ - (((imm8) >> 2) & 3) + 12, \ - (((imm8) >> 4) & 3) + 28, \ - (((imm8) >> 6) & 3) + 28 \ - ); \ - \ - simde__m512_from_private(simde_mm512_shuffle_ps_a_); \ - })) -#else - SIMDE_FUNCTION_ATTRIBUTES - simde__m512 - simde_mm512_shuffle_ps(simde__m512 a, simde__m512 b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE (imm8, 0, 255) { - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b); - - const size_t halfway = (sizeof(r_.m128_private[0].f32) / sizeof(r_.m128_private[0].f32[0]) / 2); - for (size_t i = 0 ; i < (sizeof(r_.m128_private) / sizeof(r_.m128_private[0])) ; i++) { - SIMDE_VECTORIZE - for (size_t j = 0 ; j < halfway ; j++) { - r_.m128_private[i].f32[j] = a_.m128_private[i].f32[(imm8 >> (j * 2)) & 3]; - r_.m128_private[i].f32[halfway + j] = b_.m128_private[i].f32[(imm8 >> ((halfway + j) * 2)) & 3]; - } - } - - return simde__m512_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_shuffle_ps - #define _mm512_shuffle_ps(a, b, imm8) simde_mm512_shuffle_ps(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_shuffle_pd(simde__m512d a, simde__m512d b, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE (imm8, 0, 255) { - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_.f64) / sizeof(r_.f64[0])) / 2) ; i++) { - r_.f64[i * 2] = (imm8 & ( 1 << (i*2) )) ? 
a_.f64[i * 2 + 1]: a_.f64[i * 2]; - r_.f64[i * 2 + 1] = (imm8 & ( 1 << (i*2+1) )) ? b_.f64[i * 2 + 1]: b_.f64[i * 2]; - } - - return simde__m512d_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_shuffle_pd(a, b, imm8) _mm512_shuffle_pd(a, b, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_shuffle_pd - #define _mm512_shuffle_pd(a, b, imm8) simde_mm512_shuffle_pd(a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512BW_NATIVE) -# define simde_mm512_shufflehi_epi16(a, imm8) _mm512_shufflehi_epi16(a, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) -# define simde_mm512_shufflehi_epi16(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512i_private simde_mm512_shufflehi_epi16_r_, \ - simde_mm512_shufflehi_epi16_a_ = simde__m512i_to_private((a)); \ - simde_mm512_shufflehi_epi16_r_.m128i[0] = simde_mm_shufflehi_epi16( \ - simde_mm512_shufflehi_epi16_a_.m128i[0], (imm8)); \ - simde_mm512_shufflehi_epi16_r_.m128i[1] = simde_mm_shufflehi_epi16( \ - simde_mm512_shufflehi_epi16_a_.m128i[1], (imm8)); \ - simde_mm512_shufflehi_epi16_r_.m128i[2] = simde_mm_shufflehi_epi16( \ - simde_mm512_shufflehi_epi16_a_.m128i[2], (imm8)); \ - simde_mm512_shufflehi_epi16_r_.m128i[3] = simde_mm_shufflehi_epi16( \ - simde_mm512_shufflehi_epi16_a_.m128i[3], (imm8)); \ - simde__m512i_from_private(simde_mm512_shufflehi_epi16_r_); \ - })) -#else -# define simde_mm512_shufflehi_epi16(a, imm8) \ - simde_x_mm512_set_m128i( \ - simde_mm_shufflehi_epi16(simde_mm512_extracti32x4_epi32((a), 3), (imm8)), \ - simde_mm_shufflehi_epi16(simde_mm512_extracti32x4_epi32((a), 2), (imm8)), \ - simde_mm_shufflehi_epi16(simde_mm512_extracti32x4_epi32((a), 1), (imm8)), \ - simde_mm_shufflehi_epi16(simde_mm512_extracti32x4_epi32((a), 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_shufflehi_epi16 - #define _mm512_shufflehi_epi16(a, imm8) simde_mm512_shufflehi_epi16(a, imm8) -#endif - -#if defined(SIMDE_X86_AVX512BW_NATIVE) -# define simde_mm512_shufflelo_epi16(a, imm8) _mm512_shufflelo_epi16(a, imm8) -#elif defined(SIMDE_STATEMENT_EXPR_) -# define simde_mm512_shufflelo_epi16(a, imm8) SIMDE_STATEMENT_EXPR_(({ \ - simde__m512i_private simde_mm512_shufflelo_epi16_r_, \ - simde_mm512_shufflelo_epi16_a_ = simde__m512i_to_private((a)); \ - simde_mm512_shufflelo_epi16_r_.m128i[0] = simde_mm_shufflelo_epi16( \ - simde_mm512_shufflelo_epi16_a_.m128i[0], (imm8)); \ - simde_mm512_shufflelo_epi16_r_.m128i[1] = simde_mm_shufflelo_epi16( \ - simde_mm512_shufflelo_epi16_a_.m128i[1], (imm8)); \ - simde_mm512_shufflelo_epi16_r_.m128i[2] = simde_mm_shufflelo_epi16( \ - simde_mm512_shufflelo_epi16_a_.m128i[2], (imm8)); \ - simde_mm512_shufflelo_epi16_r_.m128i[3] = simde_mm_shufflelo_epi16( \ - simde_mm512_shufflelo_epi16_a_.m128i[3], (imm8)); \ - simde__m512i_from_private(simde_mm512_shufflelo_epi16_r_); \ - })) -#else -# define simde_mm512_shufflelo_epi16(a, imm8) \ - simde_x_mm512_set_m128i( \ - simde_mm_shufflelo_epi16(simde_mm512_extracti32x4_epi32((a), 3), (imm8)), \ - simde_mm_shufflelo_epi16(simde_mm512_extracti32x4_epi32((a), 2), (imm8)), \ - simde_mm_shufflelo_epi16(simde_mm512_extracti32x4_epi32((a), 1), (imm8)), \ - simde_mm_shufflelo_epi16(simde_mm512_extracti32x4_epi32((a), 0), (imm8))) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_shufflelo_epi16 - #define _mm512_shufflelo_epi16(a, imm8) simde_mm512_shufflelo_epi16(a, imm8) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* 
!defined(SIMDE_X86_AVX512_SHUFFLE_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/sll.h b/ffi-deps/simde/simde/x86/avx512/sll.h deleted file mode 100644 index 18fbbb8..0000000 --- a/ffi-deps/simde/simde/x86/avx512/sll.h +++ /dev/null @@ -1,247 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_SLL_H) -#define SIMDE_X86_AVX512_SLL_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" -#include "setzero.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_sll_epi16 (simde__m512i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_sll_epi16(a, count); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_sll_epi16(a_.m256i[i], count); - } - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); - if (shift > 15) - return simde_mm512_setzero_si512(); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << (shift)); - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_sll_epi16 - #define _mm512_sll_epi16(a, count) simde_mm512_sll_epi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_sll_epi16 (simde__m512i src, simde__mmask32 k, simde__m512i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX51BW_NATIVE) - return _mm512_mask_sll_epi16(src, k, a, count); - #else - return simde_mm512_mask_mov_epi16(src, k, simde_mm512_sll_epi16(a, count)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_sll_epi16 - #define _mm512_mask_sll_epi16(src, k, a, count) simde_mm512_mask_sll_epi16(src, k, a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_sll_epi16 (simde__mmask32 k, simde__m512i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return 
_mm512_maskz_sll_epi16(k, a, count); - #else - return simde_mm512_maskz_mov_epi16(k, simde_mm512_sll_epi16(a, count)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_sll_epi16 - #define _mm512_maskz_sll_epi16(k, a, count) simde_mm512_maskz_sll_epi16(k, a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_sll_epi32 (simde__m512i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_sll_epi32(a, count); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_sll_epi32(a_.m256i[i], count); - } - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); - if (shift > 31) - return simde_mm512_setzero_si512(); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 << HEDLEY_STATIC_CAST(int32_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.i32[i] << (shift)); - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_sll_epi32 - #define _mm512_sll_epi32(a, count) simde_mm512_sll_epi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_sll_epi32(simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_sll_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_sll_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_sll_epi32 - #define _mm512_mask_sll_epi32(src, k, a, b) simde_mm512_mask_sll_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_sll_epi32(simde__mmask16 k, simde__m512i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_sll_epi32(k, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_sll_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_sll_epi32 - #define _mm512_maskz_sll_epi32(k, a, b) simde_mm512_maskz_sll_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_sll_epi64 (simde__m512i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_sll_epi64(a, count); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_sll_epi64(a_.m256i[i], count); - } - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); - if (shift > 63) - return simde_mm512_setzero_si512(); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i64 = a_.i64 << HEDLEY_STATIC_CAST(int64_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = HEDLEY_STATIC_CAST(int64_t, a_.i64[i] << (shift)); - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_sll_epi64 - #define _mm512_sll_epi64(a, count) simde_mm512_sll_epi64(a, count) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_sll_epi64(simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_sll_epi64(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_sll_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_sll_epi64 - #define _mm512_mask_sll_epi64(src, k, a, b) simde_mm512_mask_sll_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_sll_epi64(simde__mmask8 k, simde__m512i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_sll_epi64(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_sll_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_sll_epi64 - #define _mm512_maskz_sll_epi64(k, a, b) simde_mm512_maskz_sll_epi64(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SLL_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/slli.h b/ffi-deps/simde/simde/x86/avx512/slli.h deleted file mode 100644 index d2ad75b..0000000 --- a/ffi-deps/simde/simde/x86/avx512/slli.h +++ /dev/null @@ -1,179 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - * 2020 Christopher Moore - */ - -#if !defined(SIMDE_X86_AVX512_SLLI_H) -#define SIMDE_X86_AVX512_SLLI_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" -#include "setzero.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_slli_epi16 (simde__m512i a, const unsigned int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (defined(HEDLEY_GCC_VERSION) && ((__GNUC__ == 5 && __GNUC_MINOR__ == 5) || (__GNUC__ == 6 && __GNUC_MINOR__ >= 4))) - simde__m512i r; - - SIMDE_CONSTIFY_16_(_mm512_slli_epi16, r, simde_mm512_setzero_si512(), imm8, a); - - return r; - #elif defined(SIMDE_X86_AVX512BW_NATIVE) - return SIMDE_BUG_IGNORE_SIGN_CONVERSION(_mm512_slli_epi16(a, imm8)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - if(imm8 < 16) - r_.i16 = HEDLEY_STATIC_CAST(__typeof__(r_.i16), (a_.i16 << HEDLEY_STATIC_CAST(int16_t, imm8))); - else - return simde_mm512_setzero_si512(); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (imm8 < 16) ? HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << (imm8 & 0xff)) : 0; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_slli_epi16 - #define _mm512_slli_epi16(a, imm8) simde_mm512_slli_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_slli_epi32 (simde__m512i a, unsigned int imm8) { - /* I guess the restriction was added in 6.4, back-ported to 5.5, then - * removed (fixed) in 7? */ - #if defined(SIMDE_X86_AVX512F_NATIVE) && (defined(HEDLEY_GCC_VERSION) && ((__GNUC__ == 5 && __GNUC_MINOR__ == 5) || (__GNUC__ == 6 && __GNUC_MINOR__ >= 4))) - simde__m512i r; - - SIMDE_CONSTIFY_32_(_mm512_slli_epi32, r, simde_mm512_setzero_si512(), imm8, a); - - return r; - #elif defined(SIMDE_X86_AVX512F_NATIVE) - return SIMDE_BUG_IGNORE_SIGN_CONVERSION(_mm512_slli_epi32(a, imm8)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - /* The Intel Intrinsics Guide says that only the 8 LSBits of imm8 are - * used. In this case we should do "imm8 &= 0xff". However in - * practice all bits are used. 
*/ - if (imm8 > 31) { - simde_memset(&r_, 0, sizeof(r_)); - } else { - #if defined(SIMDE_X86_AVX2_NATIVE) - r_.m256i[0] = simde_mm256_slli_epi32(a_.m256i[0], HEDLEY_STATIC_CAST(int, imm8)); - r_.m256i[1] = simde_mm256_slli_epi32(a_.m256i[1], HEDLEY_STATIC_CAST(int, imm8)); - #elif defined(SIMDE_X86_SSE2_NATIVE) - r_.m128i[0] = simde_mm_slli_epi32(a_.m128i[0], HEDLEY_STATIC_CAST(int, imm8)); - r_.m128i[1] = simde_mm_slli_epi32(a_.m128i[1], HEDLEY_STATIC_CAST(int, imm8)); - r_.m128i[2] = simde_mm_slli_epi32(a_.m128i[2], HEDLEY_STATIC_CAST(int, imm8)); - r_.m128i[3] = simde_mm_slli_epi32(a_.m128i[3], HEDLEY_STATIC_CAST(int, imm8)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = a_.u32 << imm8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] << imm8; - } - #endif - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_slli_epi32 - #define _mm512_slli_epi32(a, imm8) simde_mm512_slli_epi32(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_slli_epi64 (simde__m512i a, unsigned int imm8) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && (defined(HEDLEY_GCC_VERSION) && ((__GNUC__ == 5 && __GNUC_MINOR__ == 5) || (__GNUC__ == 6 && __GNUC_MINOR__ >= 4))) - simde__m512i r; - - SIMDE_CONSTIFY_64_(_mm512_slli_epi64, r, simde_mm512_setzero_si512(), imm8, a); - - return r; - #elif defined(SIMDE_X86_AVX512F_NATIVE) - return SIMDE_BUG_IGNORE_SIGN_CONVERSION(_mm512_slli_epi64(a, imm8)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - /* The Intel Intrinsics Guide says that only the 8 LSBits of imm8 are - * used. In this case we should do "imm8 &= 0xff". However in - * practice all bits are used. 
*/ - if (imm8 > 63) { - simde_memset(&r_, 0, sizeof(r_)); - } else { - #if defined(SIMDE_X86_AVX2_NATIVE) - r_.m256i[0] = simde_mm256_slli_epi64(a_.m256i[0], HEDLEY_STATIC_CAST(int, imm8)); - r_.m256i[1] = simde_mm256_slli_epi64(a_.m256i[1], HEDLEY_STATIC_CAST(int, imm8)); - #elif defined(SIMDE_X86_SSE2_NATIVE) - r_.m128i[0] = simde_mm_slli_epi64(a_.m128i[0], HEDLEY_STATIC_CAST(int, imm8)); - r_.m128i[1] = simde_mm_slli_epi64(a_.m128i[1], HEDLEY_STATIC_CAST(int, imm8)); - r_.m128i[2] = simde_mm_slli_epi64(a_.m128i[2], HEDLEY_STATIC_CAST(int, imm8)); - r_.m128i[3] = simde_mm_slli_epi64(a_.m128i[3], HEDLEY_STATIC_CAST(int, imm8)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_97248) - r_.u64 = a_.u64 << imm8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] << imm8; - } - #endif - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_slli_epi64 - #define _mm512_slli_epi64(a, imm8) simde_mm512_slli_epi64(a, imm8) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SLLI_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/sllv.h b/ffi-deps/simde/simde/x86/avx512/sllv.h deleted file mode 100644 index f4caa6e..0000000 --- a/ffi-deps/simde/simde/x86/avx512/sllv.h +++ /dev/null @@ -1,122 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_SLLV_H) -#define SIMDE_X86_AVX512_SLLV_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_sllv_epi16 (simde__m512i a, simde__m512i b) { - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b), - r_; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), (b_.u16 < 16)) & (a_.u16 << b_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (b_.u16[i] < 16) ? 
HEDLEY_STATIC_CAST(uint16_t, (a_.u16[i] << b_.u16[i])) : 0; - } - #endif - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512BW_NATIVE) - #define simde_mm512_sllv_epi16(a, b) _mm512_sllv_epi16(a, b) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_sllv_epi16 - #define _mm512_sllv_epi16(a, b) simde_mm512_sllv_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_sllv_epi32 (simde__m512i a, simde__m512i b) { - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b), - r_; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < 32)) & (a_.u32 << b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (b_.u32[i] < 32) ? HEDLEY_STATIC_CAST(uint32_t, (a_.u32[i] << b_.u32[i])) : 0; - } - #endif - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_sllv_epi32(a, b) _mm512_sllv_epi32(a, b) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_sllv_epi32 - #define _mm512_sllv_epi32(a, b) simde_mm512_sllv_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_sllv_epi64 (simde__m512i a, simde__m512i b) { - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b), - r_; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 << b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (b_.u64[i] < 64) ? HEDLEY_STATIC_CAST(uint64_t, (a_.u64[i] << b_.u64[i])) : 0; - } - #endif - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_sllv_epi64(a, b) _mm512_sllv_epi64(a, b) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_sllv_epi64 - #define _mm512_sllv_epi64(a, b) simde_mm512_sllv_epi64(a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SLLV_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/sqrt.h b/ffi-deps/simde/simde/x86/avx512/sqrt.h deleted file mode 100644 index cdc18ae..0000000 --- a/ffi-deps/simde/simde/x86/avx512/sqrt.h +++ /dev/null @@ -1,127 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - */ - -#if !defined(SIMDE_X86_AVX512_SQRT_H) -#define SIMDE_X86_AVX512_SQRT_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_sqrt_ps (simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_sqrt_ps(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if defined(SIMDE_X86_AVX_NATIVE) - r_.m256[0] = simde_mm256_sqrt_ps(a_.m256[0]); - r_.m256[1] = simde_mm256_sqrt_ps(a_.m256[1]); - #elif defined(simde_math_sqrtf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_sqrtf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) -# define _mm512_sqrt_ps(a) simde_mm512_sqrt_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_sqrt_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_sqrt_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_sqrt_ps(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_sqrt_ps - #define _mm512_mask_sqrt_ps(src, k, a) simde_mm512_mask_sqrt_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_sqrt_pd (simde__m512d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_sqrt_pd(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if defined(SIMDE_X86_AVX_NATIVE) - r_.m256d[0] = simde_mm256_sqrt_pd(a_.m256d[0]); - r_.m256d[1] = simde_mm256_sqrt_pd(a_.m256d[1]); - #elif defined(simde_math_sqrt) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_sqrt(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) -# define _mm512_sqrt_pd(a) simde_mm512_sqrt_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_sqrt_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_sqrt_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_sqrt_pd(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_sqrt_pd - #define _mm512_mask_sqrt_pd(src, k, a) simde_mm512_mask_sqrt_pd(src, k, a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SQRT_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/sra.h b/ffi-deps/simde/simde/x86/avx512/sra.h deleted file mode 100644 index 3a7512d..0000000 --- a/ffi-deps/simde/simde/x86/avx512/sra.h +++ /dev/null @@ -1,81 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or 
substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_SRA_H) -#define SIMDE_X86_AVX512_SRA_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_sra_epi16 (simde__m512i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_sra_epi16(a, count); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_sra_epi16(a_.m256i[i], count); - } - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - uint64_t shift = HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]); - - if (shift > 15) shift = 15; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 >> HEDLEY_STATIC_CAST(int16_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] >> shift; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_sra_epi16 - #define _mm512_sra_epi16(a, count) simde_mm512_sra_epi16(a, count) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SRA_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/srai.h b/ffi-deps/simde/simde/x86/avx512/srai.h deleted file mode 100644 index 4fcbd95..0000000 --- a/ffi-deps/simde/simde/x86/avx512/srai.h +++ /dev/null @@ -1,96 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_SRAI_H) -#define SIMDE_X86_AVX512_SRAI_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_srai_epi16 (simde__m512i a, const int imm8) { - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - unsigned int shift = HEDLEY_STATIC_CAST(unsigned int, imm8); - - if (shift > 15) shift = 15; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 >> HEDLEY_STATIC_CAST(int16_t, shift); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] >> shift; - } - #endif - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512BW_NATIVE) -# define simde_mm512_srai_epi16(a, imm8) _mm512_srai_epi16(a, imm8) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_srai_epi16 - #define _mm512_srai_epi16(a, imm8) simde_mm512_srai_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_srai_epi32 (simde__m512i a, const unsigned int imm8) { - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 >> HEDLEY_STATIC_CAST(int32_t, imm8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] >> imm8; - } - #endif - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_AVX512F_NATIVE) -# define simde_mm512_srai_epi32(a, imm8) _mm512_srai_epi32(a, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_srai_epi32 - #define _mm512_srai_epi32(a, imm8) simde_mm512_srai_epi32(a, imm8) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SRAI_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/srav.h b/ffi-deps/simde/simde/x86/avx512/srav.h deleted file mode 100644 index 9c811f5..0000000 --- a/ffi-deps/simde/simde/x86/avx512/srav.h +++ /dev/null @@ -1,67 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_SRAV_H) -#define SIMDE_X86_AVX512_SRAV_H - -#include "types.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_srav_epi16 (simde__m512i a, simde__m512i count) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_srav_epi16(a, count); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - count_ = simde__m512i_to_private(count); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - uint32_t shift = HEDLEY_STATIC_CAST(uint32_t, count_.i16[i]); - if (shift > 16) shift = 15; - r_.i16[i] = a_.i16[i] >> shift; - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_srav_epi16 - #define _mm512_srav_epi16(a, count) simde_mm512_srav_epi16(a, count) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SRAV_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/srl.h b/ffi-deps/simde/simde/x86/avx512/srl.h deleted file mode 100644 index 31e3fa1..0000000 --- a/ffi-deps/simde/simde/x86/avx512/srl.h +++ /dev/null @@ -1,216 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_SRL_H) -#define SIMDE_X86_AVX512_SRL_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" -#include "setzero.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_srl_epi16 (simde__m512i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_srl_epi16(a, count); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_srl_epi16(a_.m256i[i], count); - } - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - if (HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]) > 15) - return simde_mm512_setzero_si512(); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = a_.u16 >> count_.i64[0]; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.u16[i] = a_.u16[i] >> count_.i64[0]; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_srl_epi16 - #define _mm512_srl_epi16(a, count) simde_mm512_srl_epi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_srl_epi32 (simde__m512i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_srl_epi32(a, count); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_srl_epi32(a_.m256i[i], count); - } - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - if (HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]) > 31) - return simde_mm512_setzero_si512(); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = a_.u32 >> count_.i64[0]; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.u32[i] = a_.u32[i] >> count_.i64[0]; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_srl_epi32 - #define _mm512_srl_epi32(a, count) simde_mm512_srl_epi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_srl_epi32(simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_srl_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_srl_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_srl_epi32 - #define _mm512_mask_srl_epi32(src, k, a, b) simde_mm512_mask_srl_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_srl_epi32(simde__mmask16 k, simde__m512i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_srl_epi32(k, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_srl_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_srl_epi32 - #define _mm512_maskz_srl_epi32(k, a, b) simde_mm512_maskz_srl_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_srl_epi64 (simde__m512i a, simde__m128i count) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return 
_mm512_srl_epi64(a, count); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_srl_epi64(a_.m256i[i], count); - } - #else - simde__m128i_private - count_ = simde__m128i_to_private(count); - - if (HEDLEY_STATIC_CAST(uint64_t, count_.i64[0]) > 63) - return simde_mm512_setzero_si512(); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u64 = a_.u64 >> count_.i64[0]; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.u64[i] = a_.u64[i] >> count_.i64[0]; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_srl_epi64 - #define _mm512_srl_epi64(a, count) simde_mm512_srl_epi64(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_srl_epi64(simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_srl_epi64(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_srl_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_srl_epi64 - #define _mm512_mask_srl_epi64(src, k, a, b) simde_mm512_mask_srl_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_srl_epi64(simde__mmask8 k, simde__m512i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_srl_epi64(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_srl_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_srl_epi64 - #define _mm512_maskz_srl_epi64(k, a, b) simde_mm512_maskz_srl_epi64(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SRL_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/srli.h b/ffi-deps/simde/simde/x86/avx512/srli.h deleted file mode 100644 index f240693..0000000 --- a/ffi-deps/simde/simde/x86/avx512/srli.h +++ /dev/null @@ -1,180 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_SRLI_H) -#define SIMDE_X86_AVX512_SRLI_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" -#include "setzero.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_srli_epi16 (simde__m512i a, const unsigned int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && (defined(HEDLEY_GCC_VERSION) && ((__GNUC__ == 5 && __GNUC_MINOR__ == 5) || (__GNUC__ == 6 && __GNUC_MINOR__ >= 4))) - simde__m512i r; - - SIMDE_CONSTIFY_16_(_mm512_srli_epi16, r, simde_mm512_setzero_si512(), imm8, a); - - return r; - #elif defined(SIMDE_X86_AVX512BW_NATIVE) - return SIMDE_BUG_IGNORE_SIGN_CONVERSION(_mm512_srli_epi16(a, imm8)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - if (HEDLEY_STATIC_CAST(unsigned int, imm8) > 15) - return simde_mm512_setzero_si512(); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(16, imm8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = a_.u16[i] >> imm8; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_NATIVE) - #define simde_mm512_srli_epi16(a, imm8) _mm512_srli_epi16(a, imm8) -#endif -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_srli_epi16 - #define _mm512_srli_epi16(a, imm8) simde_mm512_srli_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_srli_epi32 (simde__m512i a, unsigned int imm8) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && (defined(HEDLEY_GCC_VERSION) && ((__GNUC__ == 5 && __GNUC_MINOR__ == 5) || (__GNUC__ == 6 && __GNUC_MINOR__ >= 4))) - simde__m512i r; - - SIMDE_CONSTIFY_32_(_mm512_srli_epi32, r, simde_mm512_setzero_si512(), imm8, a); - - return r; - #elif defined(SIMDE_X86_AVX512F_NATIVE) - return SIMDE_BUG_IGNORE_SIGN_CONVERSION(_mm512_srli_epi32(a, imm8)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if defined(SIMDE_X86_AVX2_NATIVE) - r_.m256i[0] = simde_mm256_srli_epi32(a_.m256i[0], HEDLEY_STATIC_CAST(int, imm8)); - r_.m256i[1] = simde_mm256_srli_epi32(a_.m256i[1], HEDLEY_STATIC_CAST(int, imm8)); - #elif defined(SIMDE_X86_SSE2_NATIVE) - r_.m128i[0] = simde_mm_srli_epi32(a_.m128i[0], HEDLEY_STATIC_CAST(int, imm8)); - r_.m128i[1] = simde_mm_srli_epi32(a_.m128i[1], HEDLEY_STATIC_CAST(int, imm8)); - r_.m128i[2] = simde_mm_srli_epi32(a_.m128i[2], HEDLEY_STATIC_CAST(int, imm8)); - r_.m128i[3] = simde_mm_srli_epi32(a_.m128i[3], HEDLEY_STATIC_CAST(int, imm8)); - #else - if (imm8 > 31) { - simde_memset(&r_, 0, sizeof(r_)); - } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = a_.u32 >> imm8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] >> imm8; - } - #endif - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_srli_epi32 - #define _mm512_srli_epi32(a, imm8) simde_mm512_srli_epi32(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_srli_epi64 (simde__m512i a, unsigned int imm8) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && (defined(HEDLEY_GCC_VERSION) && ((__GNUC__ == 5 && __GNUC_MINOR__ == 5) || (__GNUC__ == 6 && __GNUC_MINOR__ >= 4))) - simde__m512i r; - - 
SIMDE_CONSTIFY_64_(_mm512_srli_epi64, r, simde_mm512_setzero_si512(), imm8, a); - - return r; - #elif defined(SIMDE_X86_AVX512F_NATIVE) - return SIMDE_BUG_IGNORE_SIGN_CONVERSION(_mm512_srli_epi64(a, imm8)); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a); - - #if defined(SIMDE_X86_AVX2_NATIVE) - r_.m256i[0] = simde_mm256_srli_epi64(a_.m256i[0], HEDLEY_STATIC_CAST(int, imm8)); - r_.m256i[1] = simde_mm256_srli_epi64(a_.m256i[1], HEDLEY_STATIC_CAST(int, imm8)); - #elif defined(SIMDE_X86_SSE2_NATIVE) - r_.m128i[0] = simde_mm_srli_epi64(a_.m128i[0], HEDLEY_STATIC_CAST(int, imm8)); - r_.m128i[1] = simde_mm_srli_epi64(a_.m128i[1], HEDLEY_STATIC_CAST(int, imm8)); - r_.m128i[2] = simde_mm_srli_epi64(a_.m128i[2], HEDLEY_STATIC_CAST(int, imm8)); - r_.m128i[3] = simde_mm_srli_epi64(a_.m128i[3], HEDLEY_STATIC_CAST(int, imm8)); - #else - /* The Intel Intrinsics Guide says that only the 8 LSBits of imm8 are - * used. In this case we should do "imm8 &= 0xff" here. However in - * practice all bits are used. */ - if (imm8 > 63) { - simde_memset(&r_, 0, sizeof(r_)); - } else { - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_97248) - r_.u64 = a_.u64 >> imm8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] >> imm8; - } - #endif - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_srli_epi64 - #define _mm512_srli_epi64(a, imm8) simde_mm512_srli_epi64(a, imm8) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SRLI_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/srlv.h b/ffi-deps/simde/simde/x86/avx512/srlv.h deleted file mode 100644 index 7b7f774..0000000 --- a/ffi-deps/simde/simde/x86/avx512/srlv.h +++ /dev/null @@ -1,282 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_SRLV_H) -#define SIMDE_X86_AVX512_SRLV_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srlv_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_srlv_epi16(a, b); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - r_; - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), (b_.u16 < 16)) & (a_.u16 >> b_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (b_.u16[i] < 16) ? (a_.u16[i] >> b_.u16[i]) : 0; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm_srlv_epi16 - #define _mm_srlv_epi16(a, b) simde_mm_srlv_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_srlv_epi16(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_mask_srlv_epi16(src, k, a, b); - #else - return simde_mm_mask_mov_epi16(src, k, simde_mm_srlv_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_srlv_epi16 - #define _mm_mask_srlv_epi16(src, k, a, b) simde_mm_mask_srlv_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_srlv_epi16(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm_maskz_srlv_epi16(k, a, b); - #else - return simde_mm_maskz_mov_epi16(k, simde_mm_srlv_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_srlv_epi16 - #define _mm_maskz_srlv_epi16(k, a, b) simde_mm_maskz_srlv_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_srlv_epi32(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_srlv_epi32(src, k, a, b); - #else - return simde_mm_mask_mov_epi32(src, k, simde_mm_srlv_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_srlv_epi32 - #define _mm_mask_srlv_epi32(src, k, a, b) simde_mm_mask_srlv_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_srlv_epi32(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_srlv_epi32(k, a, b); - #else - return simde_mm_maskz_mov_epi32(k, simde_mm_srlv_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_srlv_epi32 - #define _mm_maskz_srlv_epi32(k, a, b) simde_mm_maskz_srlv_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_srlv_epi64(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_srlv_epi64(src, k, a, b); - #else - return simde_mm_mask_mov_epi64(src, k, simde_mm_srlv_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_srlv_epi64 - #define _mm_mask_srlv_epi64(src, k, a, b) simde_mm_mask_srlv_epi64(src, k, a, b) 
-#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_srlv_epi64(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_srlv_epi64(k, a, b); - #else - return simde_mm_maskz_mov_epi64(k, simde_mm_srlv_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_srlv_epi64 - #define _mm_maskz_srlv_epi64(k, a, b) simde_mm_maskz_srlv_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_srlv_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm256_srlv_epi16(a, b); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b), - r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_srlv_epi16(a_.m128i[i], b_.m128i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), (b_.u16 < 16)) & (a_.u16 >> b_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (b_.u16[i] < 16) ? (a_.u16[i] >> b_.u16[i]) : 0; - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm256_srlv_epi16 - #define _mm256_srlv_epi16(a, b) simde_mm256_srlv_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_srlv_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_srlv_epi16(a, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b), - r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_srlv_epi16(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), (b_.u16 < 16)) & (a_.u16 >> b_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (b_.u16[i] < 16) ? (a_.u16[i] >> b_.u16[i]) : 0; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_srlv_epi16 - #define _mm512_srlv_epi16(a, b) simde_mm512_srlv_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_srlv_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_srlv_epi32(a, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b), - r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_srlv_epi32(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), (b_.u32 < 32)) & (a_.u32 >> b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (b_.u32[i] < 32) ? 
(a_.u32[i] >> b_.u32[i]) : 0; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_srlv_epi32 - #define _mm512_srlv_epi32(a, b) simde_mm512_srlv_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_srlv_epi64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_srlv_epi64(a, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b), - r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_srlv_epi64(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), (b_.u64 < 64)) & (a_.u64 >> b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (b_.u64[i] < 64) ? (a_.u64[i] >> b_.u64[i]) : 0; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_srlv_epi64 - #define _mm512_srlv_epi64(a, b) simde_mm512_srlv_epi64(a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SRLV_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/store.h b/ffi-deps/simde/simde/x86/avx512/store.h deleted file mode 100644 index 1f1538b..0000000 --- a/ffi-deps/simde/simde/x86/avx512/store.h +++ /dev/null @@ -1,93 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - */ - -#if !defined(SIMDE_X86_AVX512_STORE_H) -#define SIMDE_X86_AVX512_STORE_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_store_ps (void * mem_addr, simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - _mm512_store_ps(mem_addr, a); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512), &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_store_ps - #define _mm512_store_ps(mem_addr, a) simde_mm512_store_ps(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_store_pd (void * mem_addr, simde__m512d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - _mm512_store_pd(mem_addr, a); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512d), &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_store_pd - #define _mm512_store_pd(mem_addr, a) simde_mm512_store_pd(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_store_si512 (void * mem_addr, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - _mm512_store_si512(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m512i), &a, sizeof(a)); - #endif -} -#define simde_mm512_store_epi8(mem_addr, a) simde_mm512_store_si512(mem_addr, a) -#define simde_mm512_store_epi16(mem_addr, a) simde_mm512_store_si512(mem_addr, a) -#define simde_mm512_store_epi32(mem_addr, a) simde_mm512_store_si512(mem_addr, a) -#define simde_mm512_store_epi64(mem_addr, a) simde_mm512_store_si512(mem_addr, a) -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_store_epi8 - #undef _mm512_store_epi16 - #undef _mm512_store_epi32 - #undef _mm512_store_epi64 - #undef _mm512_store_si512 - #define _mm512_store_si512(mem_addr, a) simde_mm512_store_si512(mem_addr, a) - #define _mm512_store_epi8(mem_addr, a) simde_mm512_store_si512(mem_addr, a) - #define _mm512_store_epi16(mem_addr, a) simde_mm512_store_si512(mem_addr, a) - #define _mm512_store_epi32(mem_addr, a) simde_mm512_store_si512(mem_addr, a) - #define _mm512_store_epi64(mem_addr, a) simde_mm512_store_si512(mem_addr, a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_STORE_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/storeu.h b/ffi-deps/simde/simde/x86/avx512/storeu.h deleted file mode 100644 index e00801f..0000000 --- a/ffi-deps/simde/simde/x86/avx512/storeu.h +++ /dev/null @@ -1,218 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - */ - -#if !defined(SIMDE_X86_AVX512_STOREU_H) -#define SIMDE_X86_AVX512_STOREU_H - -#include "types.h" -#include "mov.h" -#include "setzero.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -#define simde_mm256_storeu_epi8(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) -#define simde_mm256_storeu_epi16(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) -#define simde_mm256_storeu_epi32(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) -#define simde_mm256_storeu_epi64(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_storeu_epi8 - #undef _mm256_storeu_epi16 - #define _mm256_storeu_epi8(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) - #define _mm256_storeu_epi16(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_storeu_epi32 - #undef _mm256_storeu_epi64 - #define _mm256_storeu_epi32(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) - #define _mm256_storeu_epi64(mem_addr, a) simde_mm256_storeu_si256(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm256_mask_storeu_epi16 (void * mem_addr, simde__mmask16 k, simde__m256i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - _mm256_mask_storeu_epi16(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); - #else - const simde__m256i zero = simde_mm256_setzero_si256(); - simde_mm256_storeu_epi16(mem_addr, simde_mm256_mask_mov_epi16(zero, k, a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_storeu_epi16 - #define _mm256_mask_storeu_epi16(mem_addr, k, a) simde_mm256_mask_storeu_epi16(mem_addr, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_storeu_ps (void * mem_addr, simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - _mm512_storeu_ps(mem_addr, a); - #else - simde_memcpy(mem_addr, &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_storeu_ps - #define _mm512_storeu_ps(mem_addr, a) simde_mm512_storeu_ps(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_storeu_pd (void * mem_addr, simde__m512d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - _mm512_storeu_pd(mem_addr, a); - #else - simde_memcpy(mem_addr, &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_storeu_pd - #define _mm512_storeu_pd(mem_addr, a) simde_mm512_storeu_pd(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_storeu_ph (void * mem_addr, simde__m512h a) { - #if defined(SIMDE_X86_AVX512FP16_NATIVE) - _mm512_storeu_ph(mem_addr, a); - #else - simde_memcpy(mem_addr, &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_AVX512FP16_ENABLE_NATIVE_ALIASES) - #undef _mm512_storeu_ph - #define _mm512_storeu_ph(mem_addr, a) simde_mm512_storeu_ph(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_storeu_si512 (void * mem_addr, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - 
_mm512_storeu_si512(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a); - #else - simde_memcpy(mem_addr, &a, sizeof(a)); - #endif -} -#define simde_mm512_storeu_epi8(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) -#define simde_mm512_storeu_epi16(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) -#define simde_mm512_storeu_epi32(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) -#define simde_mm512_storeu_epi64(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_storeu_epi8 - #undef _mm512_storeu_epi16 - #define _mm512_storeu_epi16(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) - #define _mm512_storeu_epi8(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_storeu_epi32 - #undef _mm512_storeu_epi64 - #undef _mm512_storeu_si512 - #define _mm512_storeu_si512(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) - #define _mm512_storeu_epi32(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) - #define _mm512_storeu_epi64(mem_addr, a) simde_mm512_storeu_si512(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_mask_storeu_epi16 (void * mem_addr, simde__mmask32 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - _mm512_mask_storeu_epi16(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); - #else - const simde__m512i zero = simde_mm512_setzero_si512(); - simde_mm512_storeu_epi16(mem_addr, simde_mm512_mask_mov_epi16(zero, k, a)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_storeu_epi16 - #define _mm512_mask_storeu_epi16(mem_addr, k, a) simde_mm512_mask_storeu_epi16(mem_addr, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_mask_storeu_epi32 (void * mem_addr, simde__mmask16 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - _mm512_mask_storeu_epi32(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); - #else - const simde__m512i zero = simde_mm512_setzero_si512(); - simde_mm512_storeu_epi32(mem_addr, simde_mm512_mask_mov_epi32(zero, k, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_storeu_epi32 - #define _mm512_mask_storeu_epi32(mem_addr, k, a) simde_mm512_mask_storeu_epi32(mem_addr, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_mask_storeu_epi64 (void * mem_addr, simde__mmask8 k, simde__m512i a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - _mm512_mask_storeu_epi64(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); - #else - const simde__m512i zero = simde_mm512_setzero_si512(); - simde_mm512_storeu_epi64(mem_addr, simde_mm512_mask_mov_epi64(zero, k, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_storeu_epi64 - #define _mm512_mask_storeu_epi64(mem_addr, k, a) simde_mm512_mask_storeu_epi64(mem_addr, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_mask_storeu_ps (void * mem_addr, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - _mm512_mask_storeu_ps(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); - #else - const simde__m512 zero = simde_mm512_setzero_ps(); - simde_mm512_storeu_ps(mem_addr, simde_mm512_mask_mov_ps(zero, k, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_storeu_ps - #define _mm512_mask_storeu_ps(mem_addr, k, a) simde_mm512_mask_storeu_ps(mem_addr, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm512_mask_storeu_pd (void * mem_addr, simde__mmask8 k, 
simde__m512d a) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - _mm512_mask_storeu_pd(HEDLEY_REINTERPRET_CAST(void*, mem_addr), k, a); - #else - const simde__m512d zero = simde_mm512_setzero_pd(); - simde_mm512_storeu_pd(mem_addr, simde_mm512_mask_mov_pd(zero, k, a)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_storeu_pd - #define _mm512_mask_storeu_pd(mem_addr, k, a) simde_mm512_mask_storeu_pd(mem_addr, k, a) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_STOREU_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/sub.h b/ffi-deps/simde/simde/x86/avx512/sub.h deleted file mode 100644 index 6e44d85..0000000 --- a/ffi-deps/simde/simde/x86/avx512/sub.h +++ /dev/null @@ -1,351 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_SUB_H) -#define SIMDE_X86_AVX512_SUB_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_sub_epi8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_sub_epi8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = a_.i8 - b_.i8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_sub_epi8(a_.m256i[i], b_.m256i[i]); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_sub_epi8 - #define _mm512_sub_epi8(a, b) simde_mm512_sub_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_sub_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_sub_epi8(src, k, a, b); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_sub_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_sub_epi8 - #define _mm512_mask_sub_epi8(src, k, a, b) simde_mm512_mask_sub_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_sub_epi8 (simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_sub_epi8(k, a, b); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_sub_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_sub_epi8 - #define _mm512_maskz_sub_epi8(k, a, b) simde_mm512_maskz_sub_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_sub_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_sub_epi16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 - b_.i16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_sub_epi16(a_.m256i[i], b_.m256i[i]); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_sub_epi16 - #define _mm512_sub_epi16(a, b) simde_mm512_sub_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_sub_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_sub_epi32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 - b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_sub_epi32(a_.m256i[i], b_.m256i[i]); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_sub_epi32 - #define _mm512_sub_epi32(a, b) simde_mm512_sub_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_sub_epi32 
(simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_sub_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_sub_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_sub_epi32 - #define _mm512_mask_sub_epi32(src, k, a, b) simde_mm512_mask_sub_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_sub_epi32(simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_sub_epi32(k, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_sub_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_sub_epi32 - #define _mm512_maskz_sub_epi32(k, a, b) simde_mm512_maskz_sub_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_sub_epi64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_sub_epi64(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 - b_.i64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_sub_epi64(a_.m256i[i], b_.m256i[i]); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_sub_epi64 - #define _mm512_sub_epi64(a, b) simde_mm512_sub_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_sub_epi64 (simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_sub_epi64(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_sub_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_sub_epi64 - #define _mm512_mask_sub_epi64(src, k, a, b) simde_mm512_mask_sub_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_sub_epi64(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_sub_epi64(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_sub_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_sub_epi64 - #define _mm512_maskz_sub_epi64(k, a, b) simde_mm512_maskz_sub_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_sub_ps (simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_sub_ps(a, b); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 - b_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_sub_ps(a_.m256[i], b_.m256[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_sub_ps - #define _mm512_sub_ps(a, b) simde_mm512_sub_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_sub_ps (simde__m512 src, simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_sub_ps(src, k, a, b); - #else - return 
simde_mm512_mask_mov_ps(src, k, simde_mm512_sub_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_sub_ps - #define _mm512_mask_sub_ps(src, k, a, b) simde_mm512_mask_sub_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_sub_ps(simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_sub_ps(k, a, b); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_sub_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_sub_ps - #define _mm512_maskz_sub_ps(k, a, b) simde_mm512_maskz_sub_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_sub_pd (simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_sub_pd(a, b); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 - b_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_sub_pd(a_.m256d[i], b_.m256d[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_sub_pd - #define _mm512_sub_pd(a, b) simde_mm512_sub_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_sub_pd (simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_sub_pd(src, k, a, b); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_sub_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_sub_pd - #define _mm512_mask_sub_pd(src, k, a, b) simde_mm512_mask_sub_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_sub_pd(simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_sub_pd(k, a, b); - #else - return simde_mm512_maskz_mov_pd(k, simde_mm512_sub_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_sub_pd - #define _mm512_maskz_sub_pd(k, a, b) simde_mm512_maskz_sub_pd(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SUB_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/subs.h b/ffi-deps/simde/simde/x86/avx512/subs.h deleted file mode 100644 index 114ecf1..0000000 --- a/ffi-deps/simde/simde/x86/avx512/subs.h +++ /dev/null @@ -1,222 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_SUBS_H) -#define SIMDE_X86_AVX512_SUBS_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_subs_epi8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_subs_epi8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if !defined(HEDLEY_INTEL_VERSION) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_subs_epi8(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = simde_math_subs_i8(a_.i8[i], b_.i8[i]); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_subs_epi8 - #define _mm512_subs_epi8(a, b) simde_mm512_subs_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_subs_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_subs_epi8(src, k, a, b); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_subs_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_subs_epi8 - #define _mm512_mask_subs_epi8(src, k, a, b) simde_mm512_mask_subs_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_subs_epi8 (simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_subs_epi8(k, a, b); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_subs_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_subs_epi8 - #define _mm512_maskz_subs_epi8(k, a, b) simde_mm512_maskz_subs_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_subs_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_subs_epi16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if !defined(HEDLEY_INTEL_VERSION) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_subs_epi16(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = simde_math_subs_i16(a_.i16[i], b_.i16[i]); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_subs_epi16 - #define _mm512_subs_epi16(a, b) simde_mm512_subs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_subs_epu8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_subs_epu8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if 
!defined(HEDLEY_INTEL_VERSION) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_subs_epu8(a_.m128i[i], b_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = simde_math_subs_u8(a_.u8[i], b_.u8[i]); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_subs_epu8 - #define _mm512_subs_epu8(a, b) simde_mm512_subs_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_subs_epu8 (simde__m512i src, simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_subs_epu8(src, k, a, b); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_subs_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_subs_epu8 - #define _mm512_mask_subs_epu8(src, k, a, b) simde_mm512_mask_subs_epu8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_subs_epu8 (simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_subs_epu8(k, a, b); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_subs_epu8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_subs_epu8 - #define _mm512_maskz_subs_epu8(k, a, b) simde_mm512_maskz_subs_epu8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_subs_epu16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_subs_epu16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if !defined(HEDLEY_INTEL_VERSION) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_subs_epu16(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = simde_math_subs_u16(a_.u16[i], b_.u16[i]); - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_subs_epu16 - #define _mm512_subs_epu16(a, b) simde_mm512_subs_epu16(a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_SUBS_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/ternarylogic.h b/ffi-deps/simde/simde/x86/avx512/ternarylogic.h deleted file mode 100644 index c9a2f67..0000000 --- a/ffi-deps/simde/simde/x86/avx512/ternarylogic.h +++ /dev/null @@ -1,3769 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2021 Kunwar Maheep Singh - * 2021 Christopher Moore - */ - -/* The ternarylogic implementation is based on Wojciech Muła's work at - * https://github.com/WojciechMula/ternary-logic */ - -#if !defined(SIMDE_X86_AVX512_TERNARYLOGIC_H) -#define SIMDE_X86_AVX512_TERNARYLOGIC_H - -#include "types.h" -#include "movm.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x00_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, a); - HEDLEY_STATIC_CAST(void, b); - HEDLEY_STATIC_CAST(void, c); - const uint_fast32_t c0 = 0; - return c0; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x01_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | c; - const uint_fast32_t t1 = a | t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x02_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | a; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = c & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x03_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, c); - const uint_fast32_t t0 = b | a; - const uint_fast32_t t1 = ~t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x04_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a | c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = b & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x05_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, b); - const uint_fast32_t t0 = c | a; - const uint_fast32_t t1 = ~t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x06_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = b ^ c; - const uint_fast32_t t2 = t0 & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x07_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & c; - const uint_fast32_t t1 = a | t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x08_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = t0 & b; - const uint_fast32_t t2 = t1 & c; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x09_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = a | t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x0a_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - 
HEDLEY_STATIC_CAST(void, b); - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = c & t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x0b_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = ~b; - const uint_fast32_t t2 = t1 | c; - const uint_fast32_t t3 = t0 & t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x0c_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, c); - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = b & t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x0d_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = ~c; - const uint_fast32_t t2 = t1 | b; - const uint_fast32_t t3 = t0 & t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x0e_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = b | c; - const uint_fast32_t t2 = t0 & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x0f_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, b); - HEDLEY_STATIC_CAST(void, c); - const uint_fast32_t t0 = ~a; - return t0; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x10_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x11_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, a); - const uint_fast32_t t0 = c | b; - const uint_fast32_t t1 = ~t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x12_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = a ^ c; - const uint_fast32_t t2 = t0 & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x13_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & c; - const uint_fast32_t t1 = b | t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x14_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = a ^ b; - const uint_fast32_t t2 = t0 & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x15_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & a; - const uint_fast32_t t1 = c | t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x16_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a & t1; - const uint_fast32_t t3 = ~a; - const uint_fast32_t t4 = b ^ c; - const uint_fast32_t t5 = t3 & t4; - const uint_fast32_t t6 = t2 | t5; - return t6; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x17_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | c; - const uint_fast32_t t1 = b & c; - const uint_fast32_t t2 = (a & t0) | (~a & t1); - const uint_fast32_t t3 = ~t2; - return t3; -} 
- -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x18_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ b; - const uint_fast32_t t1 = a ^ c; - const uint_fast32_t t2 = t0 & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x19_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = b & c; - const uint_fast32_t t2 = a & t1; - const uint_fast32_t t3 = t0 ^ t2; - const uint_fast32_t t4 = ~t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x1a_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & b; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a ^ c; - const uint_fast32_t t3 = t1 & t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x1b_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & c; - const uint_fast32_t t1 = ~b; - const uint_fast32_t t2 = t1 | c; - const uint_fast32_t t3 = t0 ^ t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x1c_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a ^ b; - const uint_fast32_t t3 = t1 & t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x1d_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & b; - const uint_fast32_t t1 = ~c; - const uint_fast32_t t2 = t1 | b; - const uint_fast32_t t3 = t0 ^ t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x1e_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | c; - const uint_fast32_t t1 = a ^ t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x1f_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | c; - const uint_fast32_t t1 = a & t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x20_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = t0 & a; - const uint_fast32_t t2 = t1 & c; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x21_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ c; - const uint_fast32_t t1 = b | t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x22_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, a); - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = c & t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x23_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = ~a; - const uint_fast32_t t2 = t1 | c; - const uint_fast32_t t3 = t0 & t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x24_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ b; - const uint_fast32_t t1 = b ^ c; - const uint_fast32_t t2 = t0 & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x25_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - 
const uint_fast32_t t0 = a & b; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = ~c; - const uint_fast32_t t3 = a ^ t2; - const uint_fast32_t t4 = t1 & t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x26_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & b; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = b ^ c; - const uint_fast32_t t3 = t1 & t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x27_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & c; - const uint_fast32_t t1 = ~a; - const uint_fast32_t t2 = t1 | c; - const uint_fast32_t t3 = t0 ^ t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x28_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ a; - const uint_fast32_t t1 = c & t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x29_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = t0 | c; - const uint_fast32_t t2 = ~a; - const uint_fast32_t t3 = b ^ c; - const uint_fast32_t t4 = t2 ^ t3; - const uint_fast32_t t5 = t1 & t4; - return t5; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x2a_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & a; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = c & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x2b_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & a; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = c & t1; - const uint_fast32_t t3 = ~c; - const uint_fast32_t t4 = b | a; - const uint_fast32_t t5 = ~t4; - const uint_fast32_t t6 = t3 & t5; - const uint_fast32_t t7 = t2 | t6; - return t7; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x2c_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | c; - const uint_fast32_t t1 = a ^ b; - const uint_fast32_t t2 = t0 & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x2d_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = b | t0; - const uint_fast32_t t2 = a ^ t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x2e_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | c; - const uint_fast32_t t1 = a & b; - const uint_fast32_t t2 = t0 ^ t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x2f_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = ~b; - const uint_fast32_t t2 = t1 & c; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x30_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, c); - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = a & t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x31_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = ~c; - const uint_fast32_t t2 = t1 | a; - const uint_fast32_t t3 = t0 & t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES 
-uint_fast32_t -simde_x_ternarylogic_0x32_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = a | c; - const uint_fast32_t t2 = t0 & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x33_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, a); - HEDLEY_STATIC_CAST(void, c); - const uint_fast32_t t0 = ~b; - return t0; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x34_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a ^ b; - const uint_fast32_t t3 = t1 & t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x35_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & b; - const uint_fast32_t t1 = ~c; - const uint_fast32_t t2 = t1 | a; - const uint_fast32_t t3 = t0 ^ t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x36_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a | c; - const uint_fast32_t t1 = b ^ t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x37_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a | c; - const uint_fast32_t t1 = b & t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x38_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a | c; - const uint_fast32_t t1 = a ^ b; - const uint_fast32_t t2 = t0 & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x39_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = t0 | a; - const uint_fast32_t t2 = b ^ t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x3a_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = a & t0; - const uint_fast32_t t2 = ~a; - const uint_fast32_t t3 = t2 & c; - const uint_fast32_t t4 = t1 | t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x3b_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = ~a; - const uint_fast32_t t2 = t1 & c; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x3c_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, c); - const uint_fast32_t t0 = b ^ a; - return t0; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x3d_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ b; - const uint_fast32_t t1 = a | c; - const uint_fast32_t t2 = ~t1; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x3e_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = t0 & c; - const uint_fast32_t t2 = a ^ b; - const uint_fast32_t t3 = t1 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x3f_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, c); - const uint_fast32_t t0 = b & a; - const 
uint_fast32_t t1 = ~t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x40_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = t0 & a; - const uint_fast32_t t2 = t1 & b; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x41_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ a; - const uint_fast32_t t1 = c | t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x42_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ c; - const uint_fast32_t t1 = b ^ c; - const uint_fast32_t t2 = t0 & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x43_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = ~b; - const uint_fast32_t t3 = a ^ t2; - const uint_fast32_t t4 = t1 & t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x44_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, a); - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = b & t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x45_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = ~a; - const uint_fast32_t t2 = t1 | b; - const uint_fast32_t t3 = t0 & t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x46_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = b ^ c; - const uint_fast32_t t3 = t1 & t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x47_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & c; - const uint_fast32_t t1 = ~a; - const uint_fast32_t t2 = t1 | b; - const uint_fast32_t t3 = t0 ^ t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x48_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ c; - const uint_fast32_t t1 = b & t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x49_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = t0 | b; - const uint_fast32_t t2 = ~a; - const uint_fast32_t t3 = b ^ c; - const uint_fast32_t t4 = t2 ^ t3; - const uint_fast32_t t5 = t1 & t4; - return t5; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x4a_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | c; - const uint_fast32_t t1 = a ^ c; - const uint_fast32_t t2 = t0 & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x4b_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = t0 | c; - const uint_fast32_t t2 = a ^ t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x4c_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = b & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t 
-simde_x_ternarylogic_0x4d_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = b & t1; - const uint_fast32_t t3 = ~b; - const uint_fast32_t t4 = a | c; - const uint_fast32_t t5 = ~t4; - const uint_fast32_t t6 = t3 & t5; - const uint_fast32_t t7 = t2 | t6; - return t7; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x4e_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = c & t0; - const uint_fast32_t t2 = ~c; - const uint_fast32_t t3 = t2 & b; - const uint_fast32_t t4 = t1 | t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x4f_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = ~c; - const uint_fast32_t t2 = b & t1; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x50_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, b); - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = a & t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x51_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = ~b; - const uint_fast32_t t2 = t1 | a; - const uint_fast32_t t3 = t0 & t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x52_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a ^ c; - const uint_fast32_t t3 = t1 & t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x53_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & c; - const uint_fast32_t t1 = ~b; - const uint_fast32_t t2 = t1 | a; - const uint_fast32_t t3 = t0 ^ t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x54_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = a | b; - const uint_fast32_t t2 = t0 & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x55_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, a); - HEDLEY_STATIC_CAST(void, b); - const uint_fast32_t t0 = ~c; - return t0; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x56_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | a; - const uint_fast32_t t1 = c ^ t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x57_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | a; - const uint_fast32_t t1 = c & t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x58_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a | b; - const uint_fast32_t t1 = a ^ c; - const uint_fast32_t t2 = t0 & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x59_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = t0 | a; - const uint_fast32_t t2 = c ^ t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t 
-simde_x_ternarylogic_0x5a_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, b); - const uint_fast32_t t0 = c ^ a; - return t0; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x5b_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a | b; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a ^ c; - const uint_fast32_t t3 = t1 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x5c_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = a & t0; - const uint_fast32_t t2 = ~a; - const uint_fast32_t t3 = t2 & b; - const uint_fast32_t t4 = t1 | t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x5d_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = ~a; - const uint_fast32_t t2 = t1 & b; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x5e_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = t0 & b; - const uint_fast32_t t2 = a ^ c; - const uint_fast32_t t3 = t1 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x5f_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, b); - const uint_fast32_t t0 = c & a; - const uint_fast32_t t1 = ~t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x60_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = a & t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x61_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = t0 | a; - const uint_fast32_t t2 = ~b; - const uint_fast32_t t3 = a ^ c; - const uint_fast32_t t4 = t2 ^ t3; - const uint_fast32_t t5 = t1 & t4; - return t5; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x62_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a | c; - const uint_fast32_t t1 = b ^ c; - const uint_fast32_t t2 = t0 & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x63_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = t0 | c; - const uint_fast32_t t2 = b ^ t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x64_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a | b; - const uint_fast32_t t1 = b ^ c; - const uint_fast32_t t2 = t0 & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x65_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = t0 | b; - const uint_fast32_t t2 = c ^ t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x66_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, a); - const uint_fast32_t t0 = c ^ b; - return t0; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x67_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = a | b; - const 
uint_fast32_t t2 = ~t1; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x68_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = a & t0; - const uint_fast32_t t2 = ~a; - const uint_fast32_t t3 = b & c; - const uint_fast32_t t4 = t2 & t3; - const uint_fast32_t t5 = t1 | t4; - return t5; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x69_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = a ^ t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x6a_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & a; - const uint_fast32_t t1 = c ^ t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x6b_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = t0 & c; - const uint_fast32_t c1 = ~HEDLEY_STATIC_CAST(uint_fast32_t, 0); - const uint_fast32_t t2 = a ^ c1; - const uint_fast32_t t3 = b ^ c; - const uint_fast32_t t4 = t2 ^ t3; - const uint_fast32_t t5 = t1 | t4; - return t5; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x6c_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & c; - const uint_fast32_t t1 = b ^ t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x6d_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = t0 & b; - const uint_fast32_t c1 = ~HEDLEY_STATIC_CAST(uint_fast32_t, 0); - const uint_fast32_t t2 = a ^ c1; - const uint_fast32_t t3 = b ^ c; - const uint_fast32_t t4 = t2 ^ t3; - const uint_fast32_t t5 = t1 | t4; - return t5; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x6e_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = t0 & b; - const uint_fast32_t t2 = b ^ c; - const uint_fast32_t t3 = t1 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x6f_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = b ^ c; - const uint_fast32_t t2 = t0 | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x70_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x71_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = b ^ c; - const uint_fast32_t t3 = a & t2; - const uint_fast32_t t4 = t1 | t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x72_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = c & t0; - const uint_fast32_t t2 = ~c; - const uint_fast32_t t3 = t2 & a; - const uint_fast32_t t4 = t1 | t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x73_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = ~c; - const 
uint_fast32_t t2 = a & t1; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x74_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = b & t0; - const uint_fast32_t t2 = ~b; - const uint_fast32_t t3 = t2 & a; - const uint_fast32_t t4 = t1 | t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x75_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = ~b; - const uint_fast32_t t2 = a & t1; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x76_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = t0 & a; - const uint_fast32_t t2 = b ^ c; - const uint_fast32_t t3 = t1 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x77_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, a); - const uint_fast32_t t0 = c & b; - const uint_fast32_t t1 = ~t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x78_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & c; - const uint_fast32_t t1 = a ^ t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x79_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = t0 & a; - const uint_fast32_t c1 = ~HEDLEY_STATIC_CAST(uint_fast32_t, 0); - const uint_fast32_t t2 = b ^ c1; - const uint_fast32_t t3 = a ^ c; - const uint_fast32_t t4 = t2 ^ t3; - const uint_fast32_t t5 = t1 | t4; - return t5; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x7a_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = t0 & a; - const uint_fast32_t t2 = a ^ c; - const uint_fast32_t t3 = t1 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x7b_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = a ^ c; - const uint_fast32_t t2 = t0 | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x7c_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = t0 & a; - const uint_fast32_t t2 = a ^ b; - const uint_fast32_t t3 = t1 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x7d_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = a ^ b; - const uint_fast32_t t2 = t0 | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x7e_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ b; - const uint_fast32_t t1 = a ^ c; - const uint_fast32_t t2 = t0 | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x7f_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & b; - const uint_fast32_t t1 = t0 & c; - const uint_fast32_t c1 = ~HEDLEY_STATIC_CAST(uint_fast32_t, 0); - const uint_fast32_t t2 = t1 ^ c1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x80_impl_(uint_fast32_t a, 
uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & c; - const uint_fast32_t t1 = a & t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x81_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = ~b; - const uint_fast32_t t3 = a ^ t2; - const uint_fast32_t t4 = t1 & t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x82_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ a; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = c & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x83_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ b; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = ~a; - const uint_fast32_t t3 = t2 | c; - const uint_fast32_t t4 = t1 & t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x84_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = b & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x85_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = ~c; - const uint_fast32_t t3 = t2 | b; - const uint_fast32_t t4 = t1 & t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x86_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | c; - const uint_fast32_t t1 = a ^ b; - const uint_fast32_t t2 = c ^ t1; - const uint_fast32_t t3 = t0 & t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x87_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & c; - const uint_fast32_t t1 = a ^ t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x88_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, a); - const uint_fast32_t t0 = c & b; - return t0; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x89_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = ~a; - const uint_fast32_t t3 = t2 | b; - const uint_fast32_t t4 = t1 & t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x8a_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = t0 | b; - const uint_fast32_t t2 = c & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x8b_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = t0 | b; - const uint_fast32_t t2 = ~b; - const uint_fast32_t t3 = t2 | c; - const uint_fast32_t t4 = t1 & t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x8c_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = t0 | c; - const uint_fast32_t t2 = b & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x8d_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - 
const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = t0 | b; - const uint_fast32_t t2 = ~a; - const uint_fast32_t t3 = t2 | c; - const uint_fast32_t t4 = t1 & t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x8e_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & c; - const uint_fast32_t t1 = ~a; - const uint_fast32_t t2 = b ^ c; - const uint_fast32_t t3 = t1 & t2; - const uint_fast32_t t4 = t0 | t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x8f_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = b & c; - const uint_fast32_t t2 = t0 | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x90_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x91_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = ~b; - const uint_fast32_t t3 = t2 | a; - const uint_fast32_t t4 = t1 & t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x92_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a | c; - const uint_fast32_t t1 = a ^ b; - const uint_fast32_t t2 = c ^ t1; - const uint_fast32_t t3 = t0 & t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x93_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & c; - const uint_fast32_t t1 = b ^ t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x94_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a | b; - const uint_fast32_t t1 = a ^ c; - const uint_fast32_t t2 = b ^ t1; - const uint_fast32_t t3 = t0 & t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x95_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & a; - const uint_fast32_t t1 = c ^ t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x96_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = a ^ t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x97_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = t0 | a; - const uint_fast32_t t2 = t1 ^ a; - const uint_fast32_t t3 = b ^ c; - const uint_fast32_t t4 = a ^ t3; - const uint_fast32_t t5 = t2 | t4; - return t5; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x98_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a | b; - const uint_fast32_t t3 = t1 & t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x99_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, a); - const uint_fast32_t t0 = c ^ b; - const uint_fast32_t t1 = ~t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x9a_impl_(uint_fast32_t 
a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = t0 & a; - const uint_fast32_t t2 = t1 ^ c; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x9b_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = ~a; - const uint_fast32_t t3 = t2 & c; - const uint_fast32_t t4 = t1 | t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x9c_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = t0 & a; - const uint_fast32_t t2 = t1 ^ b; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x9d_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = ~a; - const uint_fast32_t t3 = t2 & b; - const uint_fast32_t t4 = t1 | t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x9e_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & c; - const uint_fast32_t t1 = a ^ b; - const uint_fast32_t t2 = c ^ t1; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0x9f_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = a & t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xa0_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, b); - const uint_fast32_t t0 = c & a; - return t0; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xa1_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = ~b; - const uint_fast32_t t3 = t2 | a; - const uint_fast32_t t4 = t1 & t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xa2_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = a | t0; - const uint_fast32_t t2 = c & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xa3_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = t0 | a; - const uint_fast32_t t2 = ~a; - const uint_fast32_t t3 = t2 | c; - const uint_fast32_t t4 = t1 & t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xa4_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a | b; - const uint_fast32_t t3 = t1 & t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xa5_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, b); - const uint_fast32_t t0 = c ^ a; - const uint_fast32_t t1 = ~t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xa6_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = t0 & b; - const uint_fast32_t t2 = t1 ^ c; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xa7_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - 
const uint_fast32_t t0 = a ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = ~b; - const uint_fast32_t t3 = t2 & c; - const uint_fast32_t t4 = t1 | t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xa8_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a | b; - const uint_fast32_t t1 = c & t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xa9_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | a; - const uint_fast32_t t1 = c ^ t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xaa_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, a); - HEDLEY_STATIC_CAST(void, b); - return c; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xab_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | a; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = c | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xac_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & c; - const uint_fast32_t t1 = ~a; - const uint_fast32_t t2 = t1 & b; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xad_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = b & c; - const uint_fast32_t t3 = t1 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xae_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = t0 & b; - const uint_fast32_t t2 = t1 | c; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xaf_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, b); - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = c | t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xb0_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = t0 | c; - const uint_fast32_t t2 = a & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xb1_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = t0 | a; - const uint_fast32_t t2 = ~b; - const uint_fast32_t t3 = t2 | c; - const uint_fast32_t t4 = t1 & t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xb2_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & c; - const uint_fast32_t t1 = b & t0; - const uint_fast32_t t2 = ~b; - const uint_fast32_t t3 = a | c; - const uint_fast32_t t4 = t2 & t3; - const uint_fast32_t t5 = t1 | t4; - return t5; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xb3_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = a & c; - const uint_fast32_t t2 = t0 | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xb4_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = t0 & b; - const uint_fast32_t t2 = t1 ^ a; - return 
t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xb5_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = ~b; - const uint_fast32_t t3 = t2 & a; - const uint_fast32_t t4 = t1 | t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xb6_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & c; - const uint_fast32_t t1 = a ^ b; - const uint_fast32_t t2 = c ^ t1; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xb7_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ c; - const uint_fast32_t t1 = b & t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xb8_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & c; - const uint_fast32_t t1 = ~b; - const uint_fast32_t t2 = t1 & a; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xb9_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a & c; - const uint_fast32_t t3 = t1 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xba_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = t0 & a; - const uint_fast32_t t2 = t1 | c; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xbb_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, a); - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = c | t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xbc_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & c; - const uint_fast32_t t1 = a ^ b; - const uint_fast32_t t2 = t0 | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xbd_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a ^ b; - const uint_fast32_t t3 = t1 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xbe_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ a; - const uint_fast32_t t1 = c | t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xbf_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & a; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = c | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xc0_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, c); - const uint_fast32_t t0 = b & a; - return t0; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xc1_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ b; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = ~c; - const uint_fast32_t t3 = t2 | a; - const uint_fast32_t t4 = t1 & t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xc2_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) 
{ - const uint_fast32_t t0 = a ^ b; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a | c; - const uint_fast32_t t3 = t1 & t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xc3_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, c); - const uint_fast32_t t0 = b ^ a; - const uint_fast32_t t1 = ~t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xc4_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = t0 | a; - const uint_fast32_t t2 = b & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xc5_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = t0 | a; - const uint_fast32_t t2 = ~a; - const uint_fast32_t t3 = t2 | b; - const uint_fast32_t t4 = t1 & t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xc6_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = t0 & c; - const uint_fast32_t t2 = t1 ^ b; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xc7_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ b; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = ~c; - const uint_fast32_t t3 = t2 & b; - const uint_fast32_t t4 = t1 | t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xc8_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a | c; - const uint_fast32_t t1 = b & t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xc9_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a | c; - const uint_fast32_t t1 = b ^ t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xca_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & b; - const uint_fast32_t t1 = ~a; - const uint_fast32_t t2 = t1 & c; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xcb_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ b; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = b & c; - const uint_fast32_t t3 = t1 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xcc_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, a); - HEDLEY_STATIC_CAST(void, c); - return b; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xcd_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a | c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = b | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xce_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = t0 & c; - const uint_fast32_t t2 = t1 | b; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xcf_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, c); - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = b | t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t 
-simde_x_ternarylogic_0xd0_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = b | t0; - const uint_fast32_t t2 = a & t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xd1_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a & b; - const uint_fast32_t t3 = t1 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xd2_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = t0 & c; - const uint_fast32_t t2 = t1 ^ a; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xd3_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ b; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = ~c; - const uint_fast32_t t3 = t2 & a; - const uint_fast32_t t4 = t1 | t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xd4_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = b & t0; - const uint_fast32_t t2 = b ^ c; - const uint_fast32_t t3 = ~t2; - const uint_fast32_t t4 = a & t3; - const uint_fast32_t t5 = t1 | t4; - return t5; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xd5_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = a & b; - const uint_fast32_t t2 = t0 | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xd6_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & b; - const uint_fast32_t t1 = a ^ c; - const uint_fast32_t t2 = b ^ t1; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xd7_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ a; - const uint_fast32_t t1 = c & t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xd8_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = c & b; - const uint_fast32_t t1 = ~c; - const uint_fast32_t t2 = t1 & a; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xd9_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a & b; - const uint_fast32_t t3 = t1 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xda_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & b; - const uint_fast32_t t1 = a ^ c; - const uint_fast32_t t2 = t0 | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xdb_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ b; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a ^ c; - const uint_fast32_t t3 = t1 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xdc_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = t0 & a; - const uint_fast32_t t2 = t1 | b; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES 
-uint_fast32_t -simde_x_ternarylogic_0xdd_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, a); - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = b | t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xde_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ c; - const uint_fast32_t t1 = b | t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xdf_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = b | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xe0_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | c; - const uint_fast32_t t1 = a & t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xe1_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | c; - const uint_fast32_t t1 = a ^ t0; - const uint_fast32_t t2 = ~t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xe2_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & a; - const uint_fast32_t t1 = ~b; - const uint_fast32_t t2 = t1 & c; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xe3_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ b; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a & c; - const uint_fast32_t t3 = t1 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xe4_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = c & a; - const uint_fast32_t t1 = ~c; - const uint_fast32_t t2 = t1 & b; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xe5_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a & b; - const uint_fast32_t t3 = t1 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xe6_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & b; - const uint_fast32_t t1 = b ^ c; - const uint_fast32_t t2 = t0 | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xe7_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = ~a; - const uint_fast32_t t2 = t1 ^ c; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xe8_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & c; - const uint_fast32_t t1 = b ^ c; - const uint_fast32_t t2 = a & t1; - const uint_fast32_t t3 = t0 | t2; - return t3; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xe9_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = b ^ c; - const uint_fast32_t t2 = t0 ^ t1; - const uint_fast32_t t3 = a & b; - const uint_fast32_t t4 = t2 | t3; - return t4; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xea_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 
= b & a; - const uint_fast32_t t1 = c | t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xeb_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ a; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = c | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xec_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a & c; - const uint_fast32_t t1 = b | t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xed_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = a ^ c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = b | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xee_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, a); - const uint_fast32_t t0 = c | b; - return t0; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xef_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~a; - const uint_fast32_t t1 = b | c; - const uint_fast32_t t2 = t0 | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xf0_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, b); - HEDLEY_STATIC_CAST(void, c); - return a; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xf1_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xf2_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = t0 & c; - const uint_fast32_t t2 = t1 | a; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xf3_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, c); - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = a | t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xf4_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = t0 & b; - const uint_fast32_t t2 = t1 | a; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xf5_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, b); - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = a | t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xf6_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = a | t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xf7_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & c; - const uint_fast32_t t1 = ~t0; - const uint_fast32_t t2 = a | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xf8_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b & c; - const uint_fast32_t t1 = a | t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xf9_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b ^ c; - const uint_fast32_t t1 = ~t0; - const 
uint_fast32_t t2 = a | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xfa_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, b); - const uint_fast32_t t0 = c | a; - return t0; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xfb_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~b; - const uint_fast32_t t1 = t0 | c; - const uint_fast32_t t2 = a | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xfc_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, c); - const uint_fast32_t t0 = b | a; - return t0; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xfd_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = ~c; - const uint_fast32_t t1 = a | b; - const uint_fast32_t t2 = t0 | t1; - return t2; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xfe_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - const uint_fast32_t t0 = b | c; - const uint_fast32_t t1 = a | t0; - return t1; -} - -SIMDE_FUNCTION_ATTRIBUTES -uint_fast32_t -simde_x_ternarylogic_0xff_impl_(uint_fast32_t a, uint_fast32_t b, uint_fast32_t c) { - HEDLEY_STATIC_CAST(void, a); - HEDLEY_STATIC_CAST(void, b); - HEDLEY_STATIC_CAST(void, c); - const uint_fast32_t c1 = ~HEDLEY_STATIC_CAST(uint_fast32_t, 0); - return c1; -} - -#define SIMDE_X_TERNARYLOGIC_CASE(value) \ - case value: \ - SIMDE_VECTORIZE \ - for (size_t i = 0 ; i < (sizeof(r_.u32f) / sizeof(r_.u32f[0])) ; i++) { \ - r_.u32f[i] = HEDLEY_CONCAT3(simde_x_ternarylogic_, value, _impl_)(a_.u32f[i], b_.u32f[i], c_.u32f[i]); \ - } \ - break; - -#define SIMDE_X_TERNARYLOGIC_SWITCH(value) \ - switch(value) { \ - SIMDE_X_TERNARYLOGIC_CASE(0x00) \ - SIMDE_X_TERNARYLOGIC_CASE(0x01) \ - SIMDE_X_TERNARYLOGIC_CASE(0x02) \ - SIMDE_X_TERNARYLOGIC_CASE(0x03) \ - SIMDE_X_TERNARYLOGIC_CASE(0x04) \ - SIMDE_X_TERNARYLOGIC_CASE(0x05) \ - SIMDE_X_TERNARYLOGIC_CASE(0x06) \ - SIMDE_X_TERNARYLOGIC_CASE(0x07) \ - SIMDE_X_TERNARYLOGIC_CASE(0x08) \ - SIMDE_X_TERNARYLOGIC_CASE(0x09) \ - SIMDE_X_TERNARYLOGIC_CASE(0x0a) \ - SIMDE_X_TERNARYLOGIC_CASE(0x0b) \ - SIMDE_X_TERNARYLOGIC_CASE(0x0c) \ - SIMDE_X_TERNARYLOGIC_CASE(0x0d) \ - SIMDE_X_TERNARYLOGIC_CASE(0x0e) \ - SIMDE_X_TERNARYLOGIC_CASE(0x0f) \ - SIMDE_X_TERNARYLOGIC_CASE(0x10) \ - SIMDE_X_TERNARYLOGIC_CASE(0x11) \ - SIMDE_X_TERNARYLOGIC_CASE(0x12) \ - SIMDE_X_TERNARYLOGIC_CASE(0x13) \ - SIMDE_X_TERNARYLOGIC_CASE(0x14) \ - SIMDE_X_TERNARYLOGIC_CASE(0x15) \ - SIMDE_X_TERNARYLOGIC_CASE(0x16) \ - SIMDE_X_TERNARYLOGIC_CASE(0x17) \ - SIMDE_X_TERNARYLOGIC_CASE(0x18) \ - SIMDE_X_TERNARYLOGIC_CASE(0x19) \ - SIMDE_X_TERNARYLOGIC_CASE(0x1a) \ - SIMDE_X_TERNARYLOGIC_CASE(0x1b) \ - SIMDE_X_TERNARYLOGIC_CASE(0x1c) \ - SIMDE_X_TERNARYLOGIC_CASE(0x1d) \ - SIMDE_X_TERNARYLOGIC_CASE(0x1e) \ - SIMDE_X_TERNARYLOGIC_CASE(0x1f) \ - SIMDE_X_TERNARYLOGIC_CASE(0x20) \ - SIMDE_X_TERNARYLOGIC_CASE(0x21) \ - SIMDE_X_TERNARYLOGIC_CASE(0x22) \ - SIMDE_X_TERNARYLOGIC_CASE(0x23) \ - SIMDE_X_TERNARYLOGIC_CASE(0x24) \ - SIMDE_X_TERNARYLOGIC_CASE(0x25) \ - SIMDE_X_TERNARYLOGIC_CASE(0x26) \ - SIMDE_X_TERNARYLOGIC_CASE(0x27) \ - SIMDE_X_TERNARYLOGIC_CASE(0x28) \ - SIMDE_X_TERNARYLOGIC_CASE(0x29) \ - SIMDE_X_TERNARYLOGIC_CASE(0x2a) \ - SIMDE_X_TERNARYLOGIC_CASE(0x2b) \ - SIMDE_X_TERNARYLOGIC_CASE(0x2c) \ - SIMDE_X_TERNARYLOGIC_CASE(0x2d) \ - SIMDE_X_TERNARYLOGIC_CASE(0x2e) \ - SIMDE_X_TERNARYLOGIC_CASE(0x2f) \ 
- SIMDE_X_TERNARYLOGIC_CASE(0x30) \ - SIMDE_X_TERNARYLOGIC_CASE(0x31) \ - SIMDE_X_TERNARYLOGIC_CASE(0x32) \ - SIMDE_X_TERNARYLOGIC_CASE(0x33) \ - SIMDE_X_TERNARYLOGIC_CASE(0x34) \ - SIMDE_X_TERNARYLOGIC_CASE(0x35) \ - SIMDE_X_TERNARYLOGIC_CASE(0x36) \ - SIMDE_X_TERNARYLOGIC_CASE(0x37) \ - SIMDE_X_TERNARYLOGIC_CASE(0x38) \ - SIMDE_X_TERNARYLOGIC_CASE(0x39) \ - SIMDE_X_TERNARYLOGIC_CASE(0x3a) \ - SIMDE_X_TERNARYLOGIC_CASE(0x3b) \ - SIMDE_X_TERNARYLOGIC_CASE(0x3c) \ - SIMDE_X_TERNARYLOGIC_CASE(0x3d) \ - SIMDE_X_TERNARYLOGIC_CASE(0x3e) \ - SIMDE_X_TERNARYLOGIC_CASE(0x3f) \ - SIMDE_X_TERNARYLOGIC_CASE(0x40) \ - SIMDE_X_TERNARYLOGIC_CASE(0x41) \ - SIMDE_X_TERNARYLOGIC_CASE(0x42) \ - SIMDE_X_TERNARYLOGIC_CASE(0x43) \ - SIMDE_X_TERNARYLOGIC_CASE(0x44) \ - SIMDE_X_TERNARYLOGIC_CASE(0x45) \ - SIMDE_X_TERNARYLOGIC_CASE(0x46) \ - SIMDE_X_TERNARYLOGIC_CASE(0x47) \ - SIMDE_X_TERNARYLOGIC_CASE(0x48) \ - SIMDE_X_TERNARYLOGIC_CASE(0x49) \ - SIMDE_X_TERNARYLOGIC_CASE(0x4a) \ - SIMDE_X_TERNARYLOGIC_CASE(0x4b) \ - SIMDE_X_TERNARYLOGIC_CASE(0x4c) \ - SIMDE_X_TERNARYLOGIC_CASE(0x4d) \ - SIMDE_X_TERNARYLOGIC_CASE(0x4e) \ - SIMDE_X_TERNARYLOGIC_CASE(0x4f) \ - SIMDE_X_TERNARYLOGIC_CASE(0x50) \ - SIMDE_X_TERNARYLOGIC_CASE(0x51) \ - SIMDE_X_TERNARYLOGIC_CASE(0x52) \ - SIMDE_X_TERNARYLOGIC_CASE(0x53) \ - SIMDE_X_TERNARYLOGIC_CASE(0x54) \ - SIMDE_X_TERNARYLOGIC_CASE(0x55) \ - SIMDE_X_TERNARYLOGIC_CASE(0x56) \ - SIMDE_X_TERNARYLOGIC_CASE(0x57) \ - SIMDE_X_TERNARYLOGIC_CASE(0x58) \ - SIMDE_X_TERNARYLOGIC_CASE(0x59) \ - SIMDE_X_TERNARYLOGIC_CASE(0x5a) \ - SIMDE_X_TERNARYLOGIC_CASE(0x5b) \ - SIMDE_X_TERNARYLOGIC_CASE(0x5c) \ - SIMDE_X_TERNARYLOGIC_CASE(0x5d) \ - SIMDE_X_TERNARYLOGIC_CASE(0x5e) \ - SIMDE_X_TERNARYLOGIC_CASE(0x5f) \ - SIMDE_X_TERNARYLOGIC_CASE(0x60) \ - SIMDE_X_TERNARYLOGIC_CASE(0x61) \ - SIMDE_X_TERNARYLOGIC_CASE(0x62) \ - SIMDE_X_TERNARYLOGIC_CASE(0x63) \ - SIMDE_X_TERNARYLOGIC_CASE(0x64) \ - SIMDE_X_TERNARYLOGIC_CASE(0x65) \ - SIMDE_X_TERNARYLOGIC_CASE(0x66) \ - SIMDE_X_TERNARYLOGIC_CASE(0x67) \ - SIMDE_X_TERNARYLOGIC_CASE(0x68) \ - SIMDE_X_TERNARYLOGIC_CASE(0x69) \ - SIMDE_X_TERNARYLOGIC_CASE(0x6a) \ - SIMDE_X_TERNARYLOGIC_CASE(0x6b) \ - SIMDE_X_TERNARYLOGIC_CASE(0x6c) \ - SIMDE_X_TERNARYLOGIC_CASE(0x6d) \ - SIMDE_X_TERNARYLOGIC_CASE(0x6e) \ - SIMDE_X_TERNARYLOGIC_CASE(0x6f) \ - SIMDE_X_TERNARYLOGIC_CASE(0x70) \ - SIMDE_X_TERNARYLOGIC_CASE(0x71) \ - SIMDE_X_TERNARYLOGIC_CASE(0x72) \ - SIMDE_X_TERNARYLOGIC_CASE(0x73) \ - SIMDE_X_TERNARYLOGIC_CASE(0x74) \ - SIMDE_X_TERNARYLOGIC_CASE(0x75) \ - SIMDE_X_TERNARYLOGIC_CASE(0x76) \ - SIMDE_X_TERNARYLOGIC_CASE(0x77) \ - SIMDE_X_TERNARYLOGIC_CASE(0x78) \ - SIMDE_X_TERNARYLOGIC_CASE(0x79) \ - SIMDE_X_TERNARYLOGIC_CASE(0x7a) \ - SIMDE_X_TERNARYLOGIC_CASE(0x7b) \ - SIMDE_X_TERNARYLOGIC_CASE(0x7c) \ - SIMDE_X_TERNARYLOGIC_CASE(0x7d) \ - SIMDE_X_TERNARYLOGIC_CASE(0x7e) \ - SIMDE_X_TERNARYLOGIC_CASE(0x7f) \ - SIMDE_X_TERNARYLOGIC_CASE(0x80) \ - SIMDE_X_TERNARYLOGIC_CASE(0x81) \ - SIMDE_X_TERNARYLOGIC_CASE(0x82) \ - SIMDE_X_TERNARYLOGIC_CASE(0x83) \ - SIMDE_X_TERNARYLOGIC_CASE(0x84) \ - SIMDE_X_TERNARYLOGIC_CASE(0x85) \ - SIMDE_X_TERNARYLOGIC_CASE(0x86) \ - SIMDE_X_TERNARYLOGIC_CASE(0x87) \ - SIMDE_X_TERNARYLOGIC_CASE(0x88) \ - SIMDE_X_TERNARYLOGIC_CASE(0x89) \ - SIMDE_X_TERNARYLOGIC_CASE(0x8a) \ - SIMDE_X_TERNARYLOGIC_CASE(0x8b) \ - SIMDE_X_TERNARYLOGIC_CASE(0x8c) \ - SIMDE_X_TERNARYLOGIC_CASE(0x8d) \ - SIMDE_X_TERNARYLOGIC_CASE(0x8e) \ - SIMDE_X_TERNARYLOGIC_CASE(0x8f) \ - SIMDE_X_TERNARYLOGIC_CASE(0x90) \ - SIMDE_X_TERNARYLOGIC_CASE(0x91) \ - 
SIMDE_X_TERNARYLOGIC_CASE(0x92) \ - SIMDE_X_TERNARYLOGIC_CASE(0x93) \ - SIMDE_X_TERNARYLOGIC_CASE(0x94) \ - SIMDE_X_TERNARYLOGIC_CASE(0x95) \ - SIMDE_X_TERNARYLOGIC_CASE(0x96) \ - SIMDE_X_TERNARYLOGIC_CASE(0x97) \ - SIMDE_X_TERNARYLOGIC_CASE(0x98) \ - SIMDE_X_TERNARYLOGIC_CASE(0x99) \ - SIMDE_X_TERNARYLOGIC_CASE(0x9a) \ - SIMDE_X_TERNARYLOGIC_CASE(0x9b) \ - SIMDE_X_TERNARYLOGIC_CASE(0x9c) \ - SIMDE_X_TERNARYLOGIC_CASE(0x9d) \ - SIMDE_X_TERNARYLOGIC_CASE(0x9e) \ - SIMDE_X_TERNARYLOGIC_CASE(0x9f) \ - SIMDE_X_TERNARYLOGIC_CASE(0xa0) \ - SIMDE_X_TERNARYLOGIC_CASE(0xa1) \ - SIMDE_X_TERNARYLOGIC_CASE(0xa2) \ - SIMDE_X_TERNARYLOGIC_CASE(0xa3) \ - SIMDE_X_TERNARYLOGIC_CASE(0xa4) \ - SIMDE_X_TERNARYLOGIC_CASE(0xa5) \ - SIMDE_X_TERNARYLOGIC_CASE(0xa6) \ - SIMDE_X_TERNARYLOGIC_CASE(0xa7) \ - SIMDE_X_TERNARYLOGIC_CASE(0xa8) \ - SIMDE_X_TERNARYLOGIC_CASE(0xa9) \ - SIMDE_X_TERNARYLOGIC_CASE(0xaa) \ - SIMDE_X_TERNARYLOGIC_CASE(0xab) \ - SIMDE_X_TERNARYLOGIC_CASE(0xac) \ - SIMDE_X_TERNARYLOGIC_CASE(0xad) \ - SIMDE_X_TERNARYLOGIC_CASE(0xae) \ - SIMDE_X_TERNARYLOGIC_CASE(0xaf) \ - SIMDE_X_TERNARYLOGIC_CASE(0xb0) \ - SIMDE_X_TERNARYLOGIC_CASE(0xb1) \ - SIMDE_X_TERNARYLOGIC_CASE(0xb2) \ - SIMDE_X_TERNARYLOGIC_CASE(0xb3) \ - SIMDE_X_TERNARYLOGIC_CASE(0xb4) \ - SIMDE_X_TERNARYLOGIC_CASE(0xb5) \ - SIMDE_X_TERNARYLOGIC_CASE(0xb6) \ - SIMDE_X_TERNARYLOGIC_CASE(0xb7) \ - SIMDE_X_TERNARYLOGIC_CASE(0xb8) \ - SIMDE_X_TERNARYLOGIC_CASE(0xb9) \ - SIMDE_X_TERNARYLOGIC_CASE(0xba) \ - SIMDE_X_TERNARYLOGIC_CASE(0xbb) \ - SIMDE_X_TERNARYLOGIC_CASE(0xbc) \ - SIMDE_X_TERNARYLOGIC_CASE(0xbd) \ - SIMDE_X_TERNARYLOGIC_CASE(0xbe) \ - SIMDE_X_TERNARYLOGIC_CASE(0xbf) \ - SIMDE_X_TERNARYLOGIC_CASE(0xc0) \ - SIMDE_X_TERNARYLOGIC_CASE(0xc1) \ - SIMDE_X_TERNARYLOGIC_CASE(0xc2) \ - SIMDE_X_TERNARYLOGIC_CASE(0xc3) \ - SIMDE_X_TERNARYLOGIC_CASE(0xc4) \ - SIMDE_X_TERNARYLOGIC_CASE(0xc5) \ - SIMDE_X_TERNARYLOGIC_CASE(0xc6) \ - SIMDE_X_TERNARYLOGIC_CASE(0xc7) \ - SIMDE_X_TERNARYLOGIC_CASE(0xc8) \ - SIMDE_X_TERNARYLOGIC_CASE(0xc9) \ - SIMDE_X_TERNARYLOGIC_CASE(0xca) \ - SIMDE_X_TERNARYLOGIC_CASE(0xcb) \ - SIMDE_X_TERNARYLOGIC_CASE(0xcc) \ - SIMDE_X_TERNARYLOGIC_CASE(0xcd) \ - SIMDE_X_TERNARYLOGIC_CASE(0xce) \ - SIMDE_X_TERNARYLOGIC_CASE(0xcf) \ - SIMDE_X_TERNARYLOGIC_CASE(0xd0) \ - SIMDE_X_TERNARYLOGIC_CASE(0xd1) \ - SIMDE_X_TERNARYLOGIC_CASE(0xd2) \ - SIMDE_X_TERNARYLOGIC_CASE(0xd3) \ - SIMDE_X_TERNARYLOGIC_CASE(0xd4) \ - SIMDE_X_TERNARYLOGIC_CASE(0xd5) \ - SIMDE_X_TERNARYLOGIC_CASE(0xd6) \ - SIMDE_X_TERNARYLOGIC_CASE(0xd7) \ - SIMDE_X_TERNARYLOGIC_CASE(0xd8) \ - SIMDE_X_TERNARYLOGIC_CASE(0xd9) \ - SIMDE_X_TERNARYLOGIC_CASE(0xda) \ - SIMDE_X_TERNARYLOGIC_CASE(0xdb) \ - SIMDE_X_TERNARYLOGIC_CASE(0xdc) \ - SIMDE_X_TERNARYLOGIC_CASE(0xdd) \ - SIMDE_X_TERNARYLOGIC_CASE(0xde) \ - SIMDE_X_TERNARYLOGIC_CASE(0xdf) \ - SIMDE_X_TERNARYLOGIC_CASE(0xe0) \ - SIMDE_X_TERNARYLOGIC_CASE(0xe1) \ - SIMDE_X_TERNARYLOGIC_CASE(0xe2) \ - SIMDE_X_TERNARYLOGIC_CASE(0xe3) \ - SIMDE_X_TERNARYLOGIC_CASE(0xe4) \ - SIMDE_X_TERNARYLOGIC_CASE(0xe5) \ - SIMDE_X_TERNARYLOGIC_CASE(0xe6) \ - SIMDE_X_TERNARYLOGIC_CASE(0xe7) \ - SIMDE_X_TERNARYLOGIC_CASE(0xe8) \ - SIMDE_X_TERNARYLOGIC_CASE(0xe9) \ - SIMDE_X_TERNARYLOGIC_CASE(0xea) \ - SIMDE_X_TERNARYLOGIC_CASE(0xeb) \ - SIMDE_X_TERNARYLOGIC_CASE(0xec) \ - SIMDE_X_TERNARYLOGIC_CASE(0xed) \ - SIMDE_X_TERNARYLOGIC_CASE(0xee) \ - SIMDE_X_TERNARYLOGIC_CASE(0xef) \ - SIMDE_X_TERNARYLOGIC_CASE(0xf0) \ - SIMDE_X_TERNARYLOGIC_CASE(0xf1) \ - SIMDE_X_TERNARYLOGIC_CASE(0xf2) \ - SIMDE_X_TERNARYLOGIC_CASE(0xf3) \ - 
SIMDE_X_TERNARYLOGIC_CASE(0xf4) \ - SIMDE_X_TERNARYLOGIC_CASE(0xf5) \ - SIMDE_X_TERNARYLOGIC_CASE(0xf6) \ - SIMDE_X_TERNARYLOGIC_CASE(0xf7) \ - SIMDE_X_TERNARYLOGIC_CASE(0xf8) \ - SIMDE_X_TERNARYLOGIC_CASE(0xf9) \ - SIMDE_X_TERNARYLOGIC_CASE(0xfa) \ - SIMDE_X_TERNARYLOGIC_CASE(0xfb) \ - SIMDE_X_TERNARYLOGIC_CASE(0xfc) \ - SIMDE_X_TERNARYLOGIC_CASE(0xfd) \ - SIMDE_X_TERNARYLOGIC_CASE(0xfe) \ - SIMDE_X_TERNARYLOGIC_CASE(0xff) \ - } - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_ternarylogic_epi32(a, b, c, imm8) _mm_ternarylogic_epi32(a, b, c, imm8) -#else - SIMDE_HUGE_FUNCTION_ATTRIBUTES - simde__m128i - simde_mm_ternarylogic_epi32(simde__m128i a, simde__m128i b, simde__m128i c, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - c_ = simde__m128i_to_private(c); - - #if defined(SIMDE_TERNARYLOGIC_COMPRESSION) - int to_do, mask; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - simde__m128i_private t_; - to_do = imm8; - - r_.u64 = a_.u64 ^ a_.u64; - - mask = 0xFF; - if ((to_do & mask) == mask) { - r_.u64 = ~r_.u64; - to_do &= ~mask; - } - - mask = 0xF0; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 = a_.u64; - to_do &= ~mask; - } - - mask = 0xCC; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= b_.u64; - to_do &= ~mask; - } - - mask = 0xAA; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= c_.u64; - to_do &= ~mask; - } - - mask = 0x0F; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~a_.u64; - to_do &= ~mask; - } - - mask = 0x33; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~b_.u64; - to_do &= ~mask; - } - - mask = 0x55; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~c_.u64; - to_do &= ~mask; - } - - mask = 0x3C; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= a_.u64 ^ b_.u64; - to_do &= ~mask; - } - - mask = 0x5A; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= a_.u64 ^ c_.u64; - to_do &= ~mask; - } - - mask = 0x66; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= b_.u64 ^ c_.u64; - to_do &= ~mask; - } - - mask = 0xA0; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= a_.u64 & c_.u64; - to_do &= ~mask; - } - - mask = 0x50; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~c_.u64 & a_.u64; - to_do &= ~mask; - } - - mask = 0x0A; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~a_.u64 & c_.u64; - to_do &= ~mask; - } - - mask = 0x88; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= b_.u64 & c_.u64; - to_do &= ~mask; - } - - mask = 0x44; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~c_.u64 & b_.u64; - to_do &= ~mask; - } - - mask = 0x22; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~b_.u64 & c_.u64; - to_do &= ~mask; - } - - if (to_do & 0xc0) { - t_.u64 = a_.u64 & b_.u64; - if ((to_do & 0xc0) == 0xc0) r_.u64 |= t_.u64; - else if (to_do & 0x80) r_.u64 |= c_.u64 & t_.u64; - else r_.u64 |= ~c_.u64 & t_.u64; - } - - if (to_do & 0x30) { - t_.u64 = ~b_.u64 & a_.u64; - if ((to_do & 0x30) == 0x30) r_.u64 |= t_.u64; - else if (to_do & 0x20) r_.u64 |= c_.u64 & t_.u64; - else r_.u64 |= ~c_.u64 & t_.u64; - } - - if (to_do & 0x0c) { - t_.u64 = ~a_.u64 & b_.u64; - if ((to_do & 0x0c) == 0x0c) r_.u64 |= t_.u64; - else if (to_do & 0x08) r_.u64 |= c_.u64 & t_.u64; - else r_.u64 |= ~c_.u64 & t_.u64; - } - - if (to_do & 0x03) { - t_.u64 = 
~(a_.u64 | b_.u64); - if ((to_do & 0x03) == 0x03) r_.u64 |= t_.u64; - else if (to_do & 0x02) r_.u64 |= c_.u64 & t_.u64; - else r_.u64 |= ~c_.u64 & t_.u64; - } - #else - uint64_t t; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - to_do = imm8; - - mask = 0xFF; - if ((to_do & mask) == mask) { - r_.u64[i] = UINT64_MAX; - to_do &= ~mask; - } - else r_.u64[i] = 0; - - mask = 0xF0; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] = a_.u64[i]; - to_do &= ~mask; - } - - mask = 0xCC; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= b_.u64[i]; - to_do &= ~mask; - } - - mask = 0xAA; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= c_.u64[i]; - to_do &= ~mask; - } - - mask = 0x0F; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~a_.u64[i]; - to_do &= ~mask; - } - - mask = 0x33; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~b_.u64[i]; - to_do &= ~mask; - } - - mask = 0x55; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~c_.u64[i]; - to_do &= ~mask; - } - - mask = 0x3C; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= a_.u64[i] ^ b_.u64[i]; - to_do &= ~mask; - } - - mask = 0x5A; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= a_.u64[i] ^ c_.u64[i]; - to_do &= ~mask; - } - - mask = 0x66; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= b_.u64[i] ^ c_.u64[i]; - to_do &= ~mask; - } - - mask = 0xA0; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= a_.u64[i] & c_.u64[i]; - to_do &= ~mask; - } - - mask = 0x50; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~c_.u64[i] & a_.u64[i]; - to_do &= ~mask; - } - - mask = 0x0A; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~a_.u64[i] & c_.u64[i]; - to_do &= ~mask; - } - - mask = 0x88; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= b_.u64[i] & c_.u64[i]; - to_do &= ~mask; - } - - mask = 0x44; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~c_.u64[i] & b_.u64[i]; - to_do &= ~mask; - } - - mask = 0x22; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~b_.u64[i] & c_.u64[i]; - to_do &= ~mask; - } - - if (to_do & 0xc0) { - t = a_.u64[i] & b_.u64[i]; - if ((to_do & 0xc0) == 0xc0) r_.u64[i] |= t; - else if (to_do & 0x80) r_.u64[i] |= c_.u64[i] & t; - else r_.u64[i] |= ~c_.u64[i] & t; - } - - if (to_do & 0x30) { - t = ~b_.u64[i] & a_.u64[i]; - if ((to_do & 0x30) == 0x30) r_.u64[i] |= t; - else if (to_do & 0x20) r_.u64[i] |= c_.u64[i] & t; - else r_.u64[i] |= ~c_.u64[i] & t; - } - - if (to_do & 0x0c) { - t = ~a_.u64[i] & b_.u64[i]; - if ((to_do & 0x0c) == 0x0c) r_.u64[i] |= t; - else if (to_do & 0x08) r_.u64[i] |= c_.u64[i] & t; - else r_.u64[i] |= ~c_.u64[i] & t; - } - - if (to_do & 0x03) { - t = ~(a_.u64[i] | b_.u64[i]); - if ((to_do & 0x03) == 0x03) r_.u64[i] |= t; - else if (to_do & 0x02) r_.u64[i] |= c_.u64[i] & t; - else r_.u64[i] |= ~c_.u64[i] & t; - } - } - #endif - #else - SIMDE_X_TERNARYLOGIC_SWITCH(imm8 & 255) - #endif - - return simde__m128i_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_ternarylogic_epi32 - #define _mm_ternarylogic_epi32(a, b, c, imm8) simde_mm_ternarylogic_epi32(a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_mask_ternarylogic_epi32(src, k, a, b, imm8) 
_mm_mask_ternarylogic_epi32(src, k, a, b, imm8) -#else - #define simde_mm_mask_ternarylogic_epi32(src, k, a, b, imm8) simde_mm_mask_mov_epi32(src, k, simde_mm_ternarylogic_epi32(src, a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_ternarylogic_epi32 - #define _mm_mask_ternarylogic_epi32(src, k, a, b, imm8) simde_mm_mask_ternarylogic_epi32(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm_maskz_ternarylogic_epi32(k, a, b, c, imm8) _mm_maskz_ternarylogic_epi32(k, a, b, c, imm8) -#else - #define simde_mm_maskz_ternarylogic_epi32(k, a, b, c, imm8) simde_mm_maskz_mov_epi32(k, simde_mm_ternarylogic_epi32(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_ternarylogic_epi32 - #define _mm_maskz_ternarylogic_epi32(k, a, b, c, imm8) simde_mm_maskz_ternarylogic_epi32(k, a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm256_ternarylogic_epi32(a, b, c, imm8) _mm256_ternarylogic_epi32(a, b, c, imm8) -#else - SIMDE_HUGE_FUNCTION_ATTRIBUTES - simde__m256i - simde_mm256_ternarylogic_epi32(simde__m256i a, simde__m256i b, simde__m256i c, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b), - c_ = simde__m256i_to_private(c); - - #if defined(SIMDE_TERNARYLOGIC_COMPRESSION) - int to_do, mask; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - simde__m256i_private t_; - to_do = imm8; - - r_.u64 = a_.u64 ^ a_.u64; - - mask = 0xFF; - if ((to_do & mask) == mask) { - r_.u64 = ~r_.u64; - to_do &= ~mask; - } - - mask = 0xF0; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 = a_.u64; - to_do &= ~mask; - } - - mask = 0xCC; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= b_.u64; - to_do &= ~mask; - } - - mask = 0xAA; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= c_.u64; - to_do &= ~mask; - } - - mask = 0x0F; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~a_.u64; - to_do &= ~mask; - } - - mask = 0x33; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~b_.u64; - to_do &= ~mask; - } - - mask = 0x55; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~c_.u64; - to_do &= ~mask; - } - - mask = 0x3C; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= a_.u64 ^ b_.u64; - to_do &= ~mask; - } - - mask = 0x5A; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= a_.u64 ^ c_.u64; - to_do &= ~mask; - } - - mask = 0x66; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= b_.u64 ^ c_.u64; - to_do &= ~mask; - } - - mask = 0xA0; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= a_.u64 & c_.u64; - to_do &= ~mask; - } - - mask = 0x50; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~c_.u64 & a_.u64; - to_do &= ~mask; - } - - mask = 0x0A; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~a_.u64 & c_.u64; - to_do &= ~mask; - } - - mask = 0x88; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= b_.u64 & c_.u64; - to_do &= ~mask; - } - - mask = 0x44; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~c_.u64 & b_.u64; - to_do &= ~mask; - } - - mask = 0x22; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~b_.u64 & 
c_.u64; - to_do &= ~mask; - } - - if (to_do & 0xc0) { - t_.u64 = a_.u64 & b_.u64; - if ((to_do & 0xc0) == 0xc0) r_.u64 |= t_.u64; - else if (to_do & 0x80) r_.u64 |= c_.u64 & t_.u64; - else r_.u64 |= ~c_.u64 & t_.u64; - } - - if (to_do & 0x30) { - t_.u64 = ~b_.u64 & a_.u64; - if ((to_do & 0x30) == 0x30) r_.u64 |= t_.u64; - else if (to_do & 0x20) r_.u64 |= c_.u64 & t_.u64; - else r_.u64 |= ~c_.u64 & t_.u64; - } - - if (to_do & 0x0c) { - t_.u64 = ~a_.u64 & b_.u64; - if ((to_do & 0x0c) == 0x0c) r_.u64 |= t_.u64; - else if (to_do & 0x08) r_.u64 |= c_.u64 & t_.u64; - else r_.u64 |= ~c_.u64 & t_.u64; - } - - if (to_do & 0x03) { - t_.u64 = ~(a_.u64 | b_.u64); - if ((to_do & 0x03) == 0x03) r_.u64 |= t_.u64; - else if (to_do & 0x02) r_.u64 |= c_.u64 & t_.u64; - else r_.u64 |= ~c_.u64 & t_.u64; - } - #else - uint64_t t; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - to_do = imm8; - - mask = 0xFF; - if ((to_do & mask) == mask) { - r_.u64[i] = UINT64_MAX; - to_do &= ~mask; - } - else r_.u64[i] = 0; - - mask = 0xF0; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] = a_.u64[i]; - to_do &= ~mask; - } - - mask = 0xCC; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= b_.u64[i]; - to_do &= ~mask; - } - - mask = 0xAA; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= c_.u64[i]; - to_do &= ~mask; - } - - mask = 0x0F; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~a_.u64[i]; - to_do &= ~mask; - } - - mask = 0x33; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~b_.u64[i]; - to_do &= ~mask; - } - - mask = 0x55; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~c_.u64[i]; - to_do &= ~mask; - } - - mask = 0x3C; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= a_.u64[i] ^ b_.u64[i]; - to_do &= ~mask; - } - - mask = 0x5A; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= a_.u64[i] ^ c_.u64[i]; - to_do &= ~mask; - } - - mask = 0x66; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= b_.u64[i] ^ c_.u64[i]; - to_do &= ~mask; - } - - mask = 0xA0; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= a_.u64[i] & c_.u64[i]; - to_do &= ~mask; - } - - mask = 0x50; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~c_.u64[i] & a_.u64[i]; - to_do &= ~mask; - } - - mask = 0x0A; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~a_.u64[i] & c_.u64[i]; - to_do &= ~mask; - } - - mask = 0x88; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= b_.u64[i] & c_.u64[i]; - to_do &= ~mask; - } - - mask = 0x44; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~c_.u64[i] & b_.u64[i]; - to_do &= ~mask; - } - - mask = 0x22; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~b_.u64[i] & c_.u64[i]; - to_do &= ~mask; - } - - if (to_do & 0xc0) { - t = a_.u64[i] & b_.u64[i]; - if ((to_do & 0xc0) == 0xc0) r_.u64[i] |= t; - else if (to_do & 0x80) r_.u64[i] |= c_.u64[i] & t; - else r_.u64[i] |= ~c_.u64[i] & t; - } - - if (to_do & 0x30) { - t = ~b_.u64[i] & a_.u64[i]; - if ((to_do & 0x30) == 0x30) r_.u64[i] |= t; - else if (to_do & 0x20) r_.u64[i] |= c_.u64[i] & t; - else r_.u64[i] |= ~c_.u64[i] & t; - } - - if (to_do & 0x0c) { - t = ~a_.u64[i] & b_.u64[i]; - if ((to_do & 0x0c) == 0x0c) r_.u64[i] |= t; - else if (to_do & 0x08) r_.u64[i] |= c_.u64[i] & t; - else r_.u64[i] |= ~c_.u64[i] & t; - } - - if (to_do & 0x03) { - t = ~(a_.u64[i] | b_.u64[i]); - if ((to_do & 
0x03) == 0x03) r_.u64[i] |= t; - else if (to_do & 0x02) r_.u64[i] |= c_.u64[i] & t; - else r_.u64[i] |= ~c_.u64[i] & t; - } - } - #endif - #else - SIMDE_X_TERNARYLOGIC_SWITCH(imm8 & 255) - #endif - - return simde__m256i_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_ternarylogic_epi32 - #define _mm256_ternarylogic_epi32(a, b, c, imm8) simde_mm256_ternarylogic_epi32(a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm256_mask_ternarylogic_epi32(src, k, a, b, imm8) _mm256_mask_ternarylogic_epi32(src, k, a, b, imm8) -#else - #define simde_mm256_mask_ternarylogic_epi32(src, k, a, b, imm8) simde_mm256_mask_mov_epi32(src, k, simde_mm256_ternarylogic_epi32(src, a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_ternarylogic_epi32 - #define _mm256_mask_ternarylogic_epi32(src, k, a, b, imm8) simde_mm256_mask_ternarylogic_epi32(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm256_maskz_ternarylogic_epi32(k, a, b, c, imm8) _mm256_maskz_ternarylogic_epi32(k, a, b, c, imm8) -#else - #define simde_mm256_maskz_ternarylogic_epi32(k, a, b, c, imm8) simde_mm256_maskz_mov_epi32(k, simde_mm256_ternarylogic_epi32(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_ternarylogic_epi32 - #define _mm256_maskz_ternarylogic_epi32(k, a, b, c, imm8) simde_mm256_maskz_ternarylogic_epi32(k, a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_ternarylogic_epi32(a, b, c, imm8) _mm512_ternarylogic_epi32(a, b, c, imm8) -#else - SIMDE_HUGE_FUNCTION_ATTRIBUTES - simde__m512i - simde_mm512_ternarylogic_epi32(simde__m512i a, simde__m512i b, simde__m512i c, int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b), - c_ = simde__m512i_to_private(c); - - #if defined(SIMDE_TERNARYLOGIC_COMPRESSION) - int to_do, mask; - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - simde__m512i_private t_; - to_do = imm8; - - r_.u64 = a_.u64 ^ a_.u64; - - mask = 0xFF; - if ((to_do & mask) == mask) { - r_.u64 = ~r_.u64; - to_do &= ~mask; - } - - mask = 0xF0; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 = a_.u64; - to_do &= ~mask; - } - - mask = 0xCC; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= b_.u64; - to_do &= ~mask; - } - - mask = 0xAA; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= c_.u64; - to_do &= ~mask; - } - - mask = 0x0F; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~a_.u64; - to_do &= ~mask; - } - - mask = 0x33; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~b_.u64; - to_do &= ~mask; - } - - mask = 0x55; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~c_.u64; - to_do &= ~mask; - } - - mask = 0x3C; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= a_.u64 ^ b_.u64; - to_do &= ~mask; - } - - mask = 0x5A; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= a_.u64 ^ c_.u64; - to_do &= ~mask; - } - - mask = 0x66; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= b_.u64 ^ c_.u64; - to_do &= ~mask; - } - - mask = 0xA0; - if ((to_do & mask) && 
((imm8 & mask) == mask)) { - r_.u64 |= a_.u64 & c_.u64; - to_do &= ~mask; - } - - mask = 0x50; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~c_.u64 & a_.u64; - to_do &= ~mask; - } - - mask = 0x0A; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~a_.u64 & c_.u64; - to_do &= ~mask; - } - - mask = 0x88; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= b_.u64 & c_.u64; - to_do &= ~mask; - } - - mask = 0x44; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~c_.u64 & b_.u64; - to_do &= ~mask; - } - - mask = 0x22; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64 |= ~b_.u64 & c_.u64; - to_do &= ~mask; - } - - if (to_do & 0xc0) { - t_.u64 = a_.u64 & b_.u64; - if ((to_do & 0xc0) == 0xc0) r_.u64 |= t_.u64; - else if (to_do & 0x80) r_.u64 |= c_.u64 & t_.u64; - else r_.u64 |= ~c_.u64 & t_.u64; - } - - if (to_do & 0x30) { - t_.u64 = ~b_.u64 & a_.u64; - if ((to_do & 0x30) == 0x30) r_.u64 |= t_.u64; - else if (to_do & 0x20) r_.u64 |= c_.u64 & t_.u64; - else r_.u64 |= ~c_.u64 & t_.u64; - } - - if (to_do & 0x0c) { - t_.u64 = ~a_.u64 & b_.u64; - if ((to_do & 0x0c) == 0x0c) r_.u64 |= t_.u64; - else if (to_do & 0x08) r_.u64 |= c_.u64 & t_.u64; - else r_.u64 |= ~c_.u64 & t_.u64; - } - - if (to_do & 0x03) { - t_.u64 = ~(a_.u64 | b_.u64); - if ((to_do & 0x03) == 0x03) r_.u64 |= t_.u64; - else if (to_do & 0x02) r_.u64 |= c_.u64 & t_.u64; - else r_.u64 |= ~c_.u64 & t_.u64; - } - #else - uint64_t t; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - to_do = imm8; - - mask = 0xFF; - if ((to_do & mask) == mask) { - r_.u64[i] = UINT64_MAX; - to_do &= ~mask; - } - else r_.u64[i] = 0; - - mask = 0xF0; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] = a_.u64[i]; - to_do &= ~mask; - } - - mask = 0xCC; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= b_.u64[i]; - to_do &= ~mask; - } - - mask = 0xAA; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= c_.u64[i]; - to_do &= ~mask; - } - - mask = 0x0F; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~a_.u64[i]; - to_do &= ~mask; - } - - mask = 0x33; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~b_.u64[i]; - to_do &= ~mask; - } - - mask = 0x55; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~c_.u64[i]; - to_do &= ~mask; - } - - mask = 0x3C; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= a_.u64[i] ^ b_.u64[i]; - to_do &= ~mask; - } - - mask = 0x5A; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= a_.u64[i] ^ c_.u64[i]; - to_do &= ~mask; - } - - mask = 0x66; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= b_.u64[i] ^ c_.u64[i]; - to_do &= ~mask; - } - - mask = 0xA0; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= a_.u64[i] & c_.u64[i]; - to_do &= ~mask; - } - - mask = 0x50; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~c_.u64[i] & a_.u64[i]; - to_do &= ~mask; - } - - mask = 0x0A; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~a_.u64[i] & c_.u64[i]; - to_do &= ~mask; - } - - mask = 0x88; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= b_.u64[i] & c_.u64[i]; - to_do &= ~mask; - } - - mask = 0x44; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~c_.u64[i] & b_.u64[i]; - to_do &= ~mask; - } - - mask = 0x22; - if ((to_do & mask) && ((imm8 & mask) == mask)) { - r_.u64[i] |= ~b_.u64[i] & c_.u64[i]; - to_do &= ~mask; - } - - 
if (to_do & 0xc0) { - t = a_.u64[i] & b_.u64[i]; - if ((to_do & 0xc0) == 0xc0) r_.u64[i] |= t; - else if (to_do & 0x80) r_.u64[i] |= c_.u64[i] & t; - else r_.u64[i] |= ~c_.u64[i] & t; - } - - if (to_do & 0x30) { - t = ~b_.u64[i] & a_.u64[i]; - if ((to_do & 0x30) == 0x30) r_.u64[i] |= t; - else if (to_do & 0x20) r_.u64[i] |= c_.u64[i] & t; - else r_.u64[i] |= ~c_.u64[i] & t; - } - - if (to_do & 0x0c) { - t = ~a_.u64[i] & b_.u64[i]; - if ((to_do & 0x0c) == 0x0c) r_.u64[i] |= t; - else if (to_do & 0x08) r_.u64[i] |= c_.u64[i] & t; - else r_.u64[i] |= ~c_.u64[i] & t; - } - - if (to_do & 0x03) { - t = ~(a_.u64[i] | b_.u64[i]); - if ((to_do & 0x03) == 0x03) r_.u64[i] |= t; - else if (to_do & 0x02) r_.u64[i] |= c_.u64[i] & t; - else r_.u64[i] |= ~c_.u64[i] & t; - } - } - #endif - #else - SIMDE_X_TERNARYLOGIC_SWITCH(imm8 & 255) - #endif - - return simde__m512i_from_private(r_); - } -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_ternarylogic_epi32 - #define _mm512_ternarylogic_epi32(a, b, c, imm8) simde_mm512_ternarylogic_epi32(a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_ternarylogic_epi32(src, k, a, b, imm8) _mm512_mask_ternarylogic_epi32(src, k, a, b, imm8) -#else - #define simde_mm512_mask_ternarylogic_epi32(src, k, a, b, imm8) simde_mm512_mask_mov_epi32(src, k, simde_mm512_ternarylogic_epi32(src, a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_ternarylogic_epi32 - #define _mm512_mask_ternarylogic_epi32(src, k, a, b, imm8) simde_mm512_mask_ternarylogic_epi32(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_maskz_ternarylogic_epi32(k, a, b, c, imm8) _mm512_maskz_ternarylogic_epi32(k, a, b, c, imm8) -#else - #define simde_mm512_maskz_ternarylogic_epi32(k, a, b, c, imm8) simde_mm512_maskz_mov_epi32(k, simde_mm512_ternarylogic_epi32(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_ternarylogic_epi32 - #define _mm512_maskz_ternarylogic_epi32(k, a, b, c, imm8) simde_mm512_maskz_ternarylogic_epi32(k, a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_ternarylogic_epi64(a, b, c, imm8) _mm_ternarylogic_epi64(a, b, c, imm8) -#else - #define simde_mm_ternarylogic_epi64(a, b, c, imm8) simde_mm_ternarylogic_epi32(a, b, c, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_ternarylogic_epi64 - #define _mm_ternarylogic_epi64(a, b, c, imm8) simde_mm_ternarylogic_epi64(a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_mask_ternarylogic_epi64(src, k, a, b, imm8) _mm_mask_ternarylogic_epi64(src, k, a, b, imm8) -#else - #define simde_mm_mask_ternarylogic_epi64(src, k, a, b, imm8) simde_mm_mask_mov_epi64(src, k, simde_mm_ternarylogic_epi64(src, a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_ternarylogic_epi64 - #define _mm_mask_ternarylogic_epi64(src, k, a, b, imm8) simde_mm_mask_ternarylogic_epi64(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_maskz_ternarylogic_epi64(k, a, b, c, imm8) _mm_maskz_ternarylogic_epi64(k, a, b, c, imm8) -#else - #define simde_mm_maskz_ternarylogic_epi64(k, a, b, c, imm8) 
simde_mm_maskz_mov_epi64(k, simde_mm_ternarylogic_epi64(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_ternarylogic_epi64 - #define _mm_maskz_ternarylogic_epi64(k, a, b, c, imm8) simde_mm_maskz_ternarylogic_epi64(k, a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_ternarylogic_epi64(a, b, c, imm8) _mm256_ternarylogic_epi64(a, b, c, imm8) -#else - #define simde_mm256_ternarylogic_epi64(a, b, c, imm8) simde_mm256_ternarylogic_epi32(a, b, c, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_ternarylogic_epi64 - #define _mm256_ternarylogic_epi64(a, b, c, imm8) simde_mm256_ternarylogic_epi64(a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_mask_ternarylogic_epi64(src, k, a, b, imm8) _mm256_mask_ternarylogic_epi64(src, k, a, b, imm8) -#else - #define simde_mm256_mask_ternarylogic_epi64(src, k, a, b, imm8) simde_mm256_mask_mov_epi64(src, k, simde_mm256_ternarylogic_epi64(src, a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_ternarylogic_epi64 - #define _mm256_mask_ternarylogic_epi64(src, k, a, b, imm8) simde_mm256_mask_ternarylogic_epi64(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_maskz_ternarylogic_epi64(k, a, b, c, imm8) _mm256_maskz_ternarylogic_epi64(k, a, b, c, imm8) -#else - #define simde_mm256_maskz_ternarylogic_epi64(k, a, b, c, imm8) simde_mm256_maskz_mov_epi64(k, simde_mm256_ternarylogic_epi64(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_ternarylogic_epi64 - #define _mm256_maskz_ternarylogic_epi64(k, a, b, c, imm8) simde_mm256_maskz_ternarylogic_epi64(k, a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_ternarylogic_epi64(a, b, c, imm8) _mm512_ternarylogic_epi64(a, b, c, imm8) -#else - #define simde_mm512_ternarylogic_epi64(a, b, c, imm8) simde_mm512_ternarylogic_epi32(a, b, c, imm8) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_ternarylogic_epi64 - #define _mm512_ternarylogic_epi64(a, b, c, imm8) simde_mm512_ternarylogic_epi64(a, b, c, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_ternarylogic_epi64(src, k, a, b, imm8) _mm512_mask_ternarylogic_epi64(src, k, a, b, imm8) -#else - #define simde_mm512_mask_ternarylogic_epi64(src, k, a, b, imm8) simde_mm512_mask_mov_epi64(src, k, simde_mm512_ternarylogic_epi64(src, a, b, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_ternarylogic_epi64 - #define _mm512_mask_ternarylogic_epi64(src, k, a, b, imm8) simde_mm512_mask_ternarylogic_epi64(src, k, a, b, imm8) -#endif - -#if defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_maskz_ternarylogic_epi64(k, a, b, c, imm8) _mm512_maskz_ternarylogic_epi64(k, a, b, c, imm8) -#else - #define simde_mm512_maskz_ternarylogic_epi64(k, a, b, c, imm8) simde_mm512_maskz_mov_epi64(k, simde_mm512_ternarylogic_epi64(a, b, c, imm8)) -#endif -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_ternarylogic_epi64 - #define 
_mm512_maskz_ternarylogic_epi64(k, a, b, c, imm8) simde_mm512_maskz_ternarylogic_epi64(k, a, b, c, imm8) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_TERNARYLOGIC_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/test.h b/ffi-deps/simde/simde/x86/avx512/test.h deleted file mode 100644 index 0d38634..0000000 --- a/ffi-deps/simde/simde/x86/avx512/test.h +++ /dev/null @@ -1,232 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - * 2020 Christopher Moore - * 2021 Andrew Rodriguez - */ - -#if !defined(SIMDE_X86_AVX512_TEST_H) -#define SIMDE_X86_AVX512_TEST_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_test_epi32_mask (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_test_epi32_mask(a, b); - #else - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - simde__mmask8 r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r |= HEDLEY_STATIC_CAST(simde__mmask16, !!(a_.i32[i] & b_.i32[i]) << i); - } - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_test_epi32_mask -#define _mm256_test_epi32_mask(a, b) simde_mm256_test_epi32_mask(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm256_mask_test_epi32_mask (simde__mmask8 k1, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_test_epi32_mask(k1, a, b); - #else - return simde_mm256_test_epi32_mask(a, b) & k1; - #endif -} -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_test_epi32_mask - #define _mm256_mask_test_epi32_mask(k1, a, b) simde_mm256_mask_test_epi32_mask(k1, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_test_epi16_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_test_epi16_mask(a, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - simde__mmask32 r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i16) / sizeof(a_.i16[0])) ; i++) { - r |= HEDLEY_STATIC_CAST(simde__mmask32, !!(a_.i16[i] & b_.i16[i]) << i); - } - - return r; - 
#endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_test_epi16_mask - #define _mm512_test_epi16_mask(a, b) simde_mm512_test_epi16_mask(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_test_epi32_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_test_epi32_mask(a, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - simde__mmask16 r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i32) / sizeof(a_.i32[0])) ; i++) { - r |= HEDLEY_STATIC_CAST(simde__mmask16, !!(a_.i32[i] & b_.i32[i]) << i); - } - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_test_epi32_mask -#define _mm512_test_epi32_mask(a, b) simde_mm512_test_epi32_mask(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_test_epi64_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_test_epi64_mask(a, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - simde__mmask8 r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r |= HEDLEY_STATIC_CAST(simde__mmask8, !!(a_.i64[i] & b_.i64[i]) << i); - } - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_test_epi64_mask - #define _mm512_test_epi64_mask(a, b) simde_mm512_test_epi64_mask(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_test_epi8_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_test_epi8_mask(a, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - simde__mmask64 r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) { - r |= HEDLEY_STATIC_CAST(simde__mmask64, HEDLEY_STATIC_CAST(uint64_t, !!(a_.i8[i] & b_.i8[i])) << i); - } - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_test_epi8_mask - #define _mm512_test_epi8_mask(a, b) simde_mm512_test_epi8_mask(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask32 -simde_mm512_mask_test_epi16_mask (simde__mmask32 k1, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_test_epi16_mask(k1, a, b); - #else - return simde_mm512_test_epi16_mask(a, b) & k1; - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_test_epi16_mask - #define _mm512_mask_test_epi16_mask(k1, a, b) simde_mm512_mask_test_epi16_mask(k1, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask16 -simde_mm512_mask_test_epi32_mask (simde__mmask16 k1, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_test_epi32_mask(k1, a, b); - #else - return simde_mm512_test_epi32_mask(a, b) & k1; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_test_epi32_mask - #define _mm512_mask_test_epi32_mask(k1, a, b) simde_mm512_mask_test_epi32_mask(k1, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_mask_test_epi64_mask (simde__mmask8 k1, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_test_epi64_mask(k1, a, b); - #else - return simde_mm512_test_epi64_mask(a, b) & k1; - #endif -} -#if 
defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_test_epi64_mask - #define _mm512_mask_test_epi64_mask(k1, a, b) simde_mm512_mask_test_epi64_mask(k1, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask64 -simde_mm512_mask_test_epi8_mask (simde__mmask64 k1, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_test_epi8_mask(k1, a, b); - #else - return simde_mm512_test_epi8_mask(a, b) & k1; - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_test_epi8_mask - #define _mm512_mask_test_epi8_mask(k1, a, b) simde_mm512_mask_test_epi8_mask(k1, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_TEST_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/testn.h b/ffi-deps/simde/simde/x86/avx512/testn.h deleted file mode 100644 index 4879235..0000000 --- a/ffi-deps/simde/simde/x86/avx512/testn.h +++ /dev/null @@ -1,63 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2021 Andrew Rodriguez - */ - -#if !defined(SIMDE_X86_AVX512_TESTN_H) -#define SIMDE_X86_AVX512_TESTN_H - -#include "types.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__mmask8 -simde_mm512_testn_epi64_mask (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_testn_epi64_mask(a, b); - #else - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - simde__mmask8 r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i64) / sizeof(a_.i64[0])) ; i++) { - r |= (!(a_.i64[i] & b_.i64[i])) << i; - } - - return r; - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_testn_epi64_mask - #define _mm512_testn_epi64_mask(a, b) simde_mm512_testn_epi64_mask(a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_TESTN_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/types.h b/ffi-deps/simde/simde/x86/avx512/types.h deleted file mode 100644 index 639df25..0000000 --- a/ffi-deps/simde/simde/x86/avx512/types.h +++ /dev/null @@ -1,821 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - */ - -#if !defined(SIMDE_X86_AVX512_TYPES_H) -#define SIMDE_X86_AVX512_TYPES_H -#include "../avx.h" -#include "../../simde-f16.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -/* The problem is that Microsoft doesn't support 64-byte aligned parameters, except for - * __m512/__m512i/__m512d. Since our private union has an __m512 member it will be 64-byte - * aligned even if we reduce the alignment requirements of other members. - * - * Even if we're on x86 and use the native AVX-512 types for arguments/return values, the - * to/from private functions will break, and I'm not willing to change their APIs to use - * pointers (which would also require more verbose code on the caller side) just to make - * MSVC happy. - * - * If you want to use AVX-512 in SIMDe, you'll need to either upgrade to MSVC 2017 or later, - * or upgrade to a different compiler (clang-cl, perhaps?). If you have an idea of how to - * fix this without requiring API changes (except transparently through macros), patches - * are welcome. 
- */ - -# if defined(HEDLEY_MSVC_VERSION) && !HEDLEY_MSVC_VERSION_CHECK(19,10,0) -# if defined(SIMDE_X86_AVX512F_NATIVE) -# undef SIMDE_X86_AVX512F_NATIVE -# pragma message("Native AVX-512 support requires MSVC 2017 or later. See comment above (in code) for details.") -# endif -# define SIMDE_AVX512_ALIGN SIMDE_ALIGN_TO_32 -# else -# define SIMDE_AVX512_ALIGN SIMDE_ALIGN_TO_64 -# endif - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_ALIGN_TO_16 int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_16 simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #endif - SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_16 int8_t i8[16]; - SIMDE_ALIGN_TO_16 int16_t i16[8]; - SIMDE_ALIGN_TO_16 int32_t i32[4]; - SIMDE_ALIGN_TO_16 int64_t i64[2]; - SIMDE_ALIGN_TO_16 uint8_t u8[16]; - SIMDE_ALIGN_TO_16 uint16_t u16[8]; - SIMDE_ALIGN_TO_16 uint32_t u32[4]; - SIMDE_ALIGN_TO_16 uint64_t u64[2]; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_16 simde_int128 i128[1]; - SIMDE_ALIGN_TO_16 simde_uint128 u128[1]; - #endif - SIMDE_ALIGN_TO_16 simde_float32 f32[4]; - SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)]; - SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)]; - #endif - - SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2]; - SIMDE_ALIGN_TO_16 simde__m64 m64[2]; - - #if defined(SIMDE_X86_AVX512BF16_NATIVE) - SIMDE_ALIGN_TO_16 __m128bh n; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_TO_16 int8x16_t neon_i8; - SIMDE_ALIGN_TO_16 int16x8_t neon_i16; - SIMDE_ALIGN_TO_16 int32x4_t neon_i32; - SIMDE_ALIGN_TO_16 int64x2_t neon_i64; - SIMDE_ALIGN_TO_16 uint8x16_t neon_u8; - SIMDE_ALIGN_TO_16 uint16x8_t neon_u16; - SIMDE_ALIGN_TO_16 uint32x4_t neon_u32; - SIMDE_ALIGN_TO_16 uint64x2_t neon_u64; - SIMDE_ALIGN_TO_16 float32x4_t neon_f32; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - SIMDE_ALIGN_TO_16 float64x2_t neon_f64; - #endif - #elif defined(SIMDE_MIPS_MSA_NATIVE) - v16i8 msa_i8; - v8i16 msa_i16; - v4i32 msa_i32; - v2i64 msa_i64; - v16u8 msa_u8; - v8u16 msa_u16; - v4u32 msa_u32; - v2u64 msa_u64; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - SIMDE_ALIGN_TO_16 v128_t wasm_v128; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32; - #if 
defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64; - #endif - #endif -} simde__m128bh_private; - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_ALIGN_TO_32 int8_t i8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int16_t i16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int32_t i32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int64_t i64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint8_t u8 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint16_t u16 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint32_t u32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint64_t u64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_32 simde_int128 i128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 simde_uint128 u128 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #endif - SIMDE_ALIGN_TO_32 simde_float32 f32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 simde_float64 f64 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 int_fast32_t i32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_32 uint_fast32_t u32f SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_32 int8_t i8[32]; - SIMDE_ALIGN_TO_32 int16_t i16[16]; - SIMDE_ALIGN_TO_32 int32_t i32[8]; - SIMDE_ALIGN_TO_32 int64_t i64[4]; - SIMDE_ALIGN_TO_32 uint8_t u8[32]; - SIMDE_ALIGN_TO_32 uint16_t u16[16]; - SIMDE_ALIGN_TO_32 uint32_t u32[8]; - SIMDE_ALIGN_TO_32 uint64_t u64[4]; - SIMDE_ALIGN_TO_32 int_fast32_t i32f[32 / sizeof(int_fast32_t)]; - SIMDE_ALIGN_TO_32 uint_fast32_t u32f[32 / sizeof(uint_fast32_t)]; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_32 simde_int128 i128[2]; - SIMDE_ALIGN_TO_32 simde_uint128 u128[2]; - #endif - SIMDE_ALIGN_TO_32 simde_float32 f32[8]; - SIMDE_ALIGN_TO_32 simde_float64 f64[4]; - #endif - - SIMDE_ALIGN_TO_32 simde__m128_private m128_private[2]; - SIMDE_ALIGN_TO_32 simde__m128 m128[2]; - - #if defined(SIMDE_X86_BF16_NATIVE) - SIMDE_ALIGN_TO_32 __m256bh n; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(int) altivec_i32[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32[2]; - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(long long) altivec_i64[2]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[2]; - #endif - #endif -} simde__m256bh_private; - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_AVX512_ALIGN int8_t i8 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int16_t i16 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int32_t i32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int64_t i64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint8_t u8 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint16_t u16 
SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint32_t u32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint64_t u64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_AVX512_ALIGN simde_int128 i128 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN simde_uint128 u128 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #endif - SIMDE_AVX512_ALIGN simde_float32 f32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN simde_float64 f64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int_fast32_t i32f SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint_fast32_t u32f SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #else - SIMDE_AVX512_ALIGN int8_t i8[64]; - SIMDE_AVX512_ALIGN int16_t i16[32]; - SIMDE_AVX512_ALIGN int32_t i32[16]; - SIMDE_AVX512_ALIGN int64_t i64[8]; - SIMDE_AVX512_ALIGN uint8_t u8[64]; - SIMDE_AVX512_ALIGN uint16_t u16[32]; - SIMDE_AVX512_ALIGN uint32_t u32[16]; - SIMDE_AVX512_ALIGN uint64_t u64[8]; - SIMDE_AVX512_ALIGN int_fast32_t i32f[64 / sizeof(int_fast32_t)]; - SIMDE_AVX512_ALIGN uint_fast32_t u32f[64 / sizeof(uint_fast32_t)]; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_AVX512_ALIGN simde_int128 i128[4]; - SIMDE_AVX512_ALIGN simde_uint128 u128[4]; - #endif - SIMDE_AVX512_ALIGN simde_float32 f32[16]; - SIMDE_AVX512_ALIGN simde_float64 f64[8]; - #endif - - SIMDE_AVX512_ALIGN simde__m128_private m128_private[4]; - SIMDE_AVX512_ALIGN simde__m128 m128[4]; - SIMDE_AVX512_ALIGN simde__m256_private m256_private[2]; - SIMDE_AVX512_ALIGN simde__m256 m256[2]; - - #if defined(SIMDE_X86_AVX512BF16_NATIVE) - SIMDE_AVX512_ALIGN __m512bh n; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32[4]; - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[4]; - #endif - #endif -} simde__m512bh_private; - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_AVX512_ALIGN int8_t i8 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int16_t i16 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int32_t i32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int64_t i64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint8_t u8 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint16_t u16 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint32_t u32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint64_t u64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_AVX512_ALIGN simde_int128 i128 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN simde_uint128 u128 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #endif - SIMDE_AVX512_ALIGN simde_float32 f32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN simde_float64 f64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int_fast32_t i32f SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint_fast32_t u32f 
SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #else - SIMDE_AVX512_ALIGN int8_t i8[64]; - SIMDE_AVX512_ALIGN int16_t i16[32]; - SIMDE_AVX512_ALIGN int32_t i32[16]; - SIMDE_AVX512_ALIGN int64_t i64[8]; - SIMDE_AVX512_ALIGN uint8_t u8[64]; - SIMDE_AVX512_ALIGN uint16_t u16[32]; - SIMDE_AVX512_ALIGN uint32_t u32[16]; - SIMDE_AVX512_ALIGN uint64_t u64[8]; - SIMDE_AVX512_ALIGN int_fast32_t i32f[64 / sizeof(int_fast32_t)]; - SIMDE_AVX512_ALIGN uint_fast32_t u32f[64 / sizeof(uint_fast32_t)]; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_AVX512_ALIGN simde_int128 i128[4]; - SIMDE_AVX512_ALIGN simde_uint128 u128[4]; - #endif - SIMDE_AVX512_ALIGN simde_float32 f32[16]; - SIMDE_AVX512_ALIGN simde_float64 f64[8]; - #endif - - SIMDE_AVX512_ALIGN simde__m128_private m128_private[4]; - SIMDE_AVX512_ALIGN simde__m128 m128[4]; - SIMDE_AVX512_ALIGN simde__m256_private m256_private[2]; - SIMDE_AVX512_ALIGN simde__m256 m256[2]; - - #if defined(SIMDE_X86_AVX512F_NATIVE) - SIMDE_AVX512_ALIGN __m512 n; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32[4]; - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[4]; - #endif - #endif -} simde__m512_private; - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_AVX512_ALIGN int8_t i8 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int16_t i16 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int32_t i32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int64_t i64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint8_t u8 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint16_t u16 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint32_t u32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint64_t u64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_AVX512_ALIGN simde_int128 i128 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN simde_uint128 u128 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #endif - SIMDE_AVX512_ALIGN simde_float32 f32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN simde_float64 f64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int_fast32_t i32f SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint_fast32_t u32f SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #else - SIMDE_AVX512_ALIGN int8_t i8[64]; - SIMDE_AVX512_ALIGN int16_t i16[32]; - SIMDE_AVX512_ALIGN int32_t i32[16]; - SIMDE_AVX512_ALIGN int64_t i64[8]; - SIMDE_AVX512_ALIGN uint8_t u8[64]; - SIMDE_AVX512_ALIGN uint16_t u16[32]; - SIMDE_AVX512_ALIGN uint32_t u32[16]; - SIMDE_AVX512_ALIGN uint64_t u64[8]; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_AVX512_ALIGN simde_int128 i128[4]; - SIMDE_AVX512_ALIGN simde_uint128 u128[4]; - #endif - SIMDE_AVX512_ALIGN simde_float32 f32[16]; - SIMDE_AVX512_ALIGN simde_float64 f64[8]; - SIMDE_AVX512_ALIGN int_fast32_t i32f[64 / sizeof(int_fast32_t)]; - 
SIMDE_AVX512_ALIGN uint_fast32_t u32f[64 / sizeof(uint_fast32_t)]; - #endif - - SIMDE_AVX512_ALIGN simde__m128d_private m128d_private[4]; - SIMDE_AVX512_ALIGN simde__m128d m128d[4]; - SIMDE_AVX512_ALIGN simde__m256d_private m256d_private[2]; - SIMDE_AVX512_ALIGN simde__m256d m256d[2]; - - #if defined(SIMDE_X86_AVX512F_NATIVE) - SIMDE_AVX512_ALIGN __m512d n; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32[4]; - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[4]; - #endif - #endif -} simde__m512d_private; - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_AVX512_ALIGN int8_t i8 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int16_t i16 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int32_t i32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int64_t i64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint8_t u8 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint16_t u16 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint32_t u32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint64_t u64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_AVX512_ALIGN simde_int128 i128 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN simde_uint128 u128 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #endif - #if defined(SIMDE_FLOAT16_VECTOR) - SIMDE_ALIGN_TO_16 simde_float16 f16 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #else - SIMDE_AVX512_ALIGN simde_float16 f16[32]; - #endif - SIMDE_AVX512_ALIGN simde_float32 f32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN simde_float64 f64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int_fast32_t i32f SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint_fast32_t u32f SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #else - SIMDE_AVX512_ALIGN int8_t i8[64]; - SIMDE_AVX512_ALIGN int16_t i16[32]; - SIMDE_AVX512_ALIGN int32_t i32[16]; - SIMDE_AVX512_ALIGN int64_t i64[8]; - SIMDE_AVX512_ALIGN uint8_t u8[64]; - SIMDE_AVX512_ALIGN uint16_t u16[32]; - SIMDE_AVX512_ALIGN uint32_t u32[16]; - SIMDE_AVX512_ALIGN uint64_t u64[8]; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_AVX512_ALIGN simde_int128 i128[4]; - SIMDE_AVX512_ALIGN simde_uint128 u128[4]; - #endif - SIMDE_AVX512_ALIGN simde_float16 f16[32]; - SIMDE_AVX512_ALIGN simde_float32 f32[16]; - SIMDE_AVX512_ALIGN simde_float64 f64[8]; - SIMDE_AVX512_ALIGN int_fast32_t i32f[64 / sizeof(int_fast32_t)]; - SIMDE_AVX512_ALIGN uint_fast32_t u32f[64 / sizeof(uint_fast32_t)]; - #endif - - SIMDE_AVX512_ALIGN simde__m128d_private m128d_private[4]; - SIMDE_AVX512_ALIGN simde__m128d m128d[4]; - SIMDE_AVX512_ALIGN simde__m256d_private m256d_private[2]; - SIMDE_AVX512_ALIGN simde__m256d m256d[2]; - - #if defined(SIMDE_X86_AVX512FP16_NATIVE) - SIMDE_AVX512_ALIGN __m512h n; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) 
- SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32[4]; - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[4]; - #endif - #endif -} simde__m512h_private; - - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_AVX512_ALIGN int8_t i8 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int16_t i16 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int32_t i32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int64_t i64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint8_t u8 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint16_t u16 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint32_t u32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint64_t u64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_AVX512_ALIGN simde_int128 i128 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN simde_uint128 u128 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #endif - SIMDE_AVX512_ALIGN simde_float32 f32 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN simde_float64 f64 SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN int_fast32_t i32f SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - SIMDE_AVX512_ALIGN uint_fast32_t u32f SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #else - SIMDE_AVX512_ALIGN int8_t i8[64]; - SIMDE_AVX512_ALIGN int16_t i16[32]; - SIMDE_AVX512_ALIGN int32_t i32[16]; - SIMDE_AVX512_ALIGN int64_t i64[8]; - SIMDE_AVX512_ALIGN uint8_t u8[64]; - SIMDE_AVX512_ALIGN uint16_t u16[32]; - SIMDE_AVX512_ALIGN uint32_t u32[16]; - SIMDE_AVX512_ALIGN uint64_t u64[8]; - SIMDE_AVX512_ALIGN int_fast32_t i32f[64 / sizeof(int_fast32_t)]; - SIMDE_AVX512_ALIGN uint_fast32_t u32f[64 / sizeof(uint_fast32_t)]; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_AVX512_ALIGN simde_int128 i128[4]; - SIMDE_AVX512_ALIGN simde_uint128 u128[4]; - #endif - SIMDE_AVX512_ALIGN simde_float32 f32[16]; - SIMDE_AVX512_ALIGN simde_float64 f64[8]; - #endif - - SIMDE_AVX512_ALIGN simde__m128i_private m128i_private[4]; - SIMDE_AVX512_ALIGN simde__m128i m128i[4]; - SIMDE_AVX512_ALIGN simde__m256i_private m256i_private[2]; - SIMDE_AVX512_ALIGN simde__m256i m256i[2]; - - #if defined(SIMDE_X86_AVX512F_NATIVE) - SIMDE_AVX512_ALIGN __m512i n; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32[4]; - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - SIMDE_ALIGN_TO_16 
SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64[4]; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64[4]; - #endif - #endif -} simde__m512i_private; - -/* Intel uses the same header (immintrin.h) for everything AVX and - * later. If native aliases are enabled, and the machine has native - * support for AVX imintrin.h will already have been included, which - * means simde__m512* will already have been defined. So, even - * if the machine doesn't support AVX512F we need to use the native - * type; it has already been defined. - * - * However, we also can't just assume that including immintrin.h does - * actually define these. It could be a compiler which supports AVX - * but not AVX512F, such as GCC < 4.9 or VS < 2017. That's why we - * check to see if _MM_CMPINT_GE is defined; it's part of AVX512F, - * so we assume that if it's present AVX-512F has already been - * declared. - * - * Note that the choice of _MM_CMPINT_GE is deliberate; while GCC - * uses the preprocessor to define all the _MM_CMPINT_* members, - * in most compilers they are simply normal enum members. However, - * all compilers I've looked at use an object-like macro for - * _MM_CMPINT_GE, which is defined to _MM_CMPINT_NLT. _MM_CMPINT_NLT - * is included in case a compiler does the reverse, though I haven't - * run into one which does. - * - * As for the ICC check, unlike other compilers, merely using the - * AVX-512 types causes ICC to generate AVX-512 instructions. */ -#if (defined(_MM_CMPINT_GE) || defined(_MM_CMPINT_NLT)) && \ - (defined(SIMDE_X86_AVX512F_NATIVE) || \ - !(defined(HEDLEY_INTEL_VERSION) || (defined(HEDLEY_MSVC_VERSION) && !defined(__clang__)))) - typedef __m512 simde__m512; - typedef __m512i simde__m512i; - typedef __m512d simde__m512d; - - typedef __mmask8 simde__mmask8; - typedef __mmask16 simde__mmask16; -#else - #if defined(SIMDE_VECTOR_SUBSCRIPT) - typedef simde_float32 simde__m512 SIMDE_AVX512_ALIGN SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - typedef int_fast32_t simde__m512i SIMDE_AVX512_ALIGN SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - typedef simde_float64 simde__m512d SIMDE_AVX512_ALIGN SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #else - typedef simde__m512_private simde__m512; - typedef simde__m512i_private simde__m512i; - typedef simde__m512d_private simde__m512d; - #endif - - typedef uint8_t simde__mmask8; - typedef uint16_t simde__mmask16; -#endif - -#if (defined(_AVX512BF16INTRIN_H_INCLUDED) || defined(__AVX512BF16INTRIN_H)) && (defined(SIMDE_X86_AVX512BF16_NATIVE) || !defined(HEDLEY_INTEL_VERSION)) - typedef __m128bh simde__m128bh; - typedef __m256bh simde__m256bh; - typedef __m512bh simde__m512bh; -#else - #if defined(SIMDE_VECTOR_SUBSCRIPT) - typedef simde_float32 simde__m128bh SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - typedef simde_float32 simde__m256bh SIMDE_ALIGN_TO_32 SIMDE_VECTOR(32) SIMDE_MAY_ALIAS; - typedef simde_float32 simde__m512bh SIMDE_AVX512_ALIGN SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #else - typedef simde__m128bh_private simde__m128bh; - typedef simde__m256bh_private simde__m256bh; - typedef simde__m512bh_private simde__m512bh; - #endif -#endif - -#if defined(SIMDE_X86_AVX512FP16_NATIVE) - typedef __m512h simde__m512h; -#else - #if defined(SIMDE_VECTOR_SUBSCRIPT) && defined(SIMDE_FLOAT16_VECTOR) - typedef simde_float16 simde__m512h SIMDE_AVX512_ALIGN SIMDE_VECTOR(64) SIMDE_MAY_ALIAS; - #else - typedef simde__m512h_private simde__m512h; - #endif -#endif - -/* These are really part 
of AVX-512VL / AVX-512BW (in GCC __mmask32 is - * in avx512vlintrin.h and __mmask64 is in avx512bwintrin.h, in clang - * both are in avx512bwintrin.h), not AVX-512F. However, we don't have - * a good (not-compiler-specific) way to detect if these headers have - * been included. In compilers which support AVX-512F but not - * AVX-512BW/VL (e.g., GCC 4.9) we need typedefs since __mmask{32,64) - * won't exist. - * - * AFAICT __mmask{32,64} are always just typedefs to uint{32,64}_t - * in all compilers, so it's safe to use these instead of typedefs to - * __mmask{16,32}. If you run into a problem with this please file an - * issue and we'll try to figure out a work-around. */ -typedef uint32_t simde__mmask32; -typedef uint64_t simde__mmask64; -#if !defined(__mmask16) && defined(SIMDE_ENABLE_NATIVE_ALIASES) - #if !defined(HEDLEY_INTEL_VERSION) - typedef uint16_t __mmask16; - #else - #define __mmask16 uint16_t; - #endif -#endif -#if !defined(__mmask32) && defined(SIMDE_ENABLE_NATIVE_ALIASES) - #if !defined(HEDLEY_INTEL_VERSION) - typedef uint32_t __mmask32; - #else - #define __mmask32 uint32_t; - #endif -#endif -#if !defined(__mmask64) && defined(SIMDE_ENABLE_NATIVE_ALIASES) - #if !defined(HEDLEY_INTEL_VERSION) - #if defined(HEDLEY_GCC_VERSION) - typedef unsigned long long __mmask64; - #else - typedef uint64_t __mmask64; - #endif - #else - #define __mmask64 uint64_t; - #endif -#endif - -#if !defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) - #if !defined(HEDLEY_INTEL_VERSION) - typedef simde__m512 __m512; - typedef simde__m512i __m512i; - typedef simde__m512d __m512d; - #else - #define __m512 simde__m512 - #define __m512i simde__m512i - #define __m512d simde__m512d - #endif -#endif - -#if !defined(SIMDE_X86_AVX512BF16_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) - #if !defined(HEDLEY_INTEL_VERSION) - typedef simde__m128bh __m128bh; - typedef simde__m256bh __m256bh; - typedef simde__m512bh __m512bh; - #else - #define __m128bh simde__m128bh - #define __m256bh simde__m256bh - #define __m512bh simde__m512bh - #endif -#endif - -#if !defined(SIMDE_X86_AVX512FP16_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) - #if !defined(HEDLEY_INTEL_VERSION) - //typedef simde__m128h __m128h; - //typedef simde__m256h __m256h; - typedef simde__m512h __m512h; - #else - //#define __m128h simde__m128h - //#define __m256h simde__m256h - #define __m512h simde__m512h - #endif -#endif - -HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128bh), "simde__m128bh size incorrect"); -HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128bh_private), "simde__m128bh_private size incorrect"); -HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256bh), "simde__m256bh size incorrect"); -HEDLEY_STATIC_ASSERT(32 == sizeof(simde__m256bh_private), "simde__m256bh_private size incorrect"); -HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512bh), "simde__m512bh size incorrect"); -HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512bh_private), "simde__m512bh_private size incorrect"); -HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512), "simde__m512 size incorrect"); -HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512_private), "simde__m512_private size incorrect"); -HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512i), "simde__m512i size incorrect"); -HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512i_private), "simde__m512i_private size incorrect"); -HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512d), "simde__m512d size incorrect"); -HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512d_private), "simde__m512d_private size incorrect"); -HEDLEY_STATIC_ASSERT(64 == 
sizeof(simde__m512h), "simde__m512h size incorrect"); -HEDLEY_STATIC_ASSERT(64 == sizeof(simde__m512h_private), "simde__m512h_private size incorrect"); -#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF) -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128bh) == 16, "simde__m128bh is not 16-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128bh_private) == 16, "simde__m128bh_private is not 16-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256bh) == 32, "simde__m256bh is not 16-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m256bh_private) == 32, "simde__m256bh_private is not 16-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512bh) == 32, "simde__m512bh is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512bh_private) == 32, "simde__m512bh_private is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512) == 32, "simde__m512 is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512_private) == 32, "simde__m512_private is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512i) == 32, "simde__m512i is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512i_private) == 32, "simde__m512i_private is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512d) == 32, "simde__m512d is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512d_private) == 32, "simde__m512d_private is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512h) == 32, "simde__m512h is not 32-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m512h_private) == 32, "simde__m512h_private is not 32-byte aligned"); -#endif - -#define SIMDE_MM_CMPINT_EQ 0 -#define SIMDE_MM_CMPINT_LT 1 -#define SIMDE_MM_CMPINT_LE 2 -#define SIMDE_MM_CMPINT_FALSE 3 -#define SIMDE_MM_CMPINT_NE 4 -#define SIMDE_MM_CMPINT_NLT 5 -#define SIMDE_MM_CMPINT_NLE 6 -#define SIMDE_MM_CMPINT_TRUE 7 -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && !defined(_MM_CMPINT_EQ) -#define _MM_CMPINT_EQ SIMDE_MM_CMPINT_EQ -#define _MM_CMPINT_LT SIMDE_MM_CMPINT_LT -#define _MM_CMPINT_LE SIMDE_MM_CMPINT_LE -#define _MM_CMPINT_FALSE SIMDE_MM_CMPINT_FALSE -#define _MM_CMPINT_NE SIMDE_MM_CMPINT_NE -#define _MM_CMPINT_NLT SIMDE_MM_CMPINT_NLT -#define _MM_CMPINT_NLE SIMDE_MM_CMPINT_NLE -#define _MM_CMPINT_TRUE SIMDE_CMPINT_TRUE -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128bh -simde__m128bh_from_private(simde__m128bh_private v) { - simde__m128bh r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128bh_private -simde__m128bh_to_private(simde__m128bh v) { - simde__m128bh_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256bh -simde__m256bh_from_private(simde__m256bh_private v) { - simde__m256bh r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256bh_private -simde__m256bh_to_private(simde__m256bh v) { - simde__m256bh_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512bh -simde__m512bh_from_private(simde__m512bh_private v) { - simde__m512bh r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512bh_private -simde__m512bh_to_private(simde__m512bh v) { - simde__m512bh_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde__m512_from_private(simde__m512_private v) { - simde__m512 r; - 
simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512_private -simde__m512_to_private(simde__m512 v) { - simde__m512_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde__m512i_from_private(simde__m512i_private v) { - simde__m512i r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i_private -simde__m512i_to_private(simde__m512i v) { - simde__m512i_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde__m512d_from_private(simde__m512d_private v) { - simde__m512d r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d_private -simde__m512d_to_private(simde__m512d v) { - simde__m512d_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512h -simde__m512h_from_private(simde__m512h_private v) { - simde__m512h r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512h_private -simde__m512h_to_private(simde__m512h v) { - simde__m512h_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_TYPES_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/unpackhi.h b/ffi-deps/simde/simde/x86/avx512/unpackhi.h deleted file mode 100644 index a67a153..0000000 --- a/ffi-deps/simde/simde/x86/avx512/unpackhi.h +++ /dev/null @@ -1,753 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_UNPACKHI_H) -#define SIMDE_X86_AVX512_UNPACKHI_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_unpackhi_epi8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_unpackhi_epi8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 64, a_.i8, b_.i8, - 8, 72, 9, 73, 10, 74, 11, 75, - 12, 76, 13, 77, 14, 78, 15, 79, - 24, 88, 25, 89, 26, 90, 27, 91, - 28, 92, 29, 93, 30, 94, 31, 95, - 40, 104, 41, 105, 42, 106, 43, 107, - 44, 108, 45, 109, 46, 110, 47, 111, - 56, 120, 57, 121, 58, 122, 59, 123, - 60, 124, 61, 125, 62, 126, 63, 127); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256i[0] = simde_mm256_unpackhi_epi8(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_unpackhi_epi8(a_.m256i[1], b_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0]) / 2) ; i++) { - r_.i8[2 * i] = a_.i8[i + 8 + ~(~i | 7)]; - r_.i8[2 * i + 1] = b_.i8[i + 8 + ~(~i | 7)]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_unpackhi_epi8 - #define _mm512_unpackhi_epi8(a, b) simde_mm512_unpackhi_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_unpackhi_epi8(simde__m512i src, simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_unpackhi_epi8(src, k, a, b); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_unpackhi_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_unpackhi_epi8 - #define _mm512_mask_unpackhi_epi8(src, k, a, b) simde_mm512_mask_unpackhi_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_unpackhi_epi8(simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_unpackhi_epi8(k, a, b); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_unpackhi_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_unpackhi_epi8 - #define _mm512_maskz_unpackhi_epi8(k, a, b) simde_mm512_maskz_unpackhi_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_unpackhi_epi8(simde__m256i src, simde__mmask32 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_unpackhi_epi8(src, k, a, b); - #else - return simde_mm256_mask_mov_epi8(src, k, simde_mm256_unpackhi_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_unpackhi_epi8 - #define _mm256_mask_unpackhi_epi8(src, k, a, b) simde_mm256_mask_unpackhi_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_unpackhi_epi8(simde__mmask32 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_unpackhi_epi8(k, a, b); - #else - return simde_mm256_maskz_mov_epi8(k, simde_mm256_unpackhi_epi8(a, b)); - #endif -} -#if 
defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_unpackhi_epi8 - #define _mm256_maskz_unpackhi_epi8(k, a, b) simde_mm256_maskz_unpackhi_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_unpackhi_epi8(simde__m128i src, simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_unpackhi_epi8(src, k, a, b); - #else - return simde_mm_mask_mov_epi8(src, k, simde_mm_unpackhi_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_unpackhi_epi8 - #define _mm_mask_unpackhi_epi8(src, k, a, b) simde_mm_mask_unpackhi_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_unpackhi_epi8(simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_unpackhi_epi8(k, a, b); - #else - return simde_mm_maskz_mov_epi8(k, simde_mm_unpackhi_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_unpackhi_epi8 - #define _mm_maskz_unpackhi_epi8(k, a, b) simde_mm_maskz_unpackhi_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_unpackhi_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_unpackhi_epi16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 =SIMDE_SHUFFLE_VECTOR_(16, 64, a_.i16, b_.i16, - 4, 36, 5, 37, 6, 38, 7, 39, 12, 44, 13, 45, 14, 46, 15, 47, - 20, 52, 21, 53, 22, 54, 23, 55, 28, 60, 29, 61, 30, 62, 31, 63); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256i[0] = simde_mm256_unpackhi_epi16(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_unpackhi_epi16(a_.m256i[1], b_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0]) / 2) ; i++) { - r_.i16[2 * i] = a_.i16[i + 4 + ~(~i | 3)]; - r_.i16[2 * i + 1] = b_.i16[i + 4 + ~(~i | 3)]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_unpackhi_epi16 - #define _mm512_unpackhi_epi16(a, b) simde_mm512_unpackhi_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_unpackhi_epi16(simde__m512i src, simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_unpackhi_epi16(src, k, a, b); - #else - return simde_mm512_mask_mov_epi16(src, k, simde_mm512_unpackhi_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_unpackhi_epi16 - #define _mm512_mask_unpackhi_epi16(src, k, a, b) simde_mm512_mask_unpackhi_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_unpackhi_epi16(simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_unpackhi_epi16(k, a, b); - #else - return simde_mm512_maskz_mov_epi16(k, simde_mm512_unpackhi_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_unpackhi_epi16 - #define _mm512_maskz_unpackhi_epi16(k, a, b) simde_mm512_maskz_unpackhi_epi16(k, a, b) -#endif 
- -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_unpackhi_epi16(simde__m256i src, simde__mmask16 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_unpackhi_epi16(src, k, a, b); - #else - return simde_mm256_mask_mov_epi16(src, k, simde_mm256_unpackhi_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_unpackhi_epi16 - #define _mm256_mask_unpackhi_epi16(src, k, a, b) simde_mm256_mask_unpackhi_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_unpackhi_epi16(simde__mmask16 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_unpackhi_epi16(k, a, b); - #else - return simde_mm256_maskz_mov_epi16(k, simde_mm256_unpackhi_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_unpackhi_epi16 - #define _mm256_maskz_unpackhi_epi16(k, a, b) simde_mm256_maskz_unpackhi_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_unpackhi_epi16(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_unpackhi_epi16(src, k, a, b); - #else - return simde_mm_mask_mov_epi16(src, k, simde_mm_unpackhi_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_unpackhi_epi16 - #define _mm_mask_unpackhi_epi16(src, k, a, b) simde_mm_mask_unpackhi_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_unpackhi_epi16(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_unpackhi_epi16(k, a, b); - #else - return simde_mm_maskz_mov_epi16(k, simde_mm_unpackhi_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_unpackhi_epi16 - #define _mm_maskz_unpackhi_epi16(k, a, b) simde_mm_maskz_unpackhi_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_unpackhi_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_unpackhi_epi32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 64, a_.i32, b_.i32, - 2, 18, 3 , 19, 6, 22, 7, 23, - 10, 26, 11, 27, 14, 30, 15, 31); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256i[0] = simde_mm256_unpackhi_epi32(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_unpackhi_epi32(a_.m256i[1], b_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0]) / 2) ; i++) { - r_.i32[2 * i] = a_.i32[i + 2 + ~(~i | 1)]; - r_.i32[2 * i + 1] = b_.i32[i + 2 + ~(~i | 1)]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_unpackhi_epi32 - #define _mm512_unpackhi_epi32(a, b) simde_mm512_unpackhi_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_unpackhi_epi32(simde__m512i src, 
simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_unpackhi_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_unpackhi_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_unpackhi_epi32 - #define _mm512_mask_unpackhi_epi32(src, k, a, b) simde_mm512_mask_unpackhi_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_unpackhi_epi32(simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_unpackhi_epi32(k, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_unpackhi_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_unpackhi_epi32 - #define _mm512_maskz_unpackhi_epi32(k, a, b) simde_mm512_maskz_unpackhi_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_unpackhi_epi32(simde__m256i src, simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_unpackhi_epi32(src, k, a, b); - #else - return simde_mm256_mask_mov_epi32(src, k, simde_mm256_unpackhi_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_unpackhi_epi32 - #define _mm256_mask_unpackhi_epi32(src, k, a, b) simde_mm256_mask_unpackhi_epi32(src, k, a, b) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_unpackhi_epi32(simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_unpackhi_epi32(k, a, b); - #else - return simde_mm256_maskz_mov_epi32(k, simde_mm256_unpackhi_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_unpackhi_epi32 - #define _mm256_maskz_unpackhi_epi32(k, a, b) simde_mm256_maskz_unpackhi_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_unpackhi_epi32(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_unpackhi_epi32(src, k, a, b); - #else - return simde_mm_mask_mov_epi32(src, k, simde_mm_unpackhi_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_unpackhi_epi32 - #define _mm_mask_unpackhi_epi32(src, k, a, b) simde_mm_mask_unpackhi_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_unpackhi_epi32(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_unpackhi_epi32(k, a, b); - #else - return simde_mm_maskz_mov_epi32(k, simde_mm_unpackhi_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_unpackhi_epi32 - #define _mm_maskz_unpackhi_epi32(k, a, b) simde_mm_maskz_unpackhi_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_unpackhi_epi64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_unpackhi_epi64(a, b); - #else - simde__m512i_private - r_, - a_ = 
simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 64, a_.i64, b_.i64, 1, 9, 3, 11, 5, 13, 7, 15); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256i[0] = simde_mm256_unpackhi_epi64(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_unpackhi_epi64(a_.m256i[1], b_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0]) / 2) ; i++) { - r_.i64[2 * i] = a_.i64[2 * i + 1]; - r_.i64[2 * i + 1] = b_.i64[2 * i + 1]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_unpackhi_epi64 - #define _mm512_unpackhi_epi64(a, b) simde_mm512_unpackhi_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_unpackhi_epi64(simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_unpackhi_epi64(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_unpackhi_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_unpackhi_epi64 - #define _mm512_mask_unpackhi_epi64(src, k, a, b) simde_mm512_mask_unpackhi_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_unpackhi_epi64(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_unpackhi_epi64(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_unpackhi_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_unpackhi_epi64 - #define _mm512_maskz_unpackhi_epi64(k, a, b) simde_mm512_maskz_unpackhi_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_unpackhi_epi64(simde__m256i src, simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_unpackhi_epi64(src, k, a, b); - #else - return simde_mm256_mask_mov_epi64(src, k, simde_mm256_unpackhi_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_unpackhi_epi64 - #define _mm256_mask_unpackhi_epi64(src, k, a, b) simde_mm256_mask_unpackhi_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_unpackhi_epi64(simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_unpackhi_epi64(k, a, b); - #else - return simde_mm256_maskz_mov_epi64(k, simde_mm256_unpackhi_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_unpackhi_epi64 - #define _mm256_maskz_unpackhi_epi64(k, a, b) simde_mm256_maskz_unpackhi_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_unpackhi_epi64(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_unpackhi_epi64(src, k, a, b); - #else - return simde_mm_mask_mov_epi64(src, k, simde_mm_unpackhi_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_unpackhi_epi64 - #define 
_mm_mask_unpackhi_epi64(src, k, a, b) simde_mm_mask_unpackhi_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_unpackhi_epi64(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_unpackhi_epi64(k, a, b); - #else - return simde_mm_maskz_mov_epi64(k, simde_mm_unpackhi_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_unpackhi_epi64 - #define _mm_maskz_unpackhi_epi64(k, a, b) simde_mm_maskz_unpackhi_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_unpackhi_ps (simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_unpackhi_ps(a, b); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 64, a_.f32, b_.f32, - 2, 18, 3 , 19, 6, 22, 7, 23, - 10, 26, 11, 27, 14, 30, 15, 31); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256[0] = simde_mm256_unpackhi_ps(a_.m256[0], b_.m256[0]); - r_.m256[1] = simde_mm256_unpackhi_ps(a_.m256[1], b_.m256[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0]) / 2) ; i++) { - r_.f32[2 * i] = a_.f32[i + 2 + ~(~i | 1)]; - r_.f32[2 * i + 1] = b_.f32[i + 2 + ~(~i | 1)]; - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_unpackhi_ps - #define _mm512_unpackhi_ps(a, b) simde_mm512_unpackhi_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_unpackhi_ps(simde__m512 src, simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_unpackhi_ps(src, k, a, b); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_unpackhi_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_unpackhi_ps - #define _mm512_mask_unpackhi_ps(src, k, a, b) simde_mm512_mask_unpackhi_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_unpackhi_ps(simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_unpackhi_ps(k, a, b); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_unpackhi_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_unpackhi_ps - #define _mm512_maskz_unpackhi_ps(k, a, b) simde_mm512_maskz_unpackhi_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_mask_unpackhi_ps(simde__m256 src, simde__mmask8 k, simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_unpackhi_ps(src, k, a, b); - #else - return simde_mm256_mask_mov_ps(src, k, simde_mm256_unpackhi_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_unpackhi_ps - #define _mm256_mask_unpackhi_ps(src, k, a, b) simde_mm256_mask_unpackhi_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_maskz_unpackhi_ps(simde__mmask8 k, simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_unpackhi_ps(k, a, b); - #else - return simde_mm256_maskz_mov_ps(k, 
simde_mm256_unpackhi_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_unpackhi_ps - #define _mm256_maskz_unpackhi_ps(k, a, b) simde_mm256_maskz_unpackhi_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mask_unpackhi_ps(simde__m128 src, simde__mmask8 k, simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_unpackhi_ps(src, k, a, b); - #else - return simde_mm_mask_mov_ps(src, k, simde_mm_unpackhi_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_unpackhi_ps - #define _mm_mask_unpackhi_ps(src, k, a, b) simde_mm_mask_unpackhi_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_maskz_unpackhi_ps(simde__mmask8 k, simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_unpackhi_ps(k, a, b); - #else - return simde_mm_maskz_mov_ps(k, simde_mm_unpackhi_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_unpackhi_ps - #define _mm_maskz_unpackhi_ps(k, a, b) simde_mm_maskz_unpackhi_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_unpackhi_pd (simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_unpackhi_pd(a, b); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 64, a_.f64, b_.f64, 1, 9, 3, 11, 5, 13, 7, 15); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256d[0] = simde_mm256_unpackhi_pd(a_.m256d[0], b_.m256d[0]); - r_.m256d[1] = simde_mm256_unpackhi_pd(a_.m256d[1], b_.m256d[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0]) / 2) ; i++) { - r_.f64[2 * i] = a_.f64[2 * i + 1]; - r_.f64[2 * i + 1] = b_.f64[2 * i + 1]; - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_unpackhi_pd - #define _mm512_unpackhi_pd(a, b) simde_mm512_unpackhi_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_unpackhi_pd(simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_unpackhi_pd(src, k, a, b); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_unpackhi_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_unpackhi_pd - #define _mm512_mask_unpackhi_pd(src, k, a, b) simde_mm512_mask_unpackhi_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_unpackhi_pd(simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_unpackhi_pd(k, a, b); - #else - return simde_mm512_maskz_mov_pd(k, simde_mm512_unpackhi_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_unpackhi_pd - #define _mm512_maskz_unpackhi_pd(k, a, b) simde_mm512_maskz_unpackhi_pd(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_mask_unpackhi_pd(simde__m256d src, simde__mmask8 k, simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && 
defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_unpackhi_pd(src, k, a, b); - #else - return simde_mm256_mask_mov_pd(src, k, simde_mm256_unpackhi_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_unpackhi_pd - #define _mm256_mask_unpackhi_pd(src, k, a, b) simde_mm256_mask_unpackhi_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_maskz_unpackhi_pd(simde__mmask8 k, simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_unpackhi_pd(k, a, b); - #else - return simde_mm256_maskz_mov_pd(k, simde_mm256_unpackhi_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_unpackhi_pd - #define _mm256_maskz_unpackhi_pd(k, a, b) simde_mm256_maskz_unpackhi_pd(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mask_unpackhi_pd(simde__m128d src, simde__mmask8 k, simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_unpackhi_pd(src, k, a, b); - #else - return simde_mm_mask_mov_pd(src, k, simde_mm_unpackhi_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_unpackhi_pd - #define _mm_mask_unpackhi_pd(src, k, a, b) simde_mm_mask_unpackhi_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_maskz_unpackhi_pd(simde__mmask8 k, simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_unpackhi_pd(k, a, b); - #else - return simde_mm_maskz_mov_pd(k, simde_mm_unpackhi_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_unpackhi_pd - #define _mm_maskz_unpackhi_pd(k, a, b) simde_mm_maskz_unpackhi_pd(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_UNPACKHI_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/unpacklo.h b/ffi-deps/simde/simde/x86/avx512/unpacklo.h deleted file mode 100644 index efaa61e..0000000 --- a/ffi-deps/simde/simde/x86/avx512/unpacklo.h +++ /dev/null @@ -1,752 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_UNPACKLO_H) -#define SIMDE_X86_AVX512_UNPACKLO_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_unpacklo_epi8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_unpacklo_epi8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 64, a_.i8, b_.i8, - 0, 64, 1, 65, 2, 66, 3, 67, - 4, 68, 5, 69, 6, 70, 7, 71, - 16, 80, 17, 81, 18, 82, 19, 83, - 20, 84, 21, 85, 22, 86, 23, 87, - 32, 96, 33, 97, 34, 98, 35, 99, - 36, 100, 37, 101, 38, 102, 39, 103, - 48, 112, 49, 113, 50, 114, 51, 115, - 52, 116, 53, 117, 54, 118, 55, 119); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256i[0] = simde_mm256_unpacklo_epi8(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_unpacklo_epi8(a_.m256i[1], b_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0]) / 2) ; i++) { - r_.i8[2 * i] = a_.i8[i + ~(~i | 7)]; - r_.i8[2 * i + 1] = b_.i8[i + ~(~i | 7)]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_unpacklo_epi8 - #define _mm512_unpacklo_epi8(a, b) simde_mm512_unpacklo_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_unpacklo_epi8(simde__m512i src, simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_unpacklo_epi8(src, k, a, b); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_unpacklo_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_unpacklo_epi8 - #define _mm512_mask_unpacklo_epi8(src, k, a, b) simde_mm512_mask_unpacklo_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_unpacklo_epi8(simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_unpacklo_epi8(k, a, b); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_unpacklo_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_unpacklo_epi8 - #define _mm512_maskz_unpacklo_epi8(k, a, b) simde_mm512_maskz_unpacklo_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_unpacklo_epi8(simde__m256i src, simde__mmask32 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_unpacklo_epi8(src, k, a, b); - #else - return simde_mm256_mask_mov_epi8(src, k, simde_mm256_unpacklo_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_unpacklo_epi8 - #define _mm256_mask_unpacklo_epi8(src, k, a, b) simde_mm256_mask_unpacklo_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_unpacklo_epi8(simde__mmask32 k, 
simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_unpacklo_epi8(k, a, b); - #else - return simde_mm256_maskz_mov_epi8(k, simde_mm256_unpacklo_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_unpacklo_epi8 - #define _mm256_maskz_unpacklo_epi8(k, a, b) simde_mm256_maskz_unpacklo_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_unpacklo_epi8(simde__m128i src, simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_unpacklo_epi8(src, k, a, b); - #else - return simde_mm_mask_mov_epi8(src, k, simde_mm_unpacklo_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_unpacklo_epi8 - #define _mm_mask_unpacklo_epi8(src, k, a, b) simde_mm_mask_unpacklo_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_unpacklo_epi8(simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_unpacklo_epi8(k, a, b); - #else - return simde_mm_maskz_mov_epi8(k, simde_mm_unpacklo_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_unpacklo_epi8 - #define _mm_maskz_unpacklo_epi8(k, a, b) simde_mm_maskz_unpacklo_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_unpacklo_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_unpacklo_epi16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 64, a_.i16, b_.i16, - 0, 32, 1, 33, 2, 34, 3, 35, 8, 40, 9, 41, 10, 42, 11, 43, - 16, 48, 17, 49, 18, 50, 19, 51, 24, 56, 25, 57, 26, 58, 27, 59); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256i[0] = simde_mm256_unpacklo_epi16(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_unpacklo_epi16(a_.m256i[1], b_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0]) / 2) ; i++) { - r_.i16[2 * i] = a_.i16[i + ~(~i | 3)]; - r_.i16[2 * i + 1] = b_.i16[i + ~(~i | 3)]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_unpacklo_epi16 - #define _mm512_unpacklo_epi16(a, b) simde_mm512_unpacklo_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_unpacklo_epi16(simde__m512i src, simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_mask_unpacklo_epi16(src, k, a, b); - #else - return simde_mm512_mask_mov_epi16(src, k, simde_mm512_unpacklo_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_unpacklo_epi16 - #define _mm512_mask_unpacklo_epi16(src, k, a, b) simde_mm512_mask_unpacklo_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_unpacklo_epi16(simde__mmask32 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - return _mm512_maskz_unpacklo_epi16(k, a, b); - #else - return 
simde_mm512_maskz_mov_epi16(k, simde_mm512_unpacklo_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_unpacklo_epi16 - #define _mm512_maskz_unpacklo_epi16(k, a, b) simde_mm512_maskz_unpacklo_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_unpacklo_epi16(simde__m256i src, simde__mmask16 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_unpacklo_epi16(src, k, a, b); - #else - return simde_mm256_mask_mov_epi16(src, k, simde_mm256_unpacklo_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_unpacklo_epi16 - #define _mm256_mask_unpacklo_epi16(src, k, a, b) simde_mm256_mask_unpacklo_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_unpacklo_epi16(simde__mmask16 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_unpacklo_epi16(k, a, b); - #else - return simde_mm256_maskz_mov_epi16(k, simde_mm256_unpacklo_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_unpacklo_epi16 - #define _mm256_maskz_unpacklo_epi16(k, a, b) simde_mm256_maskz_unpacklo_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_unpacklo_epi16(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_unpacklo_epi16(src, k, a, b); - #else - return simde_mm_mask_mov_epi16(src, k, simde_mm_unpacklo_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_unpacklo_epi16 - #define _mm_mask_unpacklo_epi16(src, k, a, b) simde_mm_mask_unpacklo_epi16(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_unpacklo_epi16(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_unpacklo_epi16(k, a, b); - #else - return simde_mm_maskz_mov_epi16(k, simde_mm_unpacklo_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_unpacklo_epi16 - #define _mm_maskz_unpacklo_epi16(k, a, b) simde_mm_maskz_unpacklo_epi16(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_unpacklo_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_unpacklo_epi32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 64, a_.i32, b_.i32, - 0, 16, 1, 17, 4, 20, 5, 21, - 8, 24, 9, 25, 12, 28, 13, 29); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256i[0] = simde_mm256_unpacklo_epi32(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_unpacklo_epi32(a_.m256i[1], b_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0]) / 2) ; i++) { - r_.i32[2 * i] = a_.i32[i + ~(~i | 1)]; - r_.i32[2 * i + 1] = b_.i32[i + ~(~i | 1)]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if 
defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_unpacklo_epi32 - #define _mm512_unpacklo_epi32(a, b) simde_mm512_unpacklo_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_unpacklo_epi32(simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_unpacklo_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_unpacklo_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_unpacklo_epi32 - #define _mm512_mask_unpacklo_epi32(src, k, a, b) simde_mm512_mask_unpacklo_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_unpacklo_epi32(simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_unpacklo_epi32(k, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_unpacklo_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_unpacklo_epi32 - #define _mm512_maskz_unpacklo_epi32(k, a, b) simde_mm512_maskz_unpacklo_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_unpacklo_epi32(simde__m256i src, simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_unpacklo_epi32(src, k, a, b); - #else - return simde_mm256_mask_mov_epi32(src, k, simde_mm256_unpacklo_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_unpacklo_epi32 - #define _mm256_mask_unpacklo_epi32(src, k, a, b) simde_mm256_mask_unpacklo_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_unpacklo_epi32(simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_unpacklo_epi32(k, a, b); - #else - return simde_mm256_maskz_mov_epi32(k, simde_mm256_unpacklo_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_unpacklo_epi32 - #define _mm256_maskz_unpacklo_epi32(k, a, b) simde_mm256_maskz_unpacklo_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_unpacklo_epi32(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_unpacklo_epi32(src, k, a, b); - #else - return simde_mm_mask_mov_epi32(src, k, simde_mm_unpacklo_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_unpacklo_epi32 - #define _mm_mask_unpacklo_epi32(src, k, a, b) simde_mm_mask_unpacklo_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_unpacklo_epi32(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_unpacklo_epi32(k, a, b); - #else - return simde_mm_maskz_mov_epi32(k, simde_mm_unpacklo_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_unpacklo_epi32 - #define _mm_maskz_unpacklo_epi32(k, a, b) simde_mm_maskz_unpacklo_epi32(k, 
a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_unpacklo_epi64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_unpacklo_epi64(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 64, a_.i64, b_.i64, 0, 8, 2, 10, 4, 12, 6, 14); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256i[0] = simde_mm256_unpacklo_epi64(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_unpacklo_epi64(a_.m256i[1], b_.m256i[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0]) / 2) ; i++) { - r_.i64[2 * i] = a_.i64[2 * i]; - r_.i64[2 * i + 1] = b_.i64[2 * i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_unpacklo_epi64 - #define _mm512_unpacklo_epi64(a, b) simde_mm512_unpacklo_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_unpacklo_epi64(simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_unpacklo_epi64(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_unpacklo_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_unpacklo_epi64 - #define _mm512_mask_unpacklo_epi64(src, k, a, b) simde_mm512_mask_unpacklo_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_unpacklo_epi64(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_unpacklo_epi64(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_unpacklo_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_unpacklo_epi64 - #define _mm512_maskz_unpacklo_epi64(k, a, b) simde_mm512_maskz_unpacklo_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_unpacklo_epi64(simde__m256i src, simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_unpacklo_epi64(src, k, a, b); - #else - return simde_mm256_mask_mov_epi64(src, k, simde_mm256_unpacklo_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_unpacklo_epi64 - #define _mm256_mask_unpacklo_epi64(src, k, a, b) simde_mm256_mask_unpacklo_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_unpacklo_epi64(simde__mmask8 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_unpacklo_epi64(k, a, b); - #else - return simde_mm256_maskz_mov_epi64(k, simde_mm256_unpacklo_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_unpacklo_epi64 - #define _mm256_maskz_unpacklo_epi64(k, a, b) simde_mm256_maskz_unpacklo_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_unpacklo_epi64(simde__m128i src, simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_unpacklo_epi64(src, k, a, b); - #else - return 
simde_mm_mask_mov_epi64(src, k, simde_mm_unpacklo_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_unpacklo_epi64 - #define _mm_mask_unpacklo_epi64(src, k, a, b) simde_mm_mask_unpacklo_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_unpacklo_epi64(simde__mmask8 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_unpacklo_epi64(k, a, b); - #else - return simde_mm_maskz_mov_epi64(k, simde_mm_unpacklo_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_unpacklo_epi64 - #define _mm_maskz_unpacklo_epi64(k, a, b) simde_mm_maskz_unpacklo_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_unpacklo_ps (simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_unpacklo_ps(a, b); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 64, a_.f32, b_.f32, - 0, 16, 1, 17, 4, 20, 5, 21, - 8, 24, 9, 25, 12, 28, 13, 29); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256[0] = simde_mm256_unpacklo_ps(a_.m256[0], b_.m256[0]); - r_.m256[1] = simde_mm256_unpacklo_ps(a_.m256[1], b_.m256[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0]) / 2) ; i++) { - r_.f32[2 * i] = a_.f32[i + ~(~i | 1)]; - r_.f32[2 * i + 1] = b_.f32[i + ~(~i | 1)]; - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_unpacklo_ps - #define _mm512_unpacklo_ps(a, b) simde_mm512_unpacklo_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_unpacklo_ps(simde__m512 src, simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_unpacklo_ps(src, k, a, b); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_unpacklo_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_unpacklo_ps - #define _mm512_mask_unpacklo_ps(src, k, a, b) simde_mm512_mask_unpacklo_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_unpacklo_ps(simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_unpacklo_ps(k, a, b); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_unpacklo_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_unpacklo_ps - #define _mm512_maskz_unpacklo_ps(k, a, b) simde_mm512_maskz_unpacklo_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_mask_unpacklo_ps(simde__m256 src, simde__mmask8 k, simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_unpacklo_ps(src, k, a, b); - #else - return simde_mm256_mask_mov_ps(src, k, simde_mm256_unpacklo_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_unpacklo_ps - #define _mm256_mask_unpacklo_ps(src, k, a, b) simde_mm256_mask_unpacklo_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 
-simde_mm256_maskz_unpacklo_ps(simde__mmask8 k, simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_unpacklo_ps(k, a, b); - #else - return simde_mm256_maskz_mov_ps(k, simde_mm256_unpacklo_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_unpacklo_ps - #define _mm256_maskz_unpacklo_ps(k, a, b) simde_mm256_maskz_unpacklo_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mask_unpacklo_ps(simde__m128 src, simde__mmask8 k, simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_unpacklo_ps(src, k, a, b); - #else - return simde_mm_mask_mov_ps(src, k, simde_mm_unpacklo_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_unpacklo_ps - #define _mm_mask_unpacklo_ps(src, k, a, b) simde_mm_mask_unpacklo_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_maskz_unpacklo_ps(simde__mmask8 k, simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_unpacklo_ps(k, a, b); - #else - return simde_mm_maskz_mov_ps(k, simde_mm_unpacklo_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_unpacklo_ps - #define _mm_maskz_unpacklo_ps(k, a, b) simde_mm_maskz_unpacklo_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_unpacklo_pd (simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_unpacklo_pd(a, b); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b); - - #if defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 64, a_.f64, b_.f64, 0, 8, 2, 10, 4, 12, 6, 14); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256d[0] = simde_mm256_unpacklo_pd(a_.m256d[0], b_.m256d[0]); - r_.m256d[1] = simde_mm256_unpacklo_pd(a_.m256d[1], b_.m256d[1]); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0]) / 2) ; i++) { - r_.f64[2 * i] = a_.f64[2 * i]; - r_.f64[2 * i + 1] = b_.f64[2 * i]; - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_unpacklo_pd - #define _mm512_unpacklo_pd(a, b) simde_mm512_unpacklo_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_unpacklo_pd(simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_unpacklo_pd(src, k, a, b); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_unpacklo_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_unpacklo_pd - #define _mm512_mask_unpacklo_pd(src, k, a, b) simde_mm512_mask_unpacklo_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_unpacklo_pd(simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_unpacklo_pd(k, a, b); - #else - return simde_mm512_maskz_mov_pd(k, simde_mm512_unpacklo_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_unpacklo_pd - #define _mm512_maskz_unpacklo_pd(k, 
a, b) simde_mm512_maskz_unpacklo_pd(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_mask_unpacklo_pd(simde__m256d src, simde__mmask8 k, simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_unpacklo_pd(src, k, a, b); - #else - return simde_mm256_mask_mov_pd(src, k, simde_mm256_unpacklo_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_unpacklo_pd - #define _mm256_mask_unpacklo_pd(src, k, a, b) simde_mm256_mask_unpacklo_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_maskz_unpacklo_pd(simde__mmask8 k, simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_unpacklo_pd(k, a, b); - #else - return simde_mm256_maskz_mov_pd(k, simde_mm256_unpacklo_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_unpacklo_pd - #define _mm256_maskz_unpacklo_pd(k, a, b) simde_mm256_maskz_unpacklo_pd(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mask_unpacklo_pd(simde__m128d src, simde__mmask8 k, simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_unpacklo_pd(src, k, a, b); - #else - return simde_mm_mask_mov_pd(src, k, simde_mm_unpacklo_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_unpacklo_pd - #define _mm_mask_unpacklo_pd(src, k, a, b) simde_mm_mask_unpacklo_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_maskz_unpacklo_pd(simde__mmask8 k, simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_unpacklo_pd(k, a, b); - #else - return simde_mm_maskz_mov_pd(k, simde_mm_unpacklo_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_unpacklo_pd - #define _mm_maskz_unpacklo_pd(k, a, b) simde_mm_maskz_unpacklo_pd(k, a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_UNPACKLO_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/xor.h b/ffi-deps/simde/simde/x86/avx512/xor.h deleted file mode 100644 index 359ab1b..0000000 --- a/ffi-deps/simde/simde/x86/avx512/xor.h +++ /dev/null @@ -1,319 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_AVX512_XOR_H) -#define SIMDE_X86_AVX512_XOR_H - -#include "types.h" -#include "../avx2.h" -#include "mov.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_xor_ps (simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_xor_ps(a, b); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b); - - /* TODO: generate reduced case to give to Intel */ - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) && !defined(HEDLEY_INTEL_VERSION) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_xor_ps(a_.m256[i], b_.m256[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_CLANG_BAD_VI64_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i]; - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_xor_ps - #define _mm512_xor_ps(a, b) simde_mm512_xor_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_xor_ps(simde__m512 src, simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_mask_xor_ps(src, k, a, b); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_xor_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_xor_ps - #define _mm512_mask_xor_ps(src, k, a, b) simde_mm512_mask_xor_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_maskz_xor_ps(simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_maskz_xor_ps(k, a, b); - #else - return simde_mm512_maskz_mov_ps(k, simde_mm512_xor_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_xor_ps - #define _mm512_maskz_xor_ps(k, a, b) simde_mm512_maskz_xor_ps(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_xor_pd (simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_xor_pd(a, b); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - r_.m256d[0] = simde_mm256_xor_pd(a_.m256d[0], b_.m256d[0]); - r_.m256d[1] = simde_mm256_xor_pd(a_.m256d[1], b_.m256d[1]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_CLANG_BAD_VI64_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i]; - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_xor_pd - #define _mm512_xor_pd(a, b) simde_mm512_xor_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_xor_pd(simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m512d b) { 
- #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_mask_xor_pd(src, k, a, b); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_xor_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_xor_pd - #define _mm512_mask_xor_pd(src, k, a, b) simde_mm512_mask_xor_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_maskz_xor_pd(simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_AVX512DQ_NATIVE) - return _mm512_maskz_xor_pd(k, a, b); - #else - return simde_mm512_maskz_mov_pd(k, simde_mm512_xor_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512DQ_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_xor_pd - #define _mm512_maskz_xor_pd(k, a, b) simde_mm512_maskz_xor_pd(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_xor_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_xor_epi32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_xor_si256(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 ^ b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] ^ b_.i32[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_xor_epi32 - #define _mm512_xor_epi32(a, b) simde_mm512_xor_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_xor_epi32(simde__m512i src, simde__mmask16 k, simde__m512i v2, simde__m512i v3) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_xor_epi32(src, k, v2, v3); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_xor_epi32(v2, v3)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_xor_epi32 - #define _mm512_mask_xor_epi32(src, k, v2, v3) simde_mm512_mask_xor_epi32(src, k, v2, v3) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_xor_epi32(simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_xor_epi32(k, a, b); - #else - return simde_mm512_maskz_mov_epi32(k, simde_mm512_xor_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_xor_epi32 - #define _mm512_maskz_xor_epi32(k, a, b) simde_mm512_maskz_xor_epi32(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_xor_epi64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_xor_epi64(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_xor_si256(a_.m256i[i], b_.m256i[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_CLANG_BAD_VI64_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i]; - } - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_xor_epi64 - #define 
_mm512_xor_epi64(a, b) simde_mm512_xor_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_xor_epi64(simde__m512i src, simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_xor_epi64(src, k, a, b); - #else - return simde_mm512_mask_mov_epi64(src, k, simde_mm512_xor_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_xor_epi64 - #define _mm512_mask_xor_epi64(src, k, a, b) simde_mm512_mask_xor_epi64(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_xor_epi64(simde__mmask8 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_xor_epi64(k, a, b); - #else - return simde_mm512_maskz_mov_epi64(k, simde_mm512_xor_epi64(a, b)); - #endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_xor_epi64 - #define _mm512_maskz_xor_epi64(k, a, b) simde_mm512_maskz_xor_epi64(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_xor_si512 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_xor_si512(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_X86_AVX2_NATIVE) - r_.m256i[0] = simde_mm256_xor_si256(a_.m256i[0], b_.m256i[0]); - r_.m256i[1] = simde_mm256_xor_si256(a_.m256i[1], b_.m256i[1]); - #elif defined(SIMDE_X86_SSE2_NATIVE) - r_.m128i[0] = simde_mm_xor_si128(a_.m128i[0], b_.m128i[0]); - r_.m128i[1] = simde_mm_xor_si128(a_.m128i[1], b_.m128i[1]); - r_.m128i[2] = simde_mm_xor_si128(a_.m128i[2], b_.m128i[2]); - r_.m128i[3] = simde_mm_xor_si128(a_.m128i[3], b_.m128i[3]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i]; - } - #endif - - return simde__m512i_from_private(r_); -#endif -} -#if defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_xor_si512 - #define _mm512_xor_si512(a, b) simde_mm512_xor_si512(a, b) -#endif - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_XOR_H) */ diff --git a/ffi-deps/simde/simde/x86/avx512/xorsign.h b/ffi-deps/simde/simde/x86/avx512/xorsign.h deleted file mode 100644 index 38fb5f9..0000000 --- a/ffi-deps/simde/simde/x86/avx512/xorsign.h +++ /dev/null @@ -1,72 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - */ - -/* This is a SIMDe extension which is not part of AVX-512. It exists - * because a lot of numerical methods in SIMDe have algoriths which do - * something like: - * - * float sgn = input < 0 ? -1 : 1; - * ... - * return res * sgn; - * - * Which can be replaced with a much more efficient call to xorsign: - * - * return simde_x_mm512_xorsign_ps(res, input); - * - * While this was originally intended for use in SIMDe, please feel - * free to use it in your code. - */ - -#if !defined(SIMDE_X86_AVX512_XORSIGN_H) -#define SIMDE_X86_AVX512_XORSIGN_H - -#include "types.h" -#include "mov.h" -#include "and.h" -#include "xor.h" -#include "set1.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_x_mm512_xorsign_ps(simde__m512 dest, simde__m512 src) { - return simde_mm512_xor_ps(simde_mm512_and_ps(simde_mm512_set1_ps(-0.0f), src), dest); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_x_mm512_xorsign_pd(simde__m512d dest, simde__m512d src) { - return simde_mm512_xor_pd(simde_mm512_and_pd(simde_mm512_set1_pd(-0.0), src), dest); -} - -SIMDE_END_DECLS_ -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_AVX512_XORSIGN_H) */ diff --git a/ffi-deps/simde/simde/x86/clmul.h b/ffi-deps/simde/simde/x86/clmul.h deleted file mode 100644 index cd4a062..0000000 --- a/ffi-deps/simde/simde/x86/clmul.h +++ /dev/null @@ -1,387 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020 Evan Nemerson - * 2016 Thomas Pornin - */ - -/* The portable version is based on the implementation in BearSSL, - * which is MIT licensed, constant-time / branch-free, and documented - * at https://www.bearssl.org/constanttime.html (specifically, we use - * the implementation from ghash_ctmul64.c). 
*/ - -#if !defined(SIMDE_X86_CLMUL_H) -#define SIMDE_X86_CLMUL_H - -#include "avx512/set.h" -#include "avx512/setzero.h" - -#if !defined(SIMDE_X86_PCLMUL_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) -# define SIMDE_X86_PCLMUL_ENABLE_NATIVE_ALIASES -#endif - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -uint64_t -simde_x_clmul_u64(uint64_t x, uint64_t y) { - uint64_t x0, x1, x2, x3; - uint64_t y0, y1, y2, y3; - uint64_t z0, z1, z2, z3; - - x0 = x & UINT64_C(0x1111111111111111); - x1 = x & UINT64_C(0x2222222222222222); - x2 = x & UINT64_C(0x4444444444444444); - x3 = x & UINT64_C(0x8888888888888888); - y0 = y & UINT64_C(0x1111111111111111); - y1 = y & UINT64_C(0x2222222222222222); - y2 = y & UINT64_C(0x4444444444444444); - y3 = y & UINT64_C(0x8888888888888888); - - z0 = (x0 * y0) ^ (x1 * y3) ^ (x2 * y2) ^ (x3 * y1); - z1 = (x0 * y1) ^ (x1 * y0) ^ (x2 * y3) ^ (x3 * y2); - z2 = (x0 * y2) ^ (x1 * y1) ^ (x2 * y0) ^ (x3 * y3); - z3 = (x0 * y3) ^ (x1 * y2) ^ (x2 * y1) ^ (x3 * y0); - - z0 &= UINT64_C(0x1111111111111111); - z1 &= UINT64_C(0x2222222222222222); - z2 &= UINT64_C(0x4444444444444444); - z3 &= UINT64_C(0x8888888888888888); - - return z0 | z1 | z2 | z3; -} - -static uint64_t -simde_x_bitreverse_u64(uint64_t v) { - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint8x8_t bytes = vreinterpret_u8_u64(vmov_n_u64(v)); - bytes = vrbit_u8(bytes); - bytes = vrev64_u8(bytes); - return vget_lane_u64(vreinterpret_u64_u8(bytes), 0); - #elif defined(SIMDE_X86_GFNI_NATIVE) - /* I don't think there is (or likely will ever be) a CPU with GFNI - * but not pclmulq, but this may be useful for things other than - * _mm_clmulepi64_si128. */ - __m128i vec = _mm_cvtsi64_si128(HEDLEY_STATIC_CAST(int64_t, v)); - - /* Reverse bits within each byte */ - vec = _mm_gf2p8affine_epi64_epi8(vec, _mm_cvtsi64_si128(HEDLEY_STATIC_CAST(int64_t, UINT64_C(0x8040201008040201))), 0); - - /* Reverse bytes */ - #if defined(SIMDE_X86_SSSE3_NATIVE) - vec = _mm_shuffle_epi8(vec, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7)); - #else - vec = _mm_or_si128(_mm_slli_epi16(vec, 8), _mm_srli_epi16(vec, 8)); - vec = _mm_shufflelo_epi16(vec, _MM_SHUFFLE(0, 1, 2, 3)); - vec = _mm_shufflehi_epi16(vec, _MM_SHUFFLE(0, 1, 2, 3)); - #endif - - return HEDLEY_STATIC_CAST(uint64_t, _mm_cvtsi128_si64(vec)); - #elif HEDLEY_HAS_BUILTIN(__builtin_bitreverse64) - return __builtin_bitreverse64(v); - #else - v = ((v >> 1) & UINT64_C(0x5555555555555555)) | ((v & UINT64_C(0x5555555555555555)) << 1); - v = ((v >> 2) & UINT64_C(0x3333333333333333)) | ((v & UINT64_C(0x3333333333333333)) << 2); - v = ((v >> 4) & UINT64_C(0x0F0F0F0F0F0F0F0F)) | ((v & UINT64_C(0x0F0F0F0F0F0F0F0F)) << 4); - v = ((v >> 8) & UINT64_C(0x00FF00FF00FF00FF)) | ((v & UINT64_C(0x00FF00FF00FF00FF)) << 8); - v = ((v >> 16) & UINT64_C(0x0000FFFF0000FFFF)) | ((v & UINT64_C(0x0000FFFF0000FFFF)) << 16); - return (v >> 32) | (v << 32); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_clmulepi64_si128 (simde__m128i a, simde__m128i b, const int imm8) - SIMDE_REQUIRE_CONSTANT(imm8) { - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - r_; - - #if SIMDE_NATURAL_VECTOR_SIZE_GE(128) - #if defined(SIMDE_SHUFFLE_VECTOR_) - switch (imm8 & 0x11) { - case 0x00: - b_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, b_.u64, b_.u64, 0, 0); - a_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.u64, a_.u64, 0, 0); - break; - case 0x01: - b_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, b_.u64, b_.u64, 0, 0); - 
a_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.u64, a_.u64, 1, 1); - break; - case 0x10: - b_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, b_.u64, b_.u64, 1, 1); - a_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.u64, a_.u64, 0, 0); - break; - case 0x11: - b_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, b_.u64, b_.u64, 1, 1); - a_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.u64, a_.u64, 1, 1); - break; - } - #else - { - const uint64_t A = a_.u64[(imm8 ) & 1]; - const uint64_t B = b_.u64[(imm8 >> 4) & 1]; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - a_.u64[i] = A; - b_.u64[i] = B; - } - } - #endif - - simde__m128i_private reversed_; - { - #if defined(SIMDE_SHUFFLE_VECTOR_) - reversed_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.u64, b_.u64, 1, 3); - #else - reversed_.u64[0] = a_.u64[1]; - reversed_.u64[1] = b_.u64[1]; - #endif - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(reversed_.u64) / sizeof(reversed_.u64[0])) ; i++) { - reversed_.u64[i] = simde_x_bitreverse_u64(reversed_.u64[i]); - } - } - - #if defined(SIMDE_SHUFFLE_VECTOR_) - a_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.u64, reversed_.u64, 0, 2); - b_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 16, b_.u64, reversed_.u64, 1, 3); - #else - a_.u64[1] = reversed_.u64[0]; - b_.u64[1] = reversed_.u64[1]; - #endif - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(reversed_.u64) / sizeof(reversed_.u64[0])) ; i++) { - r_.u64[i] = simde_x_clmul_u64(a_.u64[i], b_.u64[i]); - } - - r_.u64[1] = simde_x_bitreverse_u64(r_.u64[1]) >> 1; - #else - r_.u64[0] = simde_x_clmul_u64( a_.u64[imm8 & 1], b_.u64[(imm8 >> 4) & 1]); - r_.u64[1] = simde_x_bitreverse_u64(simde_x_clmul_u64(simde_x_bitreverse_u64(a_.u64[imm8 & 1]), simde_x_bitreverse_u64(b_.u64[(imm8 >> 4) & 1]))) >> 1; - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_PCLMUL_NATIVE) - #if defined(HEDLEY_MCST_LCC_VERSION) - #define simde_mm_clmulepi64_si128(a, b, imm8) (__extension__ ({ \ - SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS \ - _mm_clmulepi64_si128((a), (b), (imm8)); \ - SIMDE_LCC_REVERT_DEPRECATED_WARNINGS \ - })) - #else - #define simde_mm_clmulepi64_si128(a, b, imm8) _mm_clmulepi64_si128(a, b, imm8) - #endif -#elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_AES) && !defined(__clang__) - #define simde_mm_clmulepi64_si128(a, b, imm8) \ - simde__m128i_from_neon_u64( \ - vreinterpretq_u64_p128( \ - vmull_p64( \ - vgetq_lane_p64(vreinterpretq_p64_u64(simde__m128i_to_neon_u64(a)), (imm8 ) & 1), \ - vgetq_lane_p64(vreinterpretq_p64_u64(simde__m128i_to_neon_u64(b)), (imm8 >> 4) & 1) \ - ) \ - ) \ - ) -#endif -#if defined(SIMDE_X86_PCLMUL_ENABLE_NATIVE_ALIASES) - #undef _mm_clmulepi64_si128 - #define _mm_clmulepi64_si128(a, b, imm8) simde_mm_clmulepi64_si128(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_clmulepi64_epi128 (simde__m256i a, simde__m256i b, const int imm8) - SIMDE_REQUIRE_CONSTANT(imm8) { - simde__m256i_private - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b), - r_; - - simde__m128i_private a_lo_, b_lo_, r_lo_, a_hi_, b_hi_, r_hi_; - - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION) - switch (imm8 & 0x01) { - case 0x00: - a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 0, 2); - break; - case 0x01: - a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 1, 3); - break; - } - switch (imm8 & 0x10) { - case 0x00: - b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 0, 2); - break; - case 0x10: - b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 1, 3); - 
break; - } - #else - a_lo_.u64[0] = a_.u64[((imm8 >> 0) & 1) + 0]; - a_lo_.u64[1] = a_.u64[((imm8 >> 0) & 1) + 2]; - b_lo_.u64[0] = b_.u64[((imm8 >> 4) & 1) + 0]; - b_lo_.u64[1] = b_.u64[((imm8 >> 4) & 1) + 2]; - #endif - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_hi_.u64) / sizeof(r_hi_.u64[0])) ; i++) { - a_hi_.u64[i] = simde_x_bitreverse_u64(a_lo_.u64[i]); - b_hi_.u64[i] = simde_x_bitreverse_u64(b_lo_.u64[i]); - - r_lo_.u64[i] = simde_x_clmul_u64(a_lo_.u64[i], b_lo_.u64[i]); - r_hi_.u64[i] = simde_x_clmul_u64(a_hi_.u64[i], b_hi_.u64[i]); - - r_hi_.u64[i] = simde_x_bitreverse_u64(r_hi_.u64[i]) >> 1; - } - - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION) - r_.u64 = __builtin_shufflevector(r_lo_.u64, r_hi_.u64, 0, 2, 1, 3); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_ = simde__m256i_to_private(simde_mm256_set_m128i(simde__m128i_from_private(r_hi_), simde__m128i_from_private(r_lo_))); - r_.u64 = SIMDE_SHUFFLE_VECTOR_(64, 32, r_.u64, r_.u64, 0, 2, 1, 3); - #else - r_.u64[0] = r_lo_.u64[0]; - r_.u64[1] = r_hi_.u64[0]; - r_.u64[2] = r_lo_.u64[1]; - r_.u64[3] = r_hi_.u64[1]; - #endif - - return simde__m256i_from_private(r_); -} -#if defined(SIMDE_X86_VPCLMULQDQ_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_clmulepi64_epi128(a, b, imm8) _mm256_clmulepi64_epi128(a, b, imm8) -#endif -#if defined(SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_clmulepi64_epi128 - #define _mm256_clmulepi64_epi128(a, b, imm8) simde_mm256_clmulepi64_epi128(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_clmulepi64_epi128 (simde__m512i a, simde__m512i b, const int imm8) - SIMDE_REQUIRE_CONSTANT(imm8) { - simde__m512i_private - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b), - r_; - - #if defined(HEDLEY_MSVC_VERSION) - r_ = simde__m512i_to_private(simde_mm512_setzero_si512()); - #endif - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - switch (imm8 & 0x11) { - case 0x00: - r_.m256i[0] = simde_mm256_clmulepi64_epi128(a_.m256i[0], b_.m256i[0], 0x00); - r_.m256i[1] = simde_mm256_clmulepi64_epi128(a_.m256i[1], b_.m256i[1], 0x00); - break; - case 0x01: - r_.m256i[0] = simde_mm256_clmulepi64_epi128(a_.m256i[0], b_.m256i[0], 0x01); - r_.m256i[1] = simde_mm256_clmulepi64_epi128(a_.m256i[1], b_.m256i[1], 0x01); - break; - case 0x10: - r_.m256i[0] = simde_mm256_clmulepi64_epi128(a_.m256i[0], b_.m256i[0], 0x10); - r_.m256i[1] = simde_mm256_clmulepi64_epi128(a_.m256i[1], b_.m256i[1], 0x10); - break; - case 0x11: - r_.m256i[0] = simde_mm256_clmulepi64_epi128(a_.m256i[0], b_.m256i[0], 0x11); - r_.m256i[1] = simde_mm256_clmulepi64_epi128(a_.m256i[1], b_.m256i[1], 0x11); - break; - } - #else - simde__m256i_private a_lo_, b_lo_, r_lo_, a_hi_, b_hi_, r_hi_; - - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION) - switch (imm8 & 0x01) { - case 0x00: - a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 0, 2, 4, 6); - break; - case 0x01: - a_lo_.u64 = __builtin_shufflevector(a_.u64, a_.u64, 1, 3, 5, 7); - break; - } - switch (imm8 & 0x10) { - case 0x00: - b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 0, 2, 4, 6); - break; - case 0x10: - b_lo_.u64 = __builtin_shufflevector(b_.u64, b_.u64, 1, 3, 5, 7); - break; - } - #else - a_lo_.u64[0] = a_.u64[((imm8 >> 0) & 1) + 0]; - a_lo_.u64[1] = a_.u64[((imm8 >> 0) & 1) + 2]; - a_lo_.u64[2] = a_.u64[((imm8 >> 0) & 1) + 4]; - a_lo_.u64[3] = a_.u64[((imm8 >> 0) & 1) + 6]; - b_lo_.u64[0] = b_.u64[((imm8 
>> 4) & 1) + 0]; - b_lo_.u64[1] = b_.u64[((imm8 >> 4) & 1) + 2]; - b_lo_.u64[2] = b_.u64[((imm8 >> 4) & 1) + 4]; - b_lo_.u64[3] = b_.u64[((imm8 >> 4) & 1) + 6]; - #endif - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_hi_.u64) / sizeof(r_hi_.u64[0])) ; i++) { - a_hi_.u64[i] = simde_x_bitreverse_u64(a_lo_.u64[i]); - b_hi_.u64[i] = simde_x_bitreverse_u64(b_lo_.u64[i]); - - r_lo_.u64[i] = simde_x_clmul_u64(a_lo_.u64[i], b_lo_.u64[i]); - r_hi_.u64[i] = simde_x_clmul_u64(a_hi_.u64[i], b_hi_.u64[i]); - - r_hi_.u64[i] = simde_x_bitreverse_u64(r_hi_.u64[i]) >> 1; - } - - #if HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && !defined(HEDLEY_IBM_VERSION) - r_.u64 = __builtin_shufflevector(r_lo_.u64, r_hi_.u64, 0, 4, 1, 5, 2, 6, 3, 7); - #else - r_.u64[0] = r_lo_.u64[0]; - r_.u64[1] = r_hi_.u64[0]; - r_.u64[2] = r_lo_.u64[1]; - r_.u64[3] = r_hi_.u64[1]; - r_.u64[4] = r_lo_.u64[2]; - r_.u64[5] = r_hi_.u64[2]; - r_.u64[6] = r_lo_.u64[3]; - r_.u64[7] = r_hi_.u64[3]; - #endif - #endif - - return simde__m512i_from_private(r_); -} -#if defined(SIMDE_X86_VPCLMULQDQ_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_clmulepi64_epi128(a, b, imm8) _mm512_clmulepi64_epi128(a, b, imm8) -#endif -#if defined(SIMDE_X86_VPCLMULQDQ_ENABLE_NATIVE_ALIASES) && defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_clmulepi64_epi128 - #define _mm512_clmulepi64_epi128(a, b, imm8) simde_mm512_clmulepi64_epi128(a, b, imm8) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_CLMUL_H) */ diff --git a/ffi-deps/simde/simde/x86/f16c.h b/ffi-deps/simde/simde/x86/f16c.h deleted file mode 100644 index 9522bf6..0000000 --- a/ffi-deps/simde/simde/x86/f16c.h +++ /dev/null @@ -1,172 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2021 Evan Nemerson - */ - -#include "../simde-common.h" -#include "../simde-math.h" -#include "../simde-f16.h" - -#if !defined(SIMDE_X86_F16C_H) -#define SIMDE_X86_F16C_H - -#include "avx.h" - -#if !defined(SIMDE_X86_PF16C_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) -# define SIMDE_X86_PF16C_ENABLE_NATIVE_ALIASES -#endif - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtps_ph(simde__m128 a, const int imm8) { - simde__m128_private a_ = simde__m128_to_private(a); - simde__m128i_private r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - - HEDLEY_STATIC_CAST(void, imm8); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - r_.neon_f16 = vcombine_f16(vcvt_f16_f32(a_.neon_f32), vdup_n_f16(SIMDE_FLOAT16_C(0.0))); - #elif defined(SIMDE_FLOAT16_VECTOR) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.f16[i] = simde_float16_from_float32(a_.f32[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.u16[i] = simde_float16_as_uint16(simde_float16_from_float32(a_.f32[i])); - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_F16C_NATIVE) - #define simde_mm_cvtps_ph(a, imm8) _mm_cvtps_ph(a, imm8) -#endif -#if defined(SIMDE_X86_F16C_ENABLE_NATIVE_ALIASES) - #define _mm_cvtps_ph(a, sae) simde_mm_cvtps_ph(a, sae) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtph_ps(simde__m128i a) { - #if defined(SIMDE_X86_F16C_NATIVE) - return _mm_cvtph_ps(a); - #else - simde__m128i_private a_ = simde__m128i_to_private(a); - simde__m128_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) - r_.neon_f32 = vcvt_f32_f16(vget_low_f16(a_.neon_f16)); - #elif defined(SIMDE_FLOAT16_VECTOR) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.f32[i] = simde_float16_to_float32(a_.f16[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.f32[i] = simde_float16_to_float32(simde_uint16_as_float16(a_.u16[i])); - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_F16C_ENABLE_NATIVE_ALIASES) - #define _mm_cvtph_ps(a) simde_mm_cvtph_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm256_cvtps_ph(simde__m256 a, const int imm8) { - simde__m256_private a_ = simde__m256_to_private(a); - simde__m128i_private r_; - - HEDLEY_STATIC_CAST(void, imm8); - - #if defined(SIMDE_FLOAT16_VECTOR) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.f16[i] = simde_float16_from_float32(a_.f32[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - r_.u16[i] = simde_float16_as_uint16(simde_float16_from_float32(a_.f32[i])); - } - #endif - - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_F16C_NATIVE) - #define simde_mm256_cvtps_ph(a, imm8) _mm256_cvtps_ph(a, imm8) -#endif -#if defined(SIMDE_X86_F16C_ENABLE_NATIVE_ALIASES) - #define _mm256_cvtps_ph(a, imm8) simde_mm256_cvtps_ph(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_cvtph_ps(simde__m128i a) { - #if defined(SIMDE_X86_F16C_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cvtph_ps(a); - #elif defined(SIMDE_X86_F16C_NATIVE) - return _mm256_setr_m128( - 
_mm_cvtph_ps(a), - _mm_cvtph_ps(_mm_castps_si128(_mm_permute_ps(_mm_castsi128_ps(a), 0xee))) - ); - #else - simde__m128i_private a_ = simde__m128i_to_private(a); - simde__m256_private r_; - - #if defined(SIMDE_FLOAT16_VECTOR) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_float16_to_float32(a_.f16[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_float16_to_float32(simde_uint16_as_float16(a_.u16[i])); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_F16C_ENABLE_NATIVE_ALIASES) - #define _mm256_cvtph_ps(a) simde_mm256_cvtph_ps(a) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_F16C_H) */ diff --git a/ffi-deps/simde/simde/x86/fma.h b/ffi-deps/simde/simde/x86/fma.h deleted file mode 100644 index 630efc5..0000000 --- a/ffi-deps/simde/simde/x86/fma.h +++ /dev/null @@ -1,732 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2019 Evan Nemerson - */ - -#if !defined(SIMDE_X86_FMA_H) -#define SIMDE_X86_FMA_H - -#include "avx.h" - -#if !defined(SIMDE_X86_FMA_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) -# define SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES -#endif - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fmadd_pd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fmadd_pd(a, b, c); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b), - c_ = simde__m128d_to_private(c), - r_; - - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_madd(a_.altivec_f64, b_.altivec_f64, c_.altivec_f64); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vfmaq_f64(c_.neon_f64, b_.neon_f64, a_.neon_f64); - #elif defined(simde_math_fma) && (defined(__FP_FAST_FMA) || defined(FP_FAST_FMA)) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_fma(a_.f64[i], b_.f64[i], c_.f64[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (a_.f64[i] * b_.f64[i]) + c_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmadd_pd - #define _mm_fmadd_pd(a, b, c) simde_mm_fmadd_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_fmadd_pd (simde__m256d a, simde__m256d b, simde__m256d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fmadd_pd(a, b, c); - #else - return simde_mm256_add_pd(simde_mm256_mul_pd(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fmadd_pd - #define _mm256_fmadd_pd(a, b, c) simde_mm256_fmadd_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fmadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fmadd_ps(a, b, c); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b), - c_ = simde__m128_to_private(c), - r_; - - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f32 = vec_madd(a_.altivec_f32, b_.altivec_f32, c_.altivec_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) - r_.neon_f32 = vfmaq_f32(c_.neon_f32, b_.neon_f32, a_.neon_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vmlaq_f32(c_.neon_f32, b_.neon_f32, a_.neon_f32); - #elif defined(simde_math_fmaf) && (defined(__FP_FAST_FMAF) || defined(FP_FAST_FMAF)) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_fmaf(a_.f32[i], b_.f32[i], c_.f32[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (a_.f32[i] * b_.f32[i]) + c_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmadd_ps - #define _mm_fmadd_ps(a, b, c) simde_mm_fmadd_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_fmadd_ps (simde__m256 a, simde__m256 b, simde__m256 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fmadd_ps(a, b, c); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(128) - simde__m256_private - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b), 
- c_ = simde__m256_to_private(c), - r_; - - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_fmadd_ps(a_.m128[i], b_.m128[i], c_.m128[i]); - } - - return simde__m256_from_private(r_); - #else - return simde_mm256_add_ps(simde_mm256_mul_ps(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fmadd_ps - #define _mm256_fmadd_ps(a, b, c) simde_mm256_fmadd_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fmadd_sd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT) - return _mm_fmadd_sd(a, b, c); - #else - return simde_mm_add_sd(simde_mm_mul_sd(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmadd_sd - #define _mm_fmadd_sd(a, b, c) simde_mm_fmadd_sd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fmadd_ss (simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT) - return _mm_fmadd_ss(a, b, c); - #else - return simde_mm_add_ss(simde_mm_mul_ss(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmadd_ss - #define _mm_fmadd_ss(a, b, c) simde_mm_fmadd_ss(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fmaddsub_pd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fmaddsub_pd(a, b, c); - #else - return simde_mm_addsub_pd(simde_mm_mul_pd(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmaddsub_pd - #define _mm_fmaddsub_pd(a, b, c) simde_mm_fmaddsub_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_fmaddsub_pd (simde__m256d a, simde__m256d b, simde__m256d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fmaddsub_pd(a, b, c); - #else - return simde_mm256_addsub_pd(simde_mm256_mul_pd(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fmaddsub_pd - #define _mm256_fmaddsub_pd(a, b, c) simde_mm256_fmaddsub_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fmaddsub_ps (simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fmaddsub_ps(a, b, c); - #else - return simde_mm_addsub_ps(simde_mm_mul_ps(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmaddsub_ps - #define _mm_fmaddsub_ps(a, b, c) simde_mm_fmaddsub_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_fmaddsub_ps (simde__m256 a, simde__m256 b, simde__m256 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fmaddsub_ps(a, b, c); - #else - return simde_mm256_addsub_ps(simde_mm256_mul_ps(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fmaddsub_ps - #define _mm256_fmaddsub_ps(a, b, c) simde_mm256_fmaddsub_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fmsub_pd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fmsub_pd(a, b, c); - #else - return simde_mm_sub_pd(simde_mm_mul_pd(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmsub_pd - #define _mm_fmsub_pd(a, b, c) simde_mm_fmsub_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_fmsub_pd (simde__m256d a, simde__m256d b, simde__m256d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - 
return _mm256_fmsub_pd(a, b, c); - #else - return simde_mm256_sub_pd(simde_mm256_mul_pd(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fmsub_pd - #define _mm256_fmsub_pd(a, b, c) simde_mm256_fmsub_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fmsub_ps (simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fmsub_ps(a, b, c); - #else - return simde_mm_sub_ps(simde_mm_mul_ps(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmsub_ps - #define _mm_fmsub_ps(a, b, c) simde_mm_fmsub_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_fmsub_ps (simde__m256 a, simde__m256 b, simde__m256 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fmsub_ps(a, b, c); - #else - return simde_mm256_sub_ps(simde_mm256_mul_ps(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fmsub_ps - #define _mm256_fmsub_ps(a, b, c) simde_mm256_fmsub_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fmsub_sd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT) - return _mm_fmsub_sd(a, b, c); - #else - return simde_mm_sub_sd(simde_mm_mul_sd(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmsub_sd - #define _mm_fmsub_sd(a, b, c) simde_mm_fmsub_sd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fmsub_ss (simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT) - return _mm_fmsub_ss(a, b, c); - #else - return simde_mm_sub_ss(simde_mm_mul_ss(a, b), c); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmsub_ss - #define _mm_fmsub_ss(a, b, c) simde_mm_fmsub_ss(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fmsubadd_pd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fmsubadd_pd(a, b, c); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b), - c_ = simde__m128d_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) { - r_.f64[ i ] = (a_.f64[ i ] * b_.f64[ i ]) + c_.f64[ i ]; - r_.f64[i + 1] = (a_.f64[i + 1] * b_.f64[i + 1]) - c_.f64[i + 1]; - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmsubadd_pd - #define _mm_fmsubadd_pd(a, b, c) simde_mm_fmsubadd_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_fmsubadd_pd (simde__m256d a, simde__m256d b, simde__m256d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fmsubadd_pd(a, b, c); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b), - c_ = simde__m256d_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) { - r_.f64[ i ] = (a_.f64[ i ] * b_.f64[ i ]) + c_.f64[ i ]; - r_.f64[i + 1] = (a_.f64[i + 1] * b_.f64[i + 1]) - c_.f64[i + 1]; - } - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fmsubadd_pd - #define _mm256_fmsubadd_pd(a, b, c) simde_mm256_fmsubadd_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fmsubadd_ps (simde__m128 a, simde__m128 
b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fmsubadd_ps(a, b, c); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b), - c_ = simde__m128_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) { - r_.f32[ i ] = (a_.f32[ i ] * b_.f32[ i ]) + c_.f32[ i ]; - r_.f32[i + 1] = (a_.f32[i + 1] * b_.f32[i + 1]) - c_.f32[i + 1]; - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fmsubadd_ps - #define _mm_fmsubadd_ps(a, b, c) simde_mm_fmsubadd_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_fmsubadd_ps (simde__m256 a, simde__m256 b, simde__m256 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fmsubadd_ps(a, b, c); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b), - c_ = simde__m256_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) { - r_.f32[ i ] = (a_.f32[ i ] * b_.f32[ i ]) + c_.f32[ i ]; - r_.f32[i + 1] = (a_.f32[i + 1] * b_.f32[i + 1]) - c_.f32[i + 1]; - } - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fmsubadd_ps - #define _mm256_fmsubadd_ps(a, b, c) simde_mm256_fmsubadd_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fnmadd_pd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fnmadd_pd(a, b, c); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b), - c_ = simde__m128d_to_private(c); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vfmsq_f64(c_.neon_f64, a_.neon_f64, b_.neon_f64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = -(a_.f64[i] * b_.f64[i]) + c_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fnmadd_pd - #define _mm_fnmadd_pd(a, b, c) simde_mm_fnmadd_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_fnmadd_pd (simde__m256d a, simde__m256d b, simde__m256d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fnmadd_pd(a, b, c); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b), - c_ = simde__m256d_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = -(a_.f64[i] * b_.f64[i]) + c_.f64[i]; - } - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fnmadd_pd - #define _mm256_fnmadd_pd(a, b, c) simde_mm256_fnmadd_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fnmadd_ps (simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fnmadd_ps(a, b, c); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b), - c_ = simde__m128_to_private(c); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_ARCH_ARM_FMA) - r_.neon_f32 = vfmsq_f32(c_.neon_f32, a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vmlsq_f32(c_.neon_f32, a_.neon_f32, b_.neon_f32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) 
; i++) { - r_.f32[i] = -(a_.f32[i] * b_.f32[i]) + c_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fnmadd_ps - #define _mm_fnmadd_ps(a, b, c) simde_mm_fnmadd_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_fnmadd_ps (simde__m256 a, simde__m256 b, simde__m256 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fnmadd_ps(a, b, c); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b), - c_ = simde__m256_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = -(a_.f32[i] * b_.f32[i]) + c_.f32[i]; - } - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fnmadd_ps - #define _mm256_fnmadd_ps(a, b, c) simde_mm256_fnmadd_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fnmadd_sd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT) - return _mm_fnmadd_sd(a, b, c); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b), - c_ = simde__m128d_to_private(c); - - r_ = a_; - r_.f64[0] = -(a_.f64[0] * b_.f64[0]) + c_.f64[0]; - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fnmadd_sd - #define _mm_fnmadd_sd(a, b, c) simde_mm_fnmadd_sd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fnmadd_ss (simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT) - return _mm_fnmadd_ss(a, b, c); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b), - c_ = simde__m128_to_private(c); - - r_ = a_; - r_.f32[0] = -(a_.f32[0] * b_.f32[0]) + c_.f32[0]; - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fnmadd_ss - #define _mm_fnmadd_ss(a, b, c) simde_mm_fnmadd_ss(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fnmsub_pd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fnmsub_pd(a, b, c); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b), - c_ = simde__m128d_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = -(a_.f64[i] * b_.f64[i]) - c_.f64[i]; - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fnmsub_pd - #define _mm_fnmsub_pd(a, b, c) simde_mm_fnmsub_pd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_fnmsub_pd (simde__m256d a, simde__m256d b, simde__m256d c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fnmsub_pd(a, b, c); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b), - c_ = simde__m256d_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = -(a_.f64[i] * b_.f64[i]) - c_.f64[i]; - } - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fnmsub_pd - #define _mm256_fnmsub_pd(a, b, c) simde_mm256_fnmsub_pd(a, b, c) 
-#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fnmsub_ps (simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm_fnmsub_ps(a, b, c); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b), - c_ = simde__m128_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = -(a_.f32[i] * b_.f32[i]) - c_.f32[i]; - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fnmsub_ps - #define _mm_fnmsub_ps(a, b, c) simde_mm_fnmsub_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_fnmsub_ps (simde__m256 a, simde__m256 b, simde__m256 c) { - #if defined(SIMDE_X86_FMA_NATIVE) - return _mm256_fnmsub_ps(a, b, c); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b), - c_ = simde__m256_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = -(a_.f32[i] * b_.f32[i]) - c_.f32[i]; - } - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm256_fnmsub_ps - #define _mm256_fnmsub_ps(a, b, c) simde_mm256_fnmsub_ps(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_fnmsub_sd (simde__m128d a, simde__m128d b, simde__m128d c) { - #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT) - return _mm_fnmsub_sd(a, b, c); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b), - c_ = simde__m128d_to_private(c); - - r_ = a_; - r_.f64[0] = -(a_.f64[0] * b_.f64[0]) - c_.f64[0]; - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fnmsub_sd - #define _mm_fnmsub_sd(a, b, c) simde_mm_fnmsub_sd(a, b, c) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_fnmsub_ss (simde__m128 a, simde__m128 b, simde__m128 c) { - #if defined(SIMDE_X86_FMA_NATIVE) && !defined(SIMDE_BUG_MCST_LCC_FMA_WRONG_RESULT) - return _mm_fnmsub_ss(a, b, c); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b), - c_ = simde__m128_to_private(c); - - r_ = simde__m128_to_private(a); - r_.f32[0] = -(a_.f32[0] * b_.f32[0]) - c_.f32[0]; - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_FMA_ENABLE_NATIVE_ALIASES) - #undef _mm_fnmsub_ss - #define _mm_fnmsub_ss(a, b, c) simde_mm_fnmsub_ss(a, b, c) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_FMA_H) */ diff --git a/ffi-deps/simde/simde/x86/gfni.h b/ffi-deps/simde/simde/x86/gfni.h deleted file mode 100644 index 5982a34..0000000 --- a/ffi-deps/simde/simde/x86/gfni.h +++ /dev/null @@ -1,1295 +0,0 @@ -/* Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2020-2021 Christopher Moore - * 2020 Evan Nemerson - */ - -#if !defined(SIMDE_X86_GFNI_H) -#define SIMDE_X86_GFNI_H - -#include "avx512/add.h" -#include "avx512/and.h" -#include "avx512/broadcast.h" -#include "avx512/cmpeq.h" -#include "avx512/cmpge.h" -#include "avx512/cmpgt.h" -#include "avx512/cmplt.h" -#include "avx512/extract.h" -#include "avx512/insert.h" -#include "avx512/kshift.h" -#include "avx512/mov.h" -#include "avx512/mov_mask.h" -#include "avx512/permutex2var.h" -#include "avx512/set.h" -#include "avx512/set1.h" -#include "avx512/setzero.h" -#include "avx512/shuffle.h" -#include "avx512/srli.h" -#include "avx512/test.h" -#include "avx512/xor.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -/* In all the *gf2p8affine* intrinsics the argument b must be a compile-time constant so we must use macros and simde_x_mm* helper functions */ - -/* N.B. The _mm*gf2p8affineinv_epi64_epi8 and _mm*gf2p8mul_epi8 intrinsics are for a Field Generator Polynomial (FGP) (aka reduction polynomial) of 0x11B */ -/* Only the _mm*gf2p8affine_epi64_epi8 intrinsics do not assume this specific FGP */ - -/* The field generator polynomial is 0x11B but we make the 0x100 bit implicit to fit inside 8 bits */ -#define SIMDE_X86_GFNI_FGP 0x1B - -/* Computing the inverse of a GF element is expensive so use this LUT for an FGP of 0x11B */ - -static const union { - uint8_t u8[256]; - simde__m128i m128i[16]; -} simde_x_gf2p8inverse_lut = { - { - 0x00, 0x01, 0x8d, 0xf6, 0xcb, 0x52, 0x7b, 0xd1, 0xe8, 0x4f, 0x29, 0xc0, 0xb0, 0xe1, 0xe5, 0xc7, - 0x74, 0xb4, 0xaa, 0x4b, 0x99, 0x2b, 0x60, 0x5f, 0x58, 0x3f, 0xfd, 0xcc, 0xff, 0x40, 0xee, 0xb2, - 0x3a, 0x6e, 0x5a, 0xf1, 0x55, 0x4d, 0xa8, 0xc9, 0xc1, 0x0a, 0x98, 0x15, 0x30, 0x44, 0xa2, 0xc2, - 0x2c, 0x45, 0x92, 0x6c, 0xf3, 0x39, 0x66, 0x42, 0xf2, 0x35, 0x20, 0x6f, 0x77, 0xbb, 0x59, 0x19, - 0x1d, 0xfe, 0x37, 0x67, 0x2d, 0x31, 0xf5, 0x69, 0xa7, 0x64, 0xab, 0x13, 0x54, 0x25, 0xe9, 0x09, - 0xed, 0x5c, 0x05, 0xca, 0x4c, 0x24, 0x87, 0xbf, 0x18, 0x3e, 0x22, 0xf0, 0x51, 0xec, 0x61, 0x17, - 0x16, 0x5e, 0xaf, 0xd3, 0x49, 0xa6, 0x36, 0x43, 0xf4, 0x47, 0x91, 0xdf, 0x33, 0x93, 0x21, 0x3b, - 0x79, 0xb7, 0x97, 0x85, 0x10, 0xb5, 0xba, 0x3c, 0xb6, 0x70, 0xd0, 0x06, 0xa1, 0xfa, 0x81, 0x82, - 0x83, 0x7e, 0x7f, 0x80, 0x96, 0x73, 0xbe, 0x56, 0x9b, 0x9e, 0x95, 0xd9, 0xf7, 0x02, 0xb9, 0xa4, - 0xde, 0x6a, 0x32, 0x6d, 0xd8, 0x8a, 0x84, 0x72, 0x2a, 0x14, 0x9f, 0x88, 0xf9, 0xdc, 0x89, 0x9a, - 0xfb, 0x7c, 0x2e, 0xc3, 0x8f, 0xb8, 0x65, 0x48, 0x26, 0xc8, 0x12, 0x4a, 0xce, 0xe7, 0xd2, 0x62, - 0x0c, 0xe0, 0x1f, 0xef, 0x11, 0x75, 0x78, 0x71, 0xa5, 0x8e, 0x76, 0x3d, 0xbd, 0xbc, 0x86, 0x57, - 0x0b, 0x28, 0x2f, 0xa3, 0xda, 0xd4, 0xe4, 0x0f, 0xa9, 0x27, 0x53, 0x04, 0x1b, 0xfc, 0xac, 0xe6, - 0x7a, 0x07, 0xae, 0x63, 0xc5, 0xdb, 0xe2, 0xea, 0x94, 0x8b, 0xc4, 0xd5, 0x9d, 0xf8, 0x90, 0x6b, - 0xb1, 0x0d, 0xd6, 0xeb, 0xc6, 0x0e, 0xcf, 0xad, 0x08, 0x4e, 0xd7, 0xe3, 0x5d, 0x50, 0x1e, 0xb3, - 0x5b, 0x23, 0x38, 0x34, 0x68, 0x46, 0x03, 0x8c, 0xdd, 0x9c, 0x7d, 0xa0, 0xcd, 
0x1a, 0x41, 0x1c - } -}; - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_gf2p8matrix_multiply_epi64_epi8 (simde__m128i x, simde__m128i A) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - const __m128i byte_select = _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1); - const __m128i zero = _mm_setzero_si128(); - __m128i r, a, p, X; - - a = _mm_shuffle_epi8(A, _mm_setr_epi8(7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8)); - X = x; - r = zero; - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 0 ; i < 8 ; i++) { - p = _mm_insert_epi16(zero, _mm_movemask_epi8(a), 0); - p = _mm_shuffle_epi8(p, byte_select); - p = _mm_and_si128(p, _mm_cmpgt_epi8(zero, X)); - r = _mm_xor_si128(r, p); - a = _mm_add_epi8(a, a); - X = _mm_add_epi8(X, X); - } - - return r; - #elif defined(SIMDE_X86_SSE2_NATIVE) - const __m128i zero = _mm_setzero_si128(); - __m128i r, a, p, X; - - a = _mm_shufflehi_epi16(A, (0 << 6) + (1 << 4) + (2 << 2) + (3 << 0)); - a = _mm_shufflelo_epi16(a, (0 << 6) + (1 << 4) + (2 << 2) + (3 << 0)); - a = _mm_or_si128(_mm_slli_epi16(a, 8), _mm_srli_epi16(a, 8)); - X = _mm_unpacklo_epi8(x, _mm_unpackhi_epi64(x, x)); - r = zero; - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 0 ; i < 8 ; i++) { - p = _mm_set1_epi16(HEDLEY_STATIC_CAST(short, _mm_movemask_epi8(a))); - p = _mm_and_si128(p, _mm_cmpgt_epi8(zero, X)); - r = _mm_xor_si128(r, p); - a = _mm_add_epi8(a, a); - X = _mm_add_epi8(X, X); - } - - return _mm_packus_epi16(_mm_srli_epi16(_mm_slli_epi16(r, 8), 8), _mm_srli_epi16(r, 8)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - static const uint8_t byte_interleave[16] = {0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15}; - static const uint8_t byte_deinterleave[16] = {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; - static const uint8_t mask_d[16] = {128, 128, 64, 64, 32, 32, 16, 16, 8, 8, 4, 4, 2, 2, 1, 1}; - const int8x16_t mask = vreinterpretq_s8_u8(vld1q_u8(mask_d)); - int8x16_t r, a, t, X; - - t = simde__m128i_to_neon_i8(A); - a = vqtbl1q_s8(t, vld1q_u8(byte_interleave)); - t = simde__m128i_to_neon_i8(x); - X = vqtbl1q_s8(t, vld1q_u8(byte_interleave)); - r = vdupq_n_s8(0); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 0 ; i < 8 ; i++) { - t = vshrq_n_s8(a, 7); - t = vandq_s8(t, mask); - t = vreinterpretq_s8_u16(vdupq_n_u16(vaddvq_u16(vreinterpretq_u16_s8(t)))); - t = vandq_s8(t, vshrq_n_s8(X, 7)); - r = veorq_s8(r, t); - a = vshlq_n_s8(a, 1); - X = vshlq_n_s8(X, 1); - } - - r = vqtbl1q_s8(r, vld1q_u8(byte_deinterleave)); - return simde__m128i_from_neon_i8(r); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - static const uint8_t mask_d[16] = {128, 64, 32, 16, 8, 4, 2, 1, 128, 64, 32, 16, 8, 4, 2, 1}; - const int8x16_t mask = vreinterpretq_s8_u8(vld1q_u8(mask_d)); - int8x16_t r, a, t, X; - int16x8_t t16; - int32x4_t t32; - - a = simde__m128i_to_neon_i8(A); - X = simde__m128i_to_neon_i8(x); - r = vdupq_n_s8(0); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 0 ; i < 8 ; i++) { - t = vshrq_n_s8(a, 7); - t = vandq_s8(t, mask); - t16 = vreinterpretq_s16_s8 (vorrq_s8 (t , vrev64q_s8 (t ))); - t32 = vreinterpretq_s32_s16(vorrq_s16(t16, vrev64q_s16(t16))); - t = vreinterpretq_s8_s32 (vorrq_s32(t32, vrev64q_s32(t32))); - t = vandq_s8(t, vshrq_n_s8(X, 7)); - r = veorq_s8(r, t); - a = vshlq_n_s8(a, 1); - X = vshlq_n_s8(X, 1); - } - - return simde__m128i_from_neon_i8(r); - #elif defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - static const 
SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) byte_interleave = {0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15}; - static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) byte_deinterleave= {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; - static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) bit_select = {0, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120}; - static const SIMDE_POWER_ALTIVEC_VECTOR(signed char) zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) a, p, r; - SIMDE_POWER_ALTIVEC_VECTOR(signed char) X; - - X = simde__m128i_to_altivec_i8(x); - a = simde__m128i_to_altivec_u8(A); - X = vec_perm(X, X, byte_interleave); - r = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), zero); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 0 ; i < 8 ; i++) { - #if defined(SIMDE_BUG_CLANG_50932) - p = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), - vec_bperm(HEDLEY_STATIC_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned __int128), a), bit_select)); - #else - p = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_bperm_u128(a, bit_select)); - #endif - p = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), - vec_splat(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), p), 3)); - p &= X < zero; - r ^= p; - a += a; - X += X; - } - - r = vec_perm(r, r, byte_deinterleave); - return simde__m128i_from_altivec_u8(r); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) mask = {128, 64, 32, 16, 8, 4, 2, 1, 128, 64, 32, 16, 8, 4, 2, 1}; - static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) byte_select = {7, 7, 7, 7, 7, 7, 7, 7, 15, 15, 15, 15, 15, 15, 15, 15}; - static const SIMDE_POWER_ALTIVEC_VECTOR(signed char) zero = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) p, r; - SIMDE_POWER_ALTIVEC_VECTOR(signed char) a, X; - - X = simde__m128i_to_altivec_i8(x); - a = simde__m128i_to_altivec_i8(A); - r = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), zero); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 0 ; i < 8 ; i++) { - p = a < zero; - p &= mask; - p = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), - vec_sum2(vec_sum4(p, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), zero)), - HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), zero))); - p = vec_perm(p, p, byte_select); - p &= X < zero; - r ^= p; - a += a; - X += X; - } - - return simde__m128i_from_altivec_u8(r); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) byte_interleave = {0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15}; - static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) byte_deinterleave= {0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15}; - static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) bit_select = {64, 72, 80, 88, 96, 104, 112, 120, 0, 8, 16, 24, 32, 40, 48, 56}; - const SIMDE_POWER_ALTIVEC_VECTOR(signed char) zero = vec_splats(HEDLEY_STATIC_CAST(signed char, 0)); - SIMDE_POWER_ALTIVEC_VECTOR(signed char) X; - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) a, p, r; - - X = simde__m128i_to_altivec_i8(x); - a = simde__m128i_to_altivec_u8(A); - X = vec_perm(X, X, byte_interleave); - r = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), zero); - - #if 
!defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 0 ; i < 8 ; i++) { - #if defined(SIMDE_BUG_CLANG_50932) - p = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), - vec_bperm(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned __int128), a), bit_select)); - #else - p = vec_bperm(a, bit_select); - #endif - p = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), - vec_splat(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), p), 4)); - p = vec_and(p, vec_cmplt(X, zero)); - r = vec_xor(r, p); - a = vec_add(a, a); - X = vec_add(X, X); - } - - r = vec_perm(r, r, byte_deinterleave); - return simde__m128i_from_altivec_u8(r); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) mask = {128, 64, 32, 16, 8, 4, 2, 1, 128, 64, 32, 16, 8, 4, 2, 1}; - static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) byte_select = {4, 4, 4, 4, 4, 4, 4, 4, 12, 12, 12, 12, 12, 12, 12, 12}; - const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) sevens = vec_splats(HEDLEY_STATIC_CAST(unsigned char, 7)); - const SIMDE_POWER_ALTIVEC_VECTOR(signed char) zero = vec_splats(HEDLEY_STATIC_CAST(signed char, 0)); - SIMDE_POWER_ALTIVEC_VECTOR(signed char) X; - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) a, p, r; - - X = simde__m128i_to_altivec_i8(x); - a = simde__m128i_to_altivec_u8(A); - r = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), zero); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 0 ; i < 8 ; i++) { - p = vec_sr(a, sevens); - p = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), - vec_msum(p, - mask, - HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), zero))); - p = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), - vec_sum2s(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), p), - HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), zero))); - p = vec_perm(p, p, byte_select); - p = vec_and(p, vec_cmplt(X, zero)); - r = vec_xor(r, p); - a = vec_add(a, a); - X = vec_add(X, X); - } - - return simde__m128i_from_altivec_u8(r); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - const v128_t zero = wasm_i8x16_splat(0); - v128_t a, p, r, X; - - X = simde__m128i_to_wasm_v128(x); - a = simde__m128i_to_wasm_v128(A); - a = wasm_i8x16_shuffle(a, a, 7, 6, 5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); - X = wasm_i8x16_shuffle(X, X, 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15); - r = zero; - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 0 ; i < 8 ; i++) { - p = wasm_i16x8_splat(HEDLEY_STATIC_CAST(int16_t, wasm_i8x16_bitmask(a))); - p = wasm_v128_and(p, wasm_i8x16_lt(X, zero)); - r = wasm_v128_xor(r, p); - a = wasm_i8x16_add(a, a); - X = wasm_i8x16_add(X, X); - } - - r = wasm_i8x16_shuffle(r, r, 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15); - return simde__m128i_from_wasm_v128(r); - #else - simde__m128i_private - r_, - x_ = simde__m128i_to_private(x), - A_ = simde__m128i_to_private(A); - - const uint64_t ones = UINT64_C(0x0101010101010101); - const uint64_t mask = UINT64_C(0x0102040810204080); - uint64_t q; - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - q = simde_endian_bswap64_le(A_.u64[i / 8]); - q &= HEDLEY_STATIC_CAST(uint64_t, x_.u8[i]) * ones; - q ^= q >> 4; - q ^= q >> 2; - q ^= q >> 1; - q &= ones; - q *= 255; - q &= mask; - q |= q >> 32; - q |= q >> 16; - q 
|= q >> 8; - r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, q); - } - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_gf2p8matrix_multiply_epi64_epi8 (simde__m256i x, simde__m256i A) { - #if defined(SIMDE_X86_AVX2_NATIVE) - simde__m256i r, a, p; - const simde__m256i byte_select = simde_x_mm256_set_epu64x(UINT64_C(0x0303030303030303), UINT64_C(0x0202020202020202), - UINT64_C(0x0101010101010101), UINT64_C(0x0000000000000000)); - a = simde_mm256_shuffle_epi8(A, simde_mm256_broadcastsi128_si256(simde_x_mm_set_epu64x(UINT64_C(0x08090A0B0C0D0E0F), UINT64_C(0x0001020304050607)))); - r = simde_mm256_setzero_si256(); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 0 ; i < 8 ; i++) { - p = simde_mm256_set1_epi32(simde_mm256_movemask_epi8(a)); - p = simde_mm256_shuffle_epi8(p, byte_select); - p = simde_mm256_xor_si256(r, p); - r = simde_mm256_blendv_epi8(r, p, x); - a = simde_mm256_add_epi8(a, a); - x = simde_mm256_add_epi8(x, x); - } - - return r; - #else - simde__m256i_private - r_, - x_ = simde__m256i_to_private(x), - A_ = simde__m256i_to_private(A); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_gf2p8matrix_multiply_epi64_epi8(x_.m128i[i], A_.m128i[i]); - } - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_gf2p8matrix_multiply_epi64_epi8 (simde__m512i x, simde__m512i A) { - #if defined(SIMDE_X86_AVX512BW_NATIVE) - simde__m512i r, a, p; - const simde__m512i byte_select = simde_x_mm512_set_epu64(UINT64_C(0x0707070707070707), UINT64_C(0x0606060606060606), UINT64_C(0x0505050505050505), UINT64_C(0x0404040404040404), - UINT64_C(0x0303030303030303), UINT64_C(0x0202020202020202), UINT64_C(0x0101010101010101), UINT64_C(0X0000000000000000)); - a = simde_mm512_shuffle_epi8(A, simde_mm512_broadcast_i32x4(simde_x_mm_set_epu64x(UINT64_C(0x08090A0B0C0D0E0F), UINT64_C(0x0001020304050607)))); - r = simde_mm512_setzero_si512(); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 0 ; i < 8 ; i++) { - p = simde_mm512_set1_epi64(HEDLEY_STATIC_CAST(int64_t, simde_mm512_movepi8_mask(a))); - p = simde_mm512_maskz_shuffle_epi8(simde_mm512_movepi8_mask(x), p, byte_select); - r = simde_mm512_xor_si512(r, p); - a = simde_mm512_add_epi8(a, a); - x = simde_mm512_add_epi8(x, x); - } - - return r; - #else - simde__m512i_private - r_, - x_ = simde__m512i_to_private(x), - A_ = simde__m512i_to_private(A); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_gf2p8matrix_multiply_epi64_epi8(x_.m256i[i], A_.m256i[i]); - } - - return simde__m512i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_gf2p8inverse_epi8 (simde__m128i x) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - /* N.B. 
CM: this fallback may not be faster */ - simde__m128i r, u, t, test; - const simde__m128i sixteens = simde_mm_set1_epi8(16); - const simde__m128i masked_x = simde_mm_and_si128(x, simde_mm_set1_epi8(0x0F)); - - test = simde_mm_set1_epi8(INT8_MIN /* 0x80 */); - x = simde_mm_xor_si128(x, test); - r = simde_mm_shuffle_epi8(simde_x_gf2p8inverse_lut.m128i[0], masked_x); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 1 ; i < 16 ; i++) { - t = simde_mm_shuffle_epi8(simde_x_gf2p8inverse_lut.m128i[i], masked_x); - test = simde_mm_add_epi8(test, sixteens); - u = simde_mm_cmplt_epi8(x, test); - r = simde_mm_blendv_epi8(t, r, u); - } - - return r; - #else - simde__m128i_private - r_, - x_ = simde__m128i_to_private(x); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = simde_x_gf2p8inverse_lut.u8[x_.u8[i]]; - } - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_x_mm256_gf2p8inverse_epi8 (simde__m256i x) { - #if defined(SIMDE_X86_AVX2_NATIVE) - /* N.B. CM: this fallback may not be faster */ - simde__m256i r, u, t, test; - const simde__m256i sixteens = simde_mm256_set1_epi8(16); - const simde__m256i masked_x = simde_mm256_and_si256(x, simde_mm256_set1_epi8(0x0F)); - - test = simde_mm256_set1_epi8(INT8_MIN /* 0x80 */); - x = simde_mm256_xor_si256(x, test); - r = simde_mm256_shuffle_epi8(simde_mm256_broadcastsi128_si256(simde_x_gf2p8inverse_lut.m128i[0]), masked_x); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 1 ; i < 16 ; i++) { - t = simde_mm256_shuffle_epi8(simde_mm256_broadcastsi128_si256(simde_x_gf2p8inverse_lut.m128i[i]), masked_x); - test = simde_mm256_add_epi8(test, sixteens); - u = simde_mm256_cmpgt_epi8(test, x); - r = simde_mm256_blendv_epi8(t, r, u); - } - - return r; - #else - simde__m256i_private - r_, - x_ = simde__m256i_to_private(x); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_x_mm_gf2p8inverse_epi8(x_.m128i[i]); - } - - return simde__m256i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_x_mm512_gf2p8inverse_epi8 (simde__m512i x) { - /* N.B. CM: TODO: later add VBMI version using just two _mm512_permutex2var_epi8 and friends */ - /* But except for Cannon Lake all processors with VBMI also have GFNI */ - #if defined(SIMDE_X86_AVX512BW_NATIVE) - /* N.B. 
CM: this fallback may not be faster */ - simde__m512i r, test; - const simde__m512i sixteens = simde_mm512_set1_epi8(16); - const simde__m512i masked_x = simde_mm512_and_si512(x, simde_mm512_set1_epi8(0x0F)); - - r = simde_mm512_shuffle_epi8(simde_mm512_broadcast_i32x4(simde_x_gf2p8inverse_lut.m128i[0]), masked_x); - test = sixteens; - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 1 ; i < 16 ; i++) { - r = simde_mm512_mask_shuffle_epi8(r, simde_mm512_cmpge_epu8_mask(x, test), simde_mm512_broadcast_i32x4(simde_x_gf2p8inverse_lut.m128i[i]), masked_x); - test = simde_mm512_add_epi8(test, sixteens); - } - - return r; - #else - simde__m512i_private - r_, - x_ = simde__m512i_to_private(x); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_x_mm256_gf2p8inverse_epi8(x_.m256i[i]); - } - - return simde__m512i_from_private(r_); - #endif -} - -#define simde_x_mm_gf2p8matrix_multiply_inverse_epi64_epi8(x, A) simde_x_mm_gf2p8matrix_multiply_epi64_epi8(simde_x_mm_gf2p8inverse_epi8(x), A) -#define simde_x_mm256_gf2p8matrix_multiply_inverse_epi64_epi8(x, A) simde_x_mm256_gf2p8matrix_multiply_epi64_epi8(simde_x_mm256_gf2p8inverse_epi8(x), A) -#define simde_x_mm512_gf2p8matrix_multiply_inverse_epi64_epi8(x, A) simde_x_mm512_gf2p8matrix_multiply_epi64_epi8(simde_x_mm512_gf2p8inverse_epi8(x), A) - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_gf2p8affine_epi64_epi8 (simde__m128i x, simde__m128i A, int b) - SIMDE_REQUIRE_CONSTANT_RANGE(b, 0, 255) { - return simde_mm_xor_si128(simde_x_mm_gf2p8matrix_multiply_epi64_epi8(x, A), simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, b))); -} -#if defined(SIMDE_X86_GFNI_NATIVE) - #define simde_mm_gf2p8affine_epi64_epi8(x, A, b) _mm_gf2p8affine_epi64_epi8(x, A, b) -#endif -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) - #undef _mm_gf2p8affine_epi64_epi8 - #define _mm_gf2p8affine_epi64_epi8(x, A, b) simde_mm_gf2p8affine_epi64_epi8(x, A, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_gf2p8affine_epi64_epi8 (simde__m256i x, simde__m256i A, int b) - SIMDE_REQUIRE_CONSTANT_RANGE(b, 0, 255) { - return simde_mm256_xor_si256(simde_x_mm256_gf2p8matrix_multiply_epi64_epi8(x, A), simde_mm256_set1_epi8(HEDLEY_STATIC_CAST(int8_t, b))); -} -#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - #define simde_mm256_gf2p8affine_epi64_epi8(x, A, b) _mm256_gf2p8affine_epi64_epi8(x, A, b) -#endif -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) - #undef _mm256_gf2p8affine_epi64_epi8 - #define _mm256_gf2p8affine_epi64_epi8(x, A, b) simde_mm256_gf2p8affine_epi64_epi8(x, A, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_gf2p8affine_epi64_epi8 (simde__m512i x, simde__m512i A, int b) - SIMDE_REQUIRE_CONSTANT_RANGE(b, 0, 255) { - return simde_mm512_xor_si512(simde_x_mm512_gf2p8matrix_multiply_epi64_epi8(x, A), simde_mm512_set1_epi8(HEDLEY_STATIC_CAST(int8_t, b))); -} -#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_gf2p8affine_epi64_epi8(x, A, b) _mm512_gf2p8affine_epi64_epi8(x, A, b) -#endif -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) - #undef _mm512_gf2p8affine_epi64_epi8 - #define _mm512_gf2p8affine_epi64_epi8(x, A, b) simde_mm512_gf2p8affine_epi64_epi8(x, A, b) -#endif - -#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) _mm_mask_gf2p8affine_epi64_epi8(src, 
k, x, A, b) -#else - #define simde_mm_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) simde_mm_mask_mov_epi8(src, k, simde_mm_gf2p8affine_epi64_epi8(x, A, b)) -#endif -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_gf2p8affine_epi64_epi8 - #define _mm_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) simde_mm_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) -#endif - -#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) _mm256_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) -#else - #define simde_mm256_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) simde_mm256_mask_mov_epi8(src, k, simde_mm256_gf2p8affine_epi64_epi8(x, A, b)) -#endif -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_gf2p8affine_epi64_epi8 - #define _mm256_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) simde_mm256_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) -#endif - -#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) _mm512_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) -#else - #define simde_mm512_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) simde_mm512_mask_mov_epi8(src, k, simde_mm512_gf2p8affine_epi64_epi8(x, A, b)) -#endif -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_gf2p8affine_epi64_epi8 - #define _mm512_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) simde_mm512_mask_gf2p8affine_epi64_epi8(src, k, x, A, b) -#endif - -#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_maskz_gf2p8affine_epi64_epi8(k, x, A, b) _mm_maskz_gf2p8affine_epi64_epi8(k, x, A, b) -#else - #define simde_mm_maskz_gf2p8affine_epi64_epi8(k, x, A, b) simde_mm_maskz_mov_epi8(k, simde_mm_gf2p8affine_epi64_epi8(x, A, b)) -#endif -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_gf2p8affine_epi64_epi8 - #define _mm_maskz_gf2p8affine_epi64_epi8(k, x, A, b) simde_mm_maskz_gf2p8affine_epi64_epi8(k, x, A, b) -#endif - -#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_maskz_gf2p8affine_epi64_epi8(k, x, A, b) _mm256_maskz_gf2p8affine_epi64_epi8(k, x, A, b) -#else - #define simde_mm256_maskz_gf2p8affine_epi64_epi8(k, x, A, b) simde_mm256_maskz_mov_epi8(k, simde_mm256_gf2p8affine_epi64_epi8(x, A, b)) -#endif -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_gf2p8affine_epi64_epi8 - #define _mm256_maskz_gf2p8affine_epi64_epi8(k, x, A, b) simde_mm256_maskz_gf2p8affine_epi64_epi8(k, x, A, b) -#endif - -#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_maskz_gf2p8affine_epi64_epi8(k, x, A, b) _mm512_maskz_gf2p8affine_epi64_epi8(k, x, A, b) -#else - #define simde_mm512_maskz_gf2p8affine_epi64_epi8(k, x, A, b) simde_mm512_maskz_mov_epi8(k, simde_mm512_gf2p8affine_epi64_epi8(x, A, b)) -#endif -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_gf2p8affine_epi64_epi8 - #define _mm512_maskz_gf2p8affine_epi64_epi8(k, x, A, b) simde_mm512_maskz_gf2p8affine_epi64_epi8(k, x, A, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_gf2p8affineinv_epi64_epi8 (simde__m128i x, simde__m128i A, int b) - SIMDE_REQUIRE_CONSTANT_RANGE(b, 0, 255) { - return simde_mm_xor_si128(simde_x_mm_gf2p8matrix_multiply_inverse_epi64_epi8(x, A), simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, b))); -} -#if defined(SIMDE_X86_GFNI_NATIVE) - #define 
simde_mm_gf2p8affineinv_epi64_epi8(x, A, b) _mm_gf2p8affineinv_epi64_epi8(x, A, b) -#endif -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) - #undef _mm_gf2p8affineinv_epi64_epi8 - #define _mm_gf2p8affineinv_epi64_epi8(x, A, b) simde_mm_gf2p8affineinv_epi64_epi8(x, A, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_gf2p8affineinv_epi64_epi8 (simde__m256i x, simde__m256i A, int b) - SIMDE_REQUIRE_CONSTANT_RANGE(b, 0, 255) { - return simde_mm256_xor_si256(simde_x_mm256_gf2p8matrix_multiply_inverse_epi64_epi8(x, A), simde_mm256_set1_epi8(HEDLEY_STATIC_CAST(int8_t, b))); -} -#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - #define simde_mm256_gf2p8affineinv_epi64_epi8(x, A, b) _mm256_gf2p8affineinv_epi64_epi8(x, A, b) -#endif -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) - #undef _mm256_gf2p8affineinv_epi64_epi8 - #define _mm256_gf2p8affineinv_epi64_epi8(x, A, b) simde_mm256_gf2p8affineinv_epi64_epi8(x, A, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_gf2p8affineinv_epi64_epi8 (simde__m512i x, simde__m512i A, int b) - SIMDE_REQUIRE_CONSTANT_RANGE(b, 0, 255) { - return simde_mm512_xor_si512(simde_x_mm512_gf2p8matrix_multiply_inverse_epi64_epi8(x, A), simde_mm512_set1_epi8(HEDLEY_STATIC_CAST(int8_t, b))); -} -#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_gf2p8affineinv_epi64_epi8(x, A, b) _mm512_gf2p8affineinv_epi64_epi8(x, A, b) -#endif -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) - #undef _mm512_gf2p8affineinv_epi64_epi8 - #define _mm512_gf2p8affineinv_epi64_epi8(x, A, b) simde_mm512_gf2p8affineinv_epi64_epi8(x, A, b) -#endif - -#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) _mm_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) -#else - #define simde_mm_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) simde_mm_mask_mov_epi8(src, k, simde_mm_gf2p8affineinv_epi64_epi8(x, A, b)) -#endif -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_gf2p8affineinv_epi64_epi8 - #define _mm_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) simde_mm_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) -#endif - -#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) _mm256_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) -#else - #define simde_mm256_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) simde_mm256_mask_mov_epi8(src, k, simde_mm256_gf2p8affineinv_epi64_epi8(x, A, b)) -#endif -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_gf2p8affineinv_epi64_epi8 - #define _mm256_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) simde_mm256_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) -#endif - -#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) _mm512_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) -#else - #define simde_mm512_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) simde_mm512_mask_mov_epi8(src, k, simde_mm512_gf2p8affineinv_epi64_epi8(x, A, b)) -#endif -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_gf2p8affineinv_epi64_epi8 - #define _mm512_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) simde_mm512_mask_gf2p8affineinv_epi64_epi8(src, k, x, A, b) -#endif - -#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define 
simde_mm_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) _mm_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) -#else - #define simde_mm_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) simde_mm_maskz_mov_epi8(k, simde_mm_gf2p8affineinv_epi64_epi8(x, A, b)) -#endif -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_gf2p8affineinv_epi64_epi8 - #define _mm_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) simde_mm_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) -#endif - -#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - #define simde_mm256_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) _mm256_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) -#else - #define simde_mm256_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) simde_mm256_maskz_mov_epi8(k, simde_mm256_gf2p8affineinv_epi64_epi8(x, A, b)) -#endif -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_gf2p8affineinv_epi64_epi8 - #define _mm256_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) simde_mm256_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) -#endif - -#if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - #define simde_mm512_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) _mm512_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) -#else - #define simde_mm512_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) simde_mm512_maskz_mov_epi8(k, simde_mm512_gf2p8affineinv_epi64_epi8(x, A, b)) -#endif -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_gf2p8affineinv_epi64_epi8 - #define _mm512_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) simde_mm512_maskz_gf2p8affineinv_epi64_epi8(k, x, A, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i simde_mm_gf2p8mul_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_GFNI_NATIVE) && (defined(SIMDE_X86_AVX512VL_NATIVE) || !defined(SIMDE_X86_AVX512F_NATIVE)) - return _mm_gf2p8mul_epi8(a, b); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const poly8x16_t pa = vreinterpretq_p8_u8(simde__m128i_to_neon_u8(a)); - const poly8x16_t pb = vreinterpretq_p8_u8(simde__m128i_to_neon_u8(b)); - const uint8x16_t lo = vreinterpretq_u8_p16(vmull_p8(vget_low_p8(pa), vget_low_p8(pb))); - #if defined (SIMDE_ARM_NEON_A64V8_NATIVE) - uint8x16_t hi = vreinterpretq_u8_p16(vmull_high_p8(pa, pb)); - #else - uint8x16_t hi = vreinterpretq_u8_p16(vmull_p8(vget_high_p8(pa), vget_high_p8(pb))); - #endif - uint8x16x2_t hilo = vuzpq_u8(lo, hi); - uint8x16_t r = hilo.val[0]; - hi = hilo.val[1]; - const uint8x16_t idxHi = vshrq_n_u8(hi, 4); - const uint8x16_t idxLo = vandq_u8(hi, vdupq_n_u8(0xF)); - - #if defined (SIMDE_ARM_NEON_A64V8_NATIVE) - static const uint8_t reduceLutHiData[] = { - 0x00, 0xab, 0x4d, 0xe6, 0x9a, 0x31, 0xd7, 0x7c, - 0x2f, 0x84, 0x62, 0xc9, 0xb5, 0x1e, 0xf8, 0x53 - }; - static const uint8_t reduceLutLoData[] = { - 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, - 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 - }; - const uint8x16_t reduceLutHi = vld1q_u8(reduceLutHiData); - const uint8x16_t reduceLutLo = vld1q_u8(reduceLutLoData); - r = veorq_u8(r, vqtbl1q_u8(reduceLutHi, idxHi)); - r = veorq_u8(r, vqtbl1q_u8(reduceLutLo, idxLo)); - #else - static const uint8_t reduceLutHiData[] = { - 0x00, 0x2f, - 0xab, 0x84, - 0x4d, 0x62, - 0xe6, 0xc9, - 0x9a, 0xb5, - 0x31, 0x1e, - 0xd7, 0xf8, - 0x7c, 0x53 - }; - static const uint8_t reduceLutLoData[] = { - 0x00, 0xd8, - 0x1b, 0xc3, - 0x36, 0xee, - 0x2d, 0xf5, - 0x6c, 0xb4, - 0x77, 0xaf, - 0x5a, 0x82, - 0x41, 0x99 - }; - const uint8x8x2_t reduceLutHi = vld2_u8(reduceLutHiData); - const uint8x8x2_t reduceLutLo = 
vld2_u8(reduceLutLoData); - r = veorq_u8(r, vcombine_u8(vtbl2_u8(reduceLutHi, vget_low_u8(idxHi)), vtbl2_u8(reduceLutHi, vget_high_u8(idxHi)))); - r = veorq_u8(r, vcombine_u8(vtbl2_u8(reduceLutLo, vget_low_u8(idxLo)), vtbl2_u8(reduceLutLo, vget_high_u8(idxLo)))); - #endif - return simde__m128i_from_neon_u8(r); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) x, y, lo, hi; - SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) even, odd, mask0x00FF; - x = simde__m128i_to_altivec_u8(a); - y = simde__m128i_to_altivec_u8(b); - mask0x00FF = vec_splats(HEDLEY_STATIC_CAST(unsigned short, 0x00FF)); - lo = y & HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), mask0x00FF); - hi = y ^ lo; - even = vec_gfmsum(x, lo); - odd = vec_gfmsum(x, hi); - lo = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_sel(vec_rli(odd, 8), even, mask0x00FF)); - hi = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_sel(odd, vec_rli(even, 8), mask0x00FF)); - const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) reduceLutHi = {0x00, 0xab, 0x4d, 0xe6, 0x9a, 0x31, 0xd7, 0x7c, 0x2f, 0x84, 0x62, 0xc9, 0xb5, 0x1e, 0xf8, 0x53}; - const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) reduceLutLo = {0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99}; - lo = lo ^ vec_perm(reduceLutHi, reduceLutHi, vec_rli(hi, 4)); - lo = lo ^ vec_perm(reduceLutLo, reduceLutLo, hi); - return simde__m128i_from_altivec_u8(lo); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) x, y, r, t, m; - x = simde__m128i_to_altivec_u8(a); - y = simde__m128i_to_altivec_u8(b); - - const SIMDE_POWER_ALTIVEC_VECTOR(signed char) zero = vec_splat_s8(0); - - m = vec_splat_u8(0x01); - - const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) fgp = vec_splats(HEDLEY_STATIC_CAST(unsigned char, SIMDE_X86_GFNI_FGP)); - t = vec_and(y, m); - t = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_cmpeq(t, m)); - r = vec_and(x, t); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 0 ; i < 7 ; i++) { - t = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_cmplt(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), x), zero)); - x = vec_add(x, x); - t = vec_and(fgp, t); - x = vec_xor(x, t); - m = vec_add(m, m); - t = vec_and(y, m); - t = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_cmpeq(t, m)); - t = vec_and(x, t); - r = vec_xor(r, t); - } - - return simde__m128i_from_altivec_u8(r); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t x, y, r, t, m; - x = simde__m128i_to_wasm_v128(a); - y = simde__m128i_to_wasm_v128(b); - - m = wasm_i8x16_splat(0x01); - - const v128_t fgp = wasm_i8x16_splat(SIMDE_X86_GFNI_FGP); - - t = wasm_v128_and(y, m); - t = wasm_i8x16_eq(t, m); - r = wasm_v128_and(x, t); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 0 ; i < 7 ; i++) { - t = wasm_i8x16_shr(x, 7); - x = wasm_i8x16_add(x, x); - t = wasm_v128_and(fgp, t); - x = wasm_v128_xor(x, t); - m = wasm_i8x16_add(m, m); - t = wasm_v128_and(y, m); - t = wasm_i8x16_eq(t, m); - t = wasm_v128_and(x, t); - r = wasm_v128_xor(r, t); - } - - return simde__m128i_from_wasm_v128(r); - #elif defined(SIMDE_X86_AVX512BW_NATIVE) - simde__m512i r4, t4, u4; - simde__mmask64 ma, mb; - - simde__m512i a4 = simde_mm512_broadcast_i32x4(a); - const simde__m512i zero = simde_mm512_setzero_si512(); - simde__mmask16 m8 = 
simde_mm512_cmpeq_epi32_mask(zero, zero); - - const simde__m512i b4 = simde_mm512_broadcast_i32x4(b); - - simde__m512i bits = simde_mm512_set_epi64(0x4040404040404040, - 0x4040404040404040, - 0x1010101010101010, - 0x1010101010101010, - 0x0404040404040404, - 0x0404040404040404, - 0x0101010101010101, - 0x0101010101010101); - - const simde__m512i fgp = simde_mm512_set1_epi8(SIMDE_X86_GFNI_FGP); - - for (int i = 0 ; i < 3 ; i++) { - m8 = simde_kshiftli_mask16(m8, 4); - - ma = simde_mm512_cmplt_epi8_mask(a4, zero); - u4 = simde_mm512_add_epi8(a4, a4); - t4 = simde_mm512_maskz_mov_epi8(ma, fgp); - u4 = simde_mm512_xor_epi32(u4, t4); - - ma = simde_mm512_cmplt_epi8_mask(u4, zero); - u4 = simde_mm512_add_epi8(u4, u4); - t4 = simde_mm512_maskz_mov_epi8(ma, fgp); - a4 = simde_mm512_mask_xor_epi32(a4, m8, u4, t4); - } - - mb = simde_mm512_test_epi8_mask(b4, bits); - bits = simde_mm512_add_epi8(bits, bits); - ma = simde_mm512_cmplt_epi8_mask(a4, zero); - r4 = simde_mm512_maskz_mov_epi8(mb, a4); - mb = simde_mm512_test_epi8_mask(b4, bits); - a4 = simde_mm512_add_epi8(a4, a4); - t4 = simde_mm512_maskz_mov_epi8(ma, fgp); - a4 = simde_mm512_xor_si512(a4, t4); - t4 = simde_mm512_maskz_mov_epi8(mb, a4); - r4 = simde_mm512_xor_si512(r4, t4); - - r4 = simde_mm512_xor_si512(r4, simde_mm512_shuffle_i32x4(r4, r4, (1 << 6) + (0 << 4) + (3 << 2) + 2)); - r4 = simde_mm512_xor_si512(r4, simde_mm512_shuffle_i32x4(r4, r4, (0 << 6) + (3 << 4) + (2 << 2) + 1)); - - return simde_mm512_extracti32x4_epi32(r4, 0); - #elif defined(SIMDE_X86_AVX2_NATIVE) - simde__m256i r2, t2; - simde__m256i a2 = simde_mm256_broadcastsi128_si256(a); - const simde__m256i zero = simde_mm256_setzero_si256(); - const simde__m256i fgp = simde_mm256_set1_epi8(SIMDE_X86_GFNI_FGP); - const simde__m256i ones = simde_mm256_set1_epi8(0x01); - simde__m256i b2 = simde_mm256_set_m128i(simde_mm_srli_epi64(b, 4), b); - - for (int i = 0 ; i < 4 ; i++) { - t2 = simde_mm256_cmpgt_epi8(zero, a2); - t2 = simde_mm256_and_si256(fgp, t2); - a2 = simde_mm256_add_epi8(a2, a2); - a2 = simde_mm256_xor_si256(a2, t2); - } - - a2 = simde_mm256_inserti128_si256(a2, a, 0); - - t2 = simde_mm256_and_si256(b2, ones); - t2 = simde_mm256_cmpeq_epi8(t2, ones); - r2 = simde_mm256_and_si256(a2, t2); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 0 ; i < 3 ; i++) { - t2 = simde_mm256_cmpgt_epi8(zero, a2); - t2 = simde_mm256_and_si256(fgp, t2); - a2 = simde_mm256_add_epi8(a2, a2); - a2 = simde_mm256_xor_si256(a2, t2); - b2 = simde_mm256_srli_epi64(b2, 1); - t2 = simde_mm256_and_si256(b2, ones); - t2 = simde_mm256_cmpeq_epi8(t2, ones); - t2 = simde_mm256_and_si256(a2, t2); - r2 = simde_mm256_xor_si256(r2, t2); - } - - return simde_mm_xor_si128(simde_mm256_extracti128_si256(r2, 1), - simde_mm256_extracti128_si256(r2, 0)); - #elif defined(SIMDE_X86_SSE2_NATIVE) - simde__m128i r, t; - const simde__m128i zero = simde_mm_setzero_si128(); - const simde__m128i ones = simde_mm_set1_epi8(0x01); - - const simde__m128i fgp = simde_mm_set1_epi8(SIMDE_X86_GFNI_FGP); - - t = simde_mm_and_si128(b, ones); - t = simde_mm_cmpeq_epi8(t, ones); - r = simde_mm_and_si128(a, t); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 0 ; i < 7 ; i++) { - t = simde_mm_cmpgt_epi8(zero, a); - t = simde_mm_and_si128(fgp, t); - a = simde_mm_add_epi8(a, a); - a = simde_mm_xor_si128(a, t); - b = simde_mm_srli_epi64(b, 1); - t = simde_mm_and_si128(b, ones); - t = simde_mm_cmpeq_epi8(t, ones); - t = simde_mm_and_si128(a, t); - r = simde_mm_xor_si128(r, t); - } - - 
return r; - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - const uint8_t fgp = SIMDE_X86_GFNI_FGP; - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = 0; - while ((a_.u8[i] != 0) && (b_.u8[i] != 0)) { - if (b_.u8[i] & 1) - r_.u8[i] ^= a_.u8[i]; - - if (a_.u8[i] & 0x80) - a_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, (a_.u8[i] << 1) ^ fgp); - else - a_.u8[i] <<= 1; - - b_.u8[i] >>= 1; - } - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_gf2p8mul_epi8 - #define _mm_gf2p8mul_epi8(a, b) simde_mm_gf2p8mul_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_gf2p8mul_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_GFNI_NATIVE) && (defined(SIMDE_X86_AVX512VL_NATIVE) || (defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_X86_AVX512F_NATIVE))) - return _mm256_gf2p8mul_epi8(a, b); - #elif !defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - simde__mmask64 ma, mb; - simde__m512i r, t, s; - simde__m512i a2 = simde_mm512_broadcast_i64x4(a); - const simde__m512i zero = simde_mm512_setzero_si512(); - - const simde__m512i fgp = simde_mm512_set1_epi8(SIMDE_X86_GFNI_FGP); - - s = simde_mm512_set1_epi8(0x01); - - for (int i = 0 ; i < 4 ; i++) { - ma = simde_mm512_cmplt_epi8_mask(a2, zero); - a2 = simde_mm512_add_epi8(a2, a2); - t = simde_mm512_xor_si512(a2, fgp); - a2 = simde_mm512_mask_mov_epi8(a2, ma, t); - } - - simde__m512i b2 = simde_mm512_inserti64x4(zero, simde_mm256_srli_epi64(b, 4), 1); - b2 = simde_mm512_inserti64x4(b2, b, 0); - a2 = simde_mm512_inserti64x4(a2, a, 0); - - mb = simde_mm512_test_epi8_mask(b2, s); - r = simde_mm512_maskz_mov_epi8(mb, a2); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 0 ; i < 3 ; i++) { - ma = simde_mm512_cmplt_epi8_mask(a2, zero); - s = simde_mm512_add_epi8(s, s); - mb = simde_mm512_test_epi8_mask(b2, s); - a2 = simde_mm512_add_epi8(a2, a2); - t = simde_mm512_maskz_mov_epi8(ma, fgp); - a2 = simde_mm512_xor_si512(a2, t); - t = simde_mm512_maskz_mov_epi8(mb, a2); - r = simde_mm512_xor_si512(r, t); - } - - return simde_mm256_xor_si256(simde_mm512_extracti64x4_epi64(r, 1), - simde_mm512_extracti64x4_epi64(r, 0)); - #elif !defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX2_NATIVE) - simde__m256i r, t; - const simde__m256i zero = simde_mm256_setzero_si256(); - const simde__m256i ones = simde_mm256_set1_epi8(0x01); - - const simde__m256i fgp = simde_mm256_set1_epi8(SIMDE_X86_GFNI_FGP); - - t = simde_mm256_and_si256(b, ones); - t = simde_mm256_cmpeq_epi8(t, ones); - r = simde_mm256_and_si256(a, t); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 0 ; i < 7 ; i++) { - t = simde_mm256_cmpgt_epi8(zero, a); - t = simde_mm256_and_si256(fgp, t); - a = simde_mm256_add_epi8(a, a); - a = simde_mm256_xor_si256(a, t); - b = simde_mm256_srli_epi64(b, 1); - t = simde_mm256_and_si256(b, ones); - t = simde_mm256_cmpeq_epi8(t, ones); - t = simde_mm256_and_si256(a, t); - r = simde_mm256_xor_si256(r, t); - } - - return r; - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = 
simde_mm_gf2p8mul_epi8(a_.m128i[i], b_.m128i[i]); - } - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX_ENABLE_NATIVE_ALIASES) - #undef _mm256_gf2p8mul_epi8 - #define _mm256_gf2p8mul_epi8(a, b) simde_mm256_gf2p8mul_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_gf2p8mul_epi8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_gf2p8mul_epi8(a, b); - #elif !defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) - simde__m512i r, s, t; - simde__mmask64 ma, mb; - const simde__m512i zero = simde_mm512_setzero_si512(); - - const simde__m512i fgp = simde_mm512_set1_epi8(SIMDE_X86_GFNI_FGP); - - s = simde_mm512_set1_epi8(0x01); - - mb = simde_mm512_test_epi8_mask(b, s); - r = simde_mm512_maskz_mov_epi8(mb, a); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (int i = 0 ; i < 7 ; i++) { - ma = simde_mm512_cmplt_epi8_mask(a, zero); - s = simde_mm512_add_epi8(s, s); - mb = simde_mm512_test_epi8_mask(b, s); - a = simde_mm512_add_epi8(a, a); - t = simde_mm512_maskz_mov_epi8(ma, fgp); - a = simde_mm512_xor_si512(a, t); - t = simde_mm512_maskz_mov_epi8(mb, a); - r = simde_mm512_xor_si512(r, t); - } - - return r; - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if !defined(__INTEL_COMPILER) - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_gf2p8mul_epi8(a_.m128i[i], b_.m128i[i]); - } - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_gf2p8mul_epi8 - #define _mm512_gf2p8mul_epi8(a, b) simde_mm512_gf2p8mul_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mask_gf2p8mul_epi8 (simde__m128i src, simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_mask_gf2p8mul_epi8(src, k, a, b); - #else - return simde_mm_mask_mov_epi8(src, k, simde_mm_gf2p8mul_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_mask_gf2p8mul_epi8 - #define _mm_mask_gf2p8mul_epi8(src, k, a, b) simde_mm_mask_gf2p8mul_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_mask_gf2p8mul_epi8 (simde__m256i src, simde__mmask32 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_mask_gf2p8mul_epi8(src, k, a, b); - #else - return simde_mm256_mask_mov_epi8(src, k, simde_mm256_gf2p8mul_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_mask_gf2p8mul_epi8 - #define _mm256_mask_gf2p8mul_epi8(src, k, a, b) simde_mm256_mask_gf2p8mul_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_gf2p8mul_epi8 (simde__m512i src, simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_gf2p8mul_epi8(src, k, a, b); - #else - return simde_mm512_mask_mov_epi8(src, k, simde_mm512_gf2p8mul_epi8(a, b)); - #endif -} -#if 
defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_gf2p8mul_epi8 - #define _mm512_mask_gf2p8mul_epi8(src, k, a, b) simde_mm512_mask_gf2p8mul_epi8(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maskz_gf2p8mul_epi8 (simde__mmask16 k, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_maskz_gf2p8mul_epi8(k, a, b); - #else - return simde_mm_maskz_mov_epi8(k, simde_mm_gf2p8mul_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm_maskz_gf2p8mul_epi8 - #define _mm_maskz_gf2p8mul_epi8(k, a, b) simde_mm_maskz_gf2p8mul_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_maskz_gf2p8mul_epi8 (simde__mmask32 k, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_maskz_gf2p8mul_epi8(k, a, b); - #else - return simde_mm256_maskz_mov_epi8(k, simde_mm256_gf2p8mul_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) - #undef _mm256_maskz_gf2p8mul_epi8 - #define _mm256_maskz_gf2p8mul_epi8(k, a, b) simde_mm256_maskz_gf2p8mul_epi8(k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_maskz_gf2p8mul_epi8 (simde__mmask64 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_GFNI_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_maskz_gf2p8mul_epi8(k, a, b); - #else - return simde_mm512_maskz_mov_epi8(k, simde_mm512_gf2p8mul_epi8(a, b)); - #endif -} -#if defined(SIMDE_X86_GFNI_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512F_ENABLE_NATIVE_ALIASES) - #undef _mm512_maskz_gf2p8mul_epi8 - #define _mm512_maskz_gf2p8mul_epi8(k, a, b) simde_mm512_maskz_gf2p8mul_epi8(k, a, b) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_GFNI_H) */ diff --git a/ffi-deps/simde/simde/x86/mmx.h b/ffi-deps/simde/simde/x86/mmx.h deleted file mode 100644 index e294af8..0000000 --- a/ffi-deps/simde/simde/x86/mmx.h +++ /dev/null @@ -1,2398 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2017-2020 Evan Nemerson - */ - -#if !defined(SIMDE_X86_MMX_H) -#define SIMDE_X86_MMX_H - -#include "../simde-common.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS - -#if defined(SIMDE_X86_MMX_NATIVE) - #define SIMDE_X86_MMX_USE_NATIVE_TYPE -#elif defined(SIMDE_X86_SSE_NATIVE) - #define SIMDE_X86_MMX_USE_NATIVE_TYPE -#endif - -#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE) - #include <mmintrin.h> -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #include <arm_neon.h> -#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - #include <loongson-mmiintrin.h> -#endif - -#include <stdint.h> -#include <limits.h> - -SIMDE_BEGIN_DECLS_ - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_ALIGN_TO_8 int8_t i8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_8 int16_t i16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_8 int32_t i32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_8 int64_t i64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_8 uint8_t u8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_8 uint16_t u16 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_8 uint32_t u32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_8 uint64_t u64 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_8 simde_float32 f32 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_8 int_fast32_t i32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_8 uint_fast32_t u32f SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_8 int8_t i8[8]; - SIMDE_ALIGN_TO_8 int16_t i16[4]; - SIMDE_ALIGN_TO_8 int32_t i32[2]; - SIMDE_ALIGN_TO_8 int64_t i64[1]; - SIMDE_ALIGN_TO_8 uint8_t u8[8]; - SIMDE_ALIGN_TO_8 uint16_t u16[4]; - SIMDE_ALIGN_TO_8 uint32_t u32[2]; - SIMDE_ALIGN_TO_8 uint64_t u64[1]; - SIMDE_ALIGN_TO_8 simde_float32 f32[2]; - SIMDE_ALIGN_TO_8 int_fast32_t i32f[8 / sizeof(int_fast32_t)]; - SIMDE_ALIGN_TO_8 uint_fast32_t u32f[8 / sizeof(uint_fast32_t)]; - #endif - - #if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE) - __m64 n; - #endif - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int8x8_t neon_i8; - int16x4_t neon_i16; - int32x2_t neon_i32; - int64x1_t neon_i64; - uint8x8_t neon_u8; - uint16x4_t neon_u16; - uint32x2_t neon_u32; - uint64x1_t neon_u64; - float32x2_t neon_f32; - #endif - #if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - int8x8_t mmi_i8; - int16x4_t mmi_i16; - int32x2_t mmi_i32; - int64_t mmi_i64; - uint8x8_t mmi_u8; - uint16x4_t mmi_u16; - uint32x2_t mmi_u32; - uint64_t mmi_u64; - #endif -} simde__m64_private; - -#if defined(SIMDE_X86_MMX_USE_NATIVE_TYPE) - typedef __m64 simde__m64; -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - typedef int32x2_t simde__m64; -#elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - typedef int32x2_t simde__m64; -#elif defined(SIMDE_VECTOR_SUBSCRIPT) - typedef int32_t simde__m64 SIMDE_ALIGN_TO_8 SIMDE_VECTOR(8) SIMDE_MAY_ALIAS; -#else - typedef simde__m64_private simde__m64; -#endif - -#if !defined(SIMDE_X86_MMX_USE_NATIVE_TYPE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) - #define SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES - typedef simde__m64 __m64; -#endif - -HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64), "__m64 size incorrect"); -HEDLEY_STATIC_ASSERT(8 == sizeof(simde__m64_private), "__m64 size incorrect"); -#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF) -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64) == 8, "simde__m64 is not 8-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m64_private) == 8, "simde__m64_private is not 8-byte aligned"); -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde__m64_from_private(simde__m64_private v) { - simde__m64 r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} -
-SIMDE_FUNCTION_ATTRIBUTES -simde__m64_private -simde__m64_to_private(simde__m64 v) { - simde__m64_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -#define SIMDE_X86_GENERATE_CONVERSION_FUNCTION(simde_type, source_type, isax, fragment) \ - SIMDE_FUNCTION_ATTRIBUTES \ - simde__##simde_type \ - simde__##simde_type##_from_##isax##_##fragment(source_type value) { \ - simde__##simde_type##_private r_; \ - r_.isax##_##fragment = value; \ - return simde__##simde_type##_from_private(r_); \ - } \ - \ - SIMDE_FUNCTION_ATTRIBUTES \ - source_type \ - simde__##simde_type##_to_##isax##_##fragment(simde__##simde_type value) { \ - simde__##simde_type##_private r_ = simde__##simde_type##_to_private(value); \ - return r_.isax##_##fragment; \ - } - -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int8x8_t, neon, i8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int16x4_t, neon, i16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int32x2_t, neon, i32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int64x1_t, neon, i64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint8x8_t, neon, u8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint16x4_t, neon, u16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint32x2_t, neon, u32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint64x1_t, neon, u64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, float32x2_t, neon, f32) -#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ - -#if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int8x8_t, mmi, i8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int16x4_t, mmi, i16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int32x2_t, mmi, i32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, int64_t, mmi, i64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint8x8_t, mmi, u8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint16x4_t, mmi, u16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint32x2_t, mmi, u32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m64, uint64_t, mmi, u64) -#endif /* defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) */ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_add_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_add_pi8(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vadd_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i8 = paddb_s(a_.mmi_i8, b_.mmi_i8); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = a_.i8 + b_.i8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] + b_.i8[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_paddb(a, b) simde_mm_add_pi8(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_add_pi8(a, b) simde_mm_add_pi8(a, b) -# define _m_paddb(a, b) simde_m_paddb(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_add_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_add_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vadd_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = paddh_s(a_.mmi_i16, b_.mmi_i16); - #elif 
defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 + b_.i16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] + b_.i16[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_paddw(a, b) simde_mm_add_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_add_pi16(a, b) simde_mm_add_pi16(a, b) -# define _m_paddw(a, b) simde_mm_add_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_add_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_add_pi32(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vadd_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = paddw_s(a_.mmi_i32, b_.mmi_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 + b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] + b_.i32[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_paddd(a, b) simde_mm_add_pi32(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_add_pi32(a, b) simde_mm_add_pi32(a, b) -# define _m_paddd(a, b) simde_mm_add_pi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_adds_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_adds_pi8(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vqadd_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i8 = paddsb(a_.mmi_i8, b_.mmi_i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - if ((((b_.i8[i]) > 0) && ((a_.i8[i]) > (INT8_MAX - (b_.i8[i]))))) { - r_.i8[i] = INT8_MAX; - } else if ((((b_.i8[i]) < 0) && ((a_.i8[i]) < (INT8_MIN - (b_.i8[i]))))) { - r_.i8[i] = INT8_MIN; - } else { - r_.i8[i] = (a_.i8[i]) + (b_.i8[i]); - } - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_paddsb(a, b) simde_mm_adds_pi8(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_adds_pi8(a, b) simde_mm_adds_pi8(a, b) -# define _m_paddsb(a, b) simde_mm_adds_pi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_adds_pu8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_adds_pu8(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vqadd_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_u8 = paddusb(a_.mmi_u8, b_.mmi_u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - const uint_fast16_t x = HEDLEY_STATIC_CAST(uint_fast16_t, a_.u8[i]) + HEDLEY_STATIC_CAST(uint_fast16_t, b_.u8[i]); - if (x > UINT8_MAX) - r_.u8[i] = UINT8_MAX; - else - r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_paddusb(a, b) simde_mm_adds_pu8(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_adds_pu8(a, b) simde_mm_adds_pu8(a, b) -# define _m_paddusb(a, b) 
simde_mm_adds_pu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_adds_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_adds_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vqadd_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = paddsh(a_.mmi_i16, b_.mmi_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - if ((((b_.i16[i]) > 0) && ((a_.i16[i]) > (INT16_MAX - (b_.i16[i]))))) { - r_.i16[i] = INT16_MAX; - } else if ((((b_.i16[i]) < 0) && ((a_.i16[i]) < (SHRT_MIN - (b_.i16[i]))))) { - r_.i16[i] = SHRT_MIN; - } else { - r_.i16[i] = (a_.i16[i]) + (b_.i16[i]); - } - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_paddsw(a, b) simde_mm_adds_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_adds_pi16(a, b) simde_mm_adds_pi16(a, b) -# define _m_paddsw(a, b) simde_mm_adds_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_adds_pu16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_adds_pu16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vqadd_u16(a_.neon_u16, b_.neon_u16); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_u16 = paddush(a_.mmi_u16, b_.mmi_u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - const uint32_t x = a_.u16[i] + b_.u16[i]; - if (x > UINT16_MAX) - r_.u16[i] = UINT16_MAX; - else - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_paddusw(a, b) simde_mm_adds_pu16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_adds_pu16(a, b) simde_mm_adds_pu16(a, b) -# define _m_paddusw(a, b) simde_mm_adds_pu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_and_si64 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_and_si64(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vand_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 & b_.i64; - #else - r_.i64[0] = a_.i64[0] & b_.i64[0]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pand(a, b) simde_mm_and_si64(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_and_si64(a, b) simde_mm_and_si64(a, b) -# define _m_pand(a, b) simde_mm_and_si64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_andnot_si64 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_andnot_si64(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vbic_s32(b_.neon_i32, a_.neon_i32); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = pandn_sw(a_.mmi_i32, b_.mmi_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = ~a_.i32f & b_.i32f; - #else - r_.u64[0] = 
(~(a_.u64[0])) & (b_.u64[0]); - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pandn(a, b) simde_mm_andnot_si64(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_andnot_si64(a, b) simde_mm_andnot_si64(a, b) -# define _m_pandn(a, b) simde_mm_andnot_si64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cmpeq_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_cmpeq_pi8(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vceq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i8 = pcmpeqb_s(a_.mmi_i8, b_.mmi_i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_cmpeq_pi8(a, b) simde_mm_cmpeq_pi8(a, b) -# define _m_pcmpeqb(a, b) simde_mm_cmpeq_pi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cmpeq_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_cmpeq_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vceq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = pcmpeqh_s(a_.mmi_i16, b_.mmi_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_cmpeq_pi16(a, b) simde_mm_cmpeq_pi16(a, b) -# define _m_pcmpeqw(a, b) simde_mm_cmpeq_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cmpeq_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_cmpeq_pi32(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vceq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = pcmpeqw_s(a_.mmi_i32, b_.mmi_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_cmpeq_pi32(a, b) simde_mm_cmpeq_pi32(a, b) -# define _m_pcmpeqd(a, b) simde_mm_cmpeq_pi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cmpgt_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_cmpgt_pi8(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vcgt_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i8 = pcmpgtb_s(a_.mmi_i8, b_.mmi_i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_cmpgt_pi8(a, b) simde_mm_cmpgt_pi8(a, b) -# define _m_pcmpgtb(a, b) simde_mm_cmpgt_pi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cmpgt_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_cmpgt_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vcgt_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = pcmpgth_s(a_.mmi_i16, b_.mmi_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_cmpgt_pi16(a, b) simde_mm_cmpgt_pi16(a, b) -# define _m_pcmpgtw(a, b) simde_mm_cmpgt_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cmpgt_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_cmpgt_pi32(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcgt_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = pcmpgtw_s(a_.mmi_i32, b_.mmi_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_cmpgt_pi32(a, b) simde_mm_cmpgt_pi32(a, b) -# define _m_pcmpgtd(a, b) simde_mm_cmpgt_pi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int64_t -simde_mm_cvtm64_si64 (simde__m64 a) { - #if defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI) - return _mm_cvtm64_si64(a); - #else - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - HEDLEY_DIAGNOSTIC_PUSH - #if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0) - #pragma clang diagnostic ignored "-Wvector-conversion" - #endif - return vget_lane_s64(a_.neon_i64, 0); - HEDLEY_DIAGNOSTIC_POP - #else - return a_.i64[0]; - #endif - #endif -} -#define simde_m_to_int64(a) simde_mm_cvtm64_si64(a) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) -# define _mm_cvtm64_si64(a) simde_mm_cvtm64_si64(a) -# define _m_to_int64(a) simde_mm_cvtm64_si64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cvtsi32_si64 (int32_t a) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtsi32_si64(a); - #else - simde__m64_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const int32_t av[2] = { a, 0 }; - r_.neon_i32 = vld1_s32(av); - #else - r_.i32[0] = a; - r_.i32[1] = 0; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_from_int(a) simde_mm_cvtsi32_si64(a) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_cvtsi32_si64(a) simde_mm_cvtsi32_si64(a) -# define _m_from_int(a) simde_mm_cvtsi32_si64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cvtsi64_m64 (int64_t a) { - #if defined(SIMDE_X86_MMX_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(__PGI) - return _mm_cvtsi64_m64(a); - #else - simde__m64_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vld1_s64(&a); - #else - r_.i64[0] = a; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_from_int64(a) simde_mm_cvtsi64_m64(a) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) -# define _mm_cvtsi64_m64(a) simde_mm_cvtsi64_m64(a) -# define _m_from_int64(a) simde_mm_cvtsi64_m64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_cvtsi64_si32 (simde__m64 a) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtsi64_si32(a); - #else - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - HEDLEY_DIAGNOSTIC_PUSH - #if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0) - #pragma clang diagnostic ignored "-Wvector-conversion" - #endif - return vget_lane_s32(a_.neon_i32, 0); - HEDLEY_DIAGNOSTIC_POP - #else - return a_.i32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_cvtsi64_si32(a) simde_mm_cvtsi64_si32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_empty (void) { - #if defined(SIMDE_X86_MMX_NATIVE) - _mm_empty(); - #else - /* noop */ - #endif -} -#define simde_m_empty() simde_mm_empty() -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_empty() simde_mm_empty() -# define _m_empty() simde_mm_empty() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_madd_pi16 (simde__m64 a, simde__m64 b) { - #if 
defined(SIMDE_X86_MMX_NATIVE) - return _mm_madd_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x4_t i1 = vmull_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i32 = vpadd_s32(vget_low_s32(i1), vget_high_s32(i1)); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = pmaddhw(a_.mmi_i16, b_.mmi_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i += 2) { - r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pmaddwd(a, b) simde_mm_madd_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_madd_pi16(a, b) simde_mm_madd_pi16(a, b) -# define _m_pmaddwd(a, b) simde_mm_madd_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_mulhi_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_mulhi_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16); - const uint32x4_t t2 = vshrq_n_u32(vreinterpretq_u32_s32(t1), 16); - const uint16x4_t t3 = vmovn_u32(t2); - r_.neon_u16 = t3; - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = pmulhh(a_.mmi_i16, b_.mmi_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((a_.i16[i] * b_.i16[i]) >> 16)); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_mulhi_pi16(a, b) simde_mm_mulhi_pi16(a, b) -# define _m_pmulhw(a, b) simde_mm_mulhi_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_mullo_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_mullo_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const int32x4_t t1 = vmull_s16(a_.neon_i16, b_.neon_i16); - const uint16x4_t t2 = vmovn_u32(vreinterpretq_u32_s32(t1)); - r_.neon_u16 = t2; - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = pmullh(a_.mmi_i16, b_.mmi_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, ((a_.i16[i] * b_.i16[i]) & 0xffff)); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pmullw(a, b) simde_mm_mullo_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_mullo_pi16(a, b) simde_mm_mullo_pi16(a, b) -# define _m_pmullw(a, b) simde_mm_mullo_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_or_si64 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_or_si64(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vorr_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 | b_.i64; - #else - r_.i64[0] = a_.i64[0] | b_.i64[0]; 
- #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_por(a, b) simde_mm_or_si64(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_or_si64(a, b) simde_mm_or_si64(a, b) -# define _m_por(a, b) simde_mm_or_si64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_packs_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_packs_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vqmovn_s16(vcombine_s16(a_.neon_i16, b_.neon_i16)); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i8 = packsshb(a_.mmi_i16, b_.mmi_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - if (a_.i16[i] < INT8_MIN) { - r_.i8[i] = INT8_MIN; - } else if (a_.i16[i] > INT8_MAX) { - r_.i8[i] = INT8_MAX; - } else { - r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, a_.i16[i]); - } - } - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - if (b_.i16[i] < INT8_MIN) { - r_.i8[i + 4] = INT8_MIN; - } else if (b_.i16[i] > INT8_MAX) { - r_.i8[i + 4] = INT8_MAX; - } else { - r_.i8[i + 4] = HEDLEY_STATIC_CAST(int8_t, b_.i16[i]); - } - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_packsswb(a, b) simde_mm_packs_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_packs_pi16(a, b) simde_mm_packs_pi16(a, b) -# define _m_packsswb(a, b) simde_mm_packs_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_packs_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_packs_pi32(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vqmovn_s32(vcombine_s32(a_.neon_i32, b_.neon_i32)); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = packsswh(a_.mmi_i32, b_.mmi_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (8 / sizeof(a_.i32[0])) ; i++) { - if (a_.i32[i] < SHRT_MIN) { - r_.i16[i] = SHRT_MIN; - } else if (a_.i32[i] > INT16_MAX) { - r_.i16[i] = INT16_MAX; - } else { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i32[i]); - } - } - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (8 / sizeof(b_.i32[0])) ; i++) { - if (b_.i32[i] < SHRT_MIN) { - r_.i16[i + 2] = SHRT_MIN; - } else if (b_.i32[i] > INT16_MAX) { - r_.i16[i + 2] = INT16_MAX; - } else { - r_.i16[i + 2] = HEDLEY_STATIC_CAST(int16_t, b_.i32[i]); - } - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_packssdw(a, b) simde_mm_packs_pi32(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_packs_pi32(a, b) simde_mm_packs_pi32(a, b) -# define _m_packssdw(a, b) simde_mm_packs_pi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_packs_pu16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_packs_pu16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - const int16x8_t t1 = vcombine_s16(a_.neon_i16, b_.neon_i16); - - /* Set elements which are < 0 to 0 */ - const int16x8_t t2 = vandq_s16(t1, vreinterpretq_s16_u16(vcgezq_s16(t1))); - - /* Vector with all s16 elements set to 
UINT8_MAX */ - const int16x8_t vmax = vmovq_n_s16(HEDLEY_STATIC_CAST(int16_t, UINT8_MAX)); - - /* Elements which are within the acceptable range */ - const int16x8_t le_max = vandq_s16(t2, vreinterpretq_s16_u16(vcleq_s16(t2, vmax))); - const int16x8_t gt_max = vandq_s16(vmax, vreinterpretq_s16_u16(vcgtq_s16(t2, vmax))); - - /* Final values as 16-bit integers */ - const int16x8_t values = vorrq_s16(le_max, gt_max); - - r_.neon_u8 = vmovn_u16(vreinterpretq_u16_s16(values)); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_u8 = packushb(a_.mmi_u16, b_.mmi_u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - if (a_.i16[i] > UINT8_MAX) { - r_.u8[i] = UINT8_MAX; - } else if (a_.i16[i] < 0) { - r_.u8[i] = 0; - } else { - r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, a_.i16[i]); - } - } - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - if (b_.i16[i] > UINT8_MAX) { - r_.u8[i + 4] = UINT8_MAX; - } else if (b_.i16[i] < 0) { - r_.u8[i + 4] = 0; - } else { - r_.u8[i + 4] = HEDLEY_STATIC_CAST(uint8_t, b_.i16[i]); - } - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_packuswb(a, b) simde_mm_packs_pu16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_packs_pu16(a, b) simde_mm_packs_pu16(a, b) -# define _m_packuswb(a, b) simde_mm_packs_pu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_set_pi8 (int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0); - #else - simde__m64_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const int8_t v[sizeof(r_.i8) / sizeof(r_.i8[0])] = { e0, e1, e2, e3, e4, e5, e6, e7 }; - r_.neon_i8 = vld1_s8(v); - #else - r_.i8[0] = e0; - r_.i8[1] = e1; - r_.i8[2] = e2; - r_.i8[3] = e3; - r_.i8[4] = e4; - r_.i8[5] = e5; - r_.i8[6] = e6; - r_.i8[7] = e7; - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_set_pi8(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_x_mm_set_pu8 (uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0) { - simde__m64_private r_; - - #if defined(SIMDE_X86_MMX_NATIVE) - r_.n = _mm_set_pi8( - HEDLEY_STATIC_CAST(int8_t, e7), - HEDLEY_STATIC_CAST(int8_t, e6), - HEDLEY_STATIC_CAST(int8_t, e5), - HEDLEY_STATIC_CAST(int8_t, e4), - HEDLEY_STATIC_CAST(int8_t, e3), - HEDLEY_STATIC_CAST(int8_t, e2), - HEDLEY_STATIC_CAST(int8_t, e1), - HEDLEY_STATIC_CAST(int8_t, e0)); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const uint8_t v[sizeof(r_.u8) / sizeof(r_.u8[0])] = { e0, e1, e2, e3, e4, e5, e6, e7 }; - r_.neon_u8 = vld1_u8(v); - #else - r_.u8[0] = e0; - r_.u8[1] = e1; - r_.u8[2] = e2; - r_.u8[3] = e3; - r_.u8[4] = e4; - r_.u8[5] = e5; - r_.u8[6] = e6; - r_.u8[7] = e7; - #endif - - return simde__m64_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_set_pi16 (int16_t e3, int16_t e2, int16_t e1, int16_t e0) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_set_pi16(e3, e2, e1, e0); - #else - simde__m64_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const int16_t v[sizeof(r_.i16) / sizeof(r_.i16[0])] = { e0, e1, e2, e3 }; - r_.neon_i16 = vld1_s16(v); - #else - r_.i16[0] = e0; - r_.i16[1] = e1; - r_.i16[2] = e2; - r_.i16[3] = e3; - #endif - - return 
simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_set_pi16(e3, e2, e1, e0) simde_mm_set_pi16(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_x_mm_set_pu16 (uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) { - simde__m64_private r_; - -#if defined(SIMDE_X86_MMX_NATIVE) - r_.n = _mm_set_pi16( - HEDLEY_STATIC_CAST(int16_t, e3), - HEDLEY_STATIC_CAST(int16_t, e2), - HEDLEY_STATIC_CAST(int16_t, e1), - HEDLEY_STATIC_CAST(int16_t, e0) - ); -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const uint16_t v[sizeof(r_.u16) / sizeof(r_.u16[0])] = { e0, e1, e2, e3 }; - r_.neon_u16 = vld1_u16(v); -#else - r_.u16[0] = e0; - r_.u16[1] = e1; - r_.u16[2] = e2; - r_.u16[3] = e3; -#endif - - return simde__m64_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_x_mm_set_pu32 (uint32_t e1, uint32_t e0) { - simde__m64_private r_; - -#if defined(SIMDE_X86_MMX_NATIVE) - r_.n = _mm_set_pi32( - HEDLEY_STATIC_CAST(int32_t, e1), - HEDLEY_STATIC_CAST(int32_t, e0)); -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const uint32_t v[sizeof(r_.u32) / sizeof(r_.u32[0])] = { e0, e1 }; - r_.neon_u32 = vld1_u32(v); -#else - r_.u32[0] = e0; - r_.u32[1] = e1; -#endif - - return simde__m64_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_set_pi32 (int32_t e1, int32_t e0) { - simde__m64_private r_; - -#if defined(SIMDE_X86_MMX_NATIVE) - r_.n = _mm_set_pi32(e1, e0); -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const int32_t v[sizeof(r_.i32) / sizeof(r_.i32[0])] = { e0, e1 }; - r_.neon_i32 = vld1_s32(v); -#else - r_.i32[0] = e0; - r_.i32[1] = e1; -#endif - - return simde__m64_from_private(r_); -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_set_pi32(e1, e0) simde_mm_set_pi32(e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_x_mm_set_pi64 (int64_t e0) { - simde__m64_private r_; - -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const int64_t v[sizeof(r_.i64) / sizeof(r_.i64[0])] = { e0 }; - r_.neon_i64 = vld1_s64(v); -#else - r_.i64[0] = e0; -#endif - - return simde__m64_from_private(r_); -} - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_x_mm_set_f32x2 (simde_float32 e1, simde_float32 e0) { - simde__m64_private r_; - -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const simde_float32 v[sizeof(r_.f32) / sizeof(r_.f32[0])] = { e0, e1 }; - r_.neon_f32 = vld1_f32(v); -#else - r_.f32[0] = e0; - r_.f32[1] = e1; -#endif - - return simde__m64_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_set1_pi8 (int8_t a) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_set1_pi8(a); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - simde__m64_private r_; - r_.neon_i8 = vmov_n_s8(a); - return simde__m64_from_private(r_); - #else - return simde_mm_set_pi8(a, a, a, a, a, a, a, a); - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_set1_pi8(a) simde_mm_set1_pi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_set1_pi16 (int16_t a) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_set1_pi16(a); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - simde__m64_private r_; - r_.neon_i16 = vmov_n_s16(a); - return simde__m64_from_private(r_); - #else - return simde_mm_set_pi16(a, a, a, a); - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_set1_pi16(a) simde_mm_set1_pi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_set1_pi32 (int32_t a) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_set1_pi32(a); - #elif 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) - simde__m64_private r_; - r_.neon_i32 = vmov_n_s32(a); - return simde__m64_from_private(r_); - #else - return simde_mm_set_pi32(a, a); - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_set1_pi32(a) simde_mm_set1_pi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_setr_pi8 (int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0); - #else - return simde_mm_set_pi8(e0, e1, e2, e3, e4, e5, e6, e7); - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_pi8(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_setr_pi16 (int16_t e3, int16_t e2, int16_t e1, int16_t e0) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_setr_pi16(e3, e2, e1, e0); - #else - return simde_mm_set_pi16(e0, e1, e2, e3); - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_setr_pi16(e3, e2, e1, e0) simde_mm_setr_pi16(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_setr_pi32 (int32_t e1, int32_t e0) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_setr_pi32(e1, e0); - #else - return simde_mm_set_pi32(e0, e1); - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_setr_pi32(e1, e0) simde_mm_setr_pi32(e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_setzero_si64 (void) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_setzero_si64(); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - simde__m64_private r_; - r_.neon_u32 = vmov_n_u32(0); - return simde__m64_from_private(r_); - #else - return simde_mm_set_pi32(0, 0); - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_setzero_si64() simde_mm_setzero_si64() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_x_mm_load_si64 (const void* mem_addr) { - simde__m64 r; - simde_memcpy(&r, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m64), sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_x_mm_loadu_si64 (const void* mem_addr) { - simde__m64 r; - simde_memcpy(&r, mem_addr, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_x_mm_store_si64 (void* mem_addr, simde__m64 value) { - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m64), &value, sizeof(value)); -} - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_x_mm_storeu_si64 (void* mem_addr, simde__m64 value) { - simde_memcpy(mem_addr, &value, sizeof(value)); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_x_mm_setone_si64 (void) { - return simde_mm_set1_pi32(~INT32_C(0)); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sll_pi16 (simde__m64 a, simde__m64 count) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_sll_pi16(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private count_ = simde__m64_to_private(count); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - HEDLEY_DIAGNOSTIC_PUSH - #if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0) - #pragma clang diagnostic ignored "-Wvector-conversion" - #endif - r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16(HEDLEY_STATIC_CAST(int16_t, vget_lane_u64(count_.neon_u64, 0)))); - HEDLEY_DIAGNOSTIC_POP - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT) - if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) 
- return simde_mm_setzero_si64(); - - r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, count_.u64[0]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 << count_.u64[0]; - #else - if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) { - simde_memset(&r_, 0, sizeof(r_)); - return simde__m64_from_private(r_); - } - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i] << count_.u64[0]); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psllw(a, count) simde_mm_sll_pi16(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_sll_pi16(a, count) simde_mm_sll_pi16(a, count) -# define _m_psllw(a, count) simde_mm_sll_pi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sll_pi32 (simde__m64 a, simde__m64 count) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_sll_pi32(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private count_ = simde__m64_to_private(count); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - HEDLEY_DIAGNOSTIC_PUSH - #if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0) - #pragma clang diagnostic ignored "-Wvector-conversion" - #endif - r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32(HEDLEY_STATIC_CAST(int32_t, vget_lane_u64(count_.neon_u64, 0)))); - HEDLEY_DIAGNOSTIC_POP - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 << count_.u64[0]; - #else - if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) { - simde_memset(&r_, 0, sizeof(r_)); - return simde__m64_from_private(r_); - } - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] << count_.u64[0]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pslld(a, count) simde_mm_sll_pi32(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_sll_pi32(a, count) simde_mm_sll_pi32(a, count) -# define _m_pslld(a, count) simde_mm_sll_pi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_slli_pi16 (simde__m64 a, int count) { - #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) - return _mm_slli_pi16(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = psllh_s(a_.mmi_i16, count); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT) - if (HEDLEY_UNLIKELY(count > 15)) - return simde_mm_setzero_si64(); - - r_.i16 = a_.i16 << HEDLEY_STATIC_CAST(int16_t, count); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 << count; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16((int16_t) count)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i] << count); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psllwi(a, count) simde_mm_slli_pi16(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_slli_pi16(a, count) simde_mm_slli_pi16(a, count) -# define _m_psllwi(a, count) simde_mm_slli_pi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_slli_pi32 (simde__m64 a, int count) { - #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) - return _mm_slli_pi32(a, count); - #else - simde__m64_private r_; - 
simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 << count; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32((int32_t) count)); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = psllw_s(a_.mmi_i32, b_.mmi_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] << count; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pslldi(a, b) simde_mm_slli_pi32(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_slli_pi32(a, count) simde_mm_slli_pi32(a, count) -# define _m_pslldi(a, count) simde_mm_slli_pi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_slli_si64 (simde__m64 a, int count) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_slli_si64(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i64 = a_.i64 << count; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vshl_s64(a_.neon_i64, vmov_n_s64((int64_t) count)); - #else - r_.u64[0] = a_.u64[0] << count; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psllqi(a, count) simde_mm_slli_si64(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_slli_si64(a, count) simde_mm_slli_si64(a, count) -# define _m_psllqi(a, count) simde_mm_slli_si64(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sll_si64 (simde__m64 a, simde__m64 count) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_sll_si64(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private count_ = simde__m64_to_private(count); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vshl_s64(a_.neon_i64, count_.neon_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 << count_.i64; - #else - if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) { - simde_memset(&r_, 0, sizeof(r_)); - return simde__m64_from_private(r_); - } - - r_.u64[0] = a_.u64[0] << count_.u64[0]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psllq(a, count) simde_mm_sll_si64(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_sll_si64(a, count) simde_mm_sll_si64(a, count) -# define _m_psllq(a, count) simde_mm_sll_si64(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_srl_pi16 (simde__m64 a, simde__m64 count) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_srl_pi16(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private count_ = simde__m64_to_private(count); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_BUG_CLANG_POWER9_16x4_BAD_SHIFT) - if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) - return simde_mm_setzero_si64(); - - r_.u16 = a_.u16 >> HEDLEY_STATIC_CAST(uint16_t, count_.u64[0]); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = a_.u16 >> count_.u64[0]; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vshl_u16(a_.neon_u16, vmov_n_s16(-((int16_t) vget_lane_u64(count_.neon_u64, 0)))); - #else - if (HEDLEY_UNLIKELY(count_.u64[0] > 15)) { - simde_memset(&r_, 0, sizeof(r_)); - return simde__m64_from_private(r_); - } - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < sizeof(r_.u16) / sizeof(r_.u16[0]) ; i++) { - r_.u16[i] = a_.u16[i] >> count_.u64[0]; - } - 
#endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psrlw(a, count) simde_mm_srl_pi16(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_srl_pi16(a, count) simde_mm_srl_pi16(a, count) -# define _m_psrlw(a, count) simde_mm_srl_pi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_srl_pi32 (simde__m64 a, simde__m64 count) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_srl_pi32(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private count_ = simde__m64_to_private(count); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = a_.u32 >> count_.u64[0]; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vshl_u32(a_.neon_u32, vmov_n_s32(-((int32_t) vget_lane_u64(count_.neon_u64, 0)))); - #else - if (HEDLEY_UNLIKELY(count_.u64[0] > 31)) { - simde_memset(&r_, 0, sizeof(r_)); - return simde__m64_from_private(r_); - } - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < sizeof(r_.u32) / sizeof(r_.u32[0]) ; i++) { - r_.u32[i] = a_.u32[i] >> count_.u64[0]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psrld(a, count) simde_mm_srl_pi32(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_srl_pi32(a, count) simde_mm_srl_pi32(a, count) -# define _m_psrld(a, count) simde_mm_srl_pi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_srli_pi16 (simde__m64 a, int count) { - #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) - return _mm_srli_pi16(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = a_.u16 >> count; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vshl_u16(a_.neon_u16, vmov_n_s16(-((int16_t) count))); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = psrlh_s(a_.mmi_i16, b_.mmi_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = a_.u16[i] >> count; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psrlwi(a, count) simde_mm_srli_pi16(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_srli_pi16(a, count) simde_mm_srli_pi16(a, count) -# define _m_psrlwi(a, count) simde_mm_srli_pi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_srli_pi32 (simde__m64 a, int count) { - #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) - return _mm_srli_pi32(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = a_.u32 >> count; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vshl_u32(a_.neon_u32, vmov_n_s32(-((int32_t) count))); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = psrlw_s(a_.mmi_i32, b_.mmi_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] >> count; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psrldi(a, count) simde_mm_srli_pi32(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_srli_pi32(a, count) simde_mm_srli_pi32(a, count) -# define _m_psrldi(a, count) simde_mm_srli_pi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_srli_si64 (simde__m64 a, int count) { - #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) - return _mm_srli_si64(a, count); 
- #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u64 = vshl_u64(a_.neon_u64, vmov_n_s64(-count)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u64 = a_.u64 >> count; - #else - r_.u64[0] = a_.u64[0] >> count; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psrlqi(a, count) simde_mm_srli_si64(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_srli_si64(a, count) simde_mm_srli_si64(a, count) -# define _m_psrlqi(a, count) simde_mm_srli_si64(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_srl_si64 (simde__m64 a, simde__m64 count) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_srl_si64(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private count_ = simde__m64_to_private(count); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vshl_u64(a_.neon_u64, vneg_s64(count_.neon_i64)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = a_.u64 >> count_.u64; - #else - if (HEDLEY_UNLIKELY(count_.u64[0] > 63)) { - simde_memset(&r_, 0, sizeof(r_)); - return simde__m64_from_private(r_); - } - - r_.u64[0] = a_.u64[0] >> count_.u64[0]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psrlq(a, count) simde_mm_srl_si64(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_srl_si64(a, count) simde_mm_srl_si64(a, count) -# define _m_psrlq(a, count) simde_mm_srl_si64(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_srai_pi16 (simde__m64 a, int count) { - #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) - return _mm_srai_pi16(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 >> (count & 0xff); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16(-HEDLEY_STATIC_CAST(int16_t, count))); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = psrah_s(a_.mmi_i16, count); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] >> (count & 0xff); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psrawi(a, count) simde_mm_srai_pi16(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_srai_pi16(a, count) simde_mm_srai_pi16(a, count) -# define _m_psrawi(a, count) simde_mm_srai_pi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_srai_pi32 (simde__m64 a, int count) { - #if defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) - return _mm_srai_pi32(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 >> (count & 0xff); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32(-HEDLEY_STATIC_CAST(int32_t, count))); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = psraw_s(a_.mmi_i32, count); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] >> (count & 0xff); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psradi(a, count) simde_mm_srai_pi32(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_srai_pi32(a, count) simde_mm_srai_pi32(a, count) 
-# define _m_psradi(a, count) simde_mm_srai_pi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sra_pi16 (simde__m64 a, simde__m64 count) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_sra_pi16(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private count_ = simde__m64_to_private(count); - const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 15 ? 15 : count_.i64[0])); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 >> cnt; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vshl_s16(a_.neon_i16, vmov_n_s16(-HEDLEY_STATIC_CAST(int16_t, vget_lane_u64(count_.neon_u64, 0)))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] >> cnt; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psraw(a, count) simde_mm_sra_pi16(a, count) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_sra_pi16(a, count) simde_mm_sra_pi16(a, count) -# define _m_psraw(a, count) simde_mm_sra_pi16(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sra_pi32 (simde__m64 a, simde__m64 count) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_sra_pi32(a, count); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private count_ = simde__m64_to_private(count); - const int32_t cnt = (count_.u64[0] > 31) ? 31 : HEDLEY_STATIC_CAST(int32_t, count_.u64[0]); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 >> cnt; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vshl_s32(a_.neon_i32, vmov_n_s32(-HEDLEY_STATIC_CAST(int32_t, vget_lane_u64(count_.neon_u64, 0)))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] >> cnt; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psrad(a, b) simde_mm_sra_pi32(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_sra_pi32(a, count) simde_mm_sra_pi32(a, count) -# define _m_psrad(a, count) simde_mm_sra_pi32(a, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sub_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_sub_pi8(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vsub_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i8 = psubb_s(a_.mmi_i8, b_.mmi_i8); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = a_.i8 - b_.i8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] - b_.i8[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psubb(a, b) simde_mm_sub_pi8(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_sub_pi8(a, b) simde_mm_sub_pi8(a, b) -# define _m_psubb(a, b) simde_mm_sub_pi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sub_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_sub_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vsub_s16(a_.neon_i16, b_.neon_i16); - #elif 
defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = psubh_s(a_.mmi_i16, b_.mmi_i16); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 - b_.i16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] - b_.i16[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psubw(a, b) simde_mm_sub_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_sub_pi16(a, b) simde_mm_sub_pi16(a, b) -# define _m_psubw(a, b) simde_mm_sub_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sub_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_sub_pi32(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vsub_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = psubw_s(a_.mmi_i32, b_.mmi_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 - b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] - b_.i32[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psubd(a, b) simde_mm_sub_pi32(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_sub_pi32(a, b) simde_mm_sub_pi32(a, b) -# define _m_psubd(a, b) simde_mm_sub_pi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_subs_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_subs_pi8(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vqsub_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i8 = psubsb(a_.mmi_i8, b_.mmi_i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - if (((b_.i8[i]) > 0 && (a_.i8[i]) < INT8_MIN + (b_.i8[i]))) { - r_.i8[i] = INT8_MIN; - } else if ((b_.i8[i]) < 0 && (a_.i8[i]) > INT8_MAX + (b_.i8[i])) { - r_.i8[i] = INT8_MAX; - } else { - r_.i8[i] = (a_.i8[i]) - (b_.i8[i]); - } - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psubsb(a, b) simde_mm_subs_pi8(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_subs_pi8(a, b) simde_mm_subs_pi8(a, b) -# define _m_psubsb(a, b) simde_mm_subs_pi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_subs_pu8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_subs_pu8(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vqsub_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_u8 = psubusb(a_.mmi_u8, b_.mmi_u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - const int32_t x = a_.u8[i] - b_.u8[i]; - if (x < 0) { - r_.u8[i] = 0; - } else if (x > UINT8_MAX) { - r_.u8[i] = UINT8_MAX; - } else { - r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, x); - } - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psubusb(a, b) simde_mm_subs_pu8(a, b) -#if 
defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_subs_pu8(a, b) simde_mm_subs_pu8(a, b) -# define _m_psubusb(a, b) simde_mm_subs_pu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_subs_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_subs_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vqsub_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = psubsh(a_.mmi_i16, b_.mmi_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - if (((b_.i16[i]) > 0 && (a_.i16[i]) < SHRT_MIN + (b_.i16[i]))) { - r_.i16[i] = SHRT_MIN; - } else if ((b_.i16[i]) < 0 && (a_.i16[i]) > INT16_MAX + (b_.i16[i])) { - r_.i16[i] = INT16_MAX; - } else { - r_.i16[i] = (a_.i16[i]) - (b_.i16[i]); - } - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psubsw(a, b) simde_mm_subs_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_subs_pi16(a, b) simde_mm_subs_pi16(a, b) -# define _m_psubsw(a, b) simde_mm_subs_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_subs_pu16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_subs_pu16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vqsub_u16(a_.neon_u16, b_.neon_u16); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_u16 = psubush(a_.mmi_u16, b_.mmi_u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - const int x = a_.u16[i] - b_.u16[i]; - if (x < 0) { - r_.u16[i] = 0; - } else if (x > UINT16_MAX) { - r_.u16[i] = UINT16_MAX; - } else { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, x); - } - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psubusw(a, b) simde_mm_subs_pu16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_subs_pu16(a, b) simde_mm_subs_pu16(a, b) -# define _m_psubusw(a, b) simde_mm_subs_pu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_unpackhi_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_unpackhi_pi8(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i8 = vzip2_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i8 = punpckhbh_s(a_.mmi_i8, b_.mmi_i8); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 4, 12, 5, 13, 6, 14, 7, 15); - #else - r_.i8[0] = a_.i8[4]; - r_.i8[1] = b_.i8[4]; - r_.i8[2] = a_.i8[5]; - r_.i8[3] = b_.i8[5]; - r_.i8[4] = a_.i8[6]; - r_.i8[5] = b_.i8[6]; - r_.i8[6] = a_.i8[7]; - r_.i8[7] = b_.i8[7]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_unpackhi_pi8(a, b) simde_mm_unpackhi_pi8(a, b) -# define _m_punpckhbw(a, b) simde_mm_unpackhi_pi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_unpackhi_pi16 (simde__m64 a, simde__m64 b) { - #if 
defined(SIMDE_X86_MMX_NATIVE) - return _mm_unpackhi_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i16 = vzip2_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = punpckhhw_s(a_.mmi_i16, b_.mmi_i16); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 2, 6, 3, 7); - #else - r_.i16[0] = a_.i16[2]; - r_.i16[1] = b_.i16[2]; - r_.i16[2] = a_.i16[3]; - r_.i16[3] = b_.i16[3]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_unpackhi_pi16(a, b) simde_mm_unpackhi_pi16(a, b) -# define _m_punpckhwd(a, b) simde_mm_unpackhi_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_unpackhi_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_unpackhi_pi32(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i32 = vzip2_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = punpckhwd_s(a_.mmi_i32, b_.mmi_i32); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 1, 3); - #else - r_.i32[0] = a_.i32[1]; - r_.i32[1] = b_.i32[1]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_unpackhi_pi32(a, b) simde_mm_unpackhi_pi32(a, b) -# define _m_punpckhdq(a, b) simde_mm_unpackhi_pi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_unpacklo_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_unpacklo_pi8(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i8 = vzip1_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i8 = punpcklbh_s(a_.mmi_i8, b_.mmi_i8); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 8, a_.i8, b_.i8, 0, 8, 1, 9, 2, 10, 3, 11); - #else - r_.i8[0] = a_.i8[0]; - r_.i8[1] = b_.i8[0]; - r_.i8[2] = a_.i8[1]; - r_.i8[3] = b_.i8[1]; - r_.i8[4] = a_.i8[2]; - r_.i8[5] = b_.i8[2]; - r_.i8[6] = a_.i8[3]; - r_.i8[7] = b_.i8[3]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_unpacklo_pi8(a, b) simde_mm_unpacklo_pi8(a, b) -# define _m_punpcklbw(a, b) simde_mm_unpacklo_pi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_unpacklo_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_unpacklo_pi16(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i16 = vzip1_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i16 = punpcklhw_s(a_.mmi_i16, b_.mmi_i16); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = 
SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 0, 4, 1, 5); - #else - r_.i16[0] = a_.i16[0]; - r_.i16[1] = b_.i16[0]; - r_.i16[2] = a_.i16[1]; - r_.i16[3] = b_.i16[1]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_unpacklo_pi16(a, b) simde_mm_unpacklo_pi16(a, b) -# define _m_punpcklwd(a, b) simde_mm_unpacklo_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_unpacklo_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_unpacklo_pi32(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i32 = vzip1_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) - r_.mmi_i32 = punpcklwd_s(a_.mmi_i32, b_.mmi_i32); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 0, 2); - #else - r_.i32[0] = a_.i32[0]; - r_.i32[1] = b_.i32[0]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_unpacklo_pi32(a, b) simde_mm_unpacklo_pi32(a, b) -# define _m_punpckldq(a, b) simde_mm_unpacklo_pi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_xor_si64 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _mm_xor_si64(a, b); - #else - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = veor_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #else - r_.u64[0] = a_.u64[0] ^ b_.u64[0]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pxor(a, b) simde_mm_xor_si64(a, b) -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _mm_xor_si64(a, b) simde_mm_xor_si64(a, b) -# define _m_pxor(a, b) simde_mm_xor_si64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_m_to_int (simde__m64 a) { - #if defined(SIMDE_X86_MMX_NATIVE) - return _m_to_int(a); - #else - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - HEDLEY_DIAGNOSTIC_PUSH - #if HEDLEY_HAS_WARNING("-Wvector-conversion") && SIMDE_DETECT_CLANG_VERSION_NOT(10,0,0) - #pragma clang diagnostic ignored "-Wvector-conversion" - #endif - return vget_lane_s32(a_.neon_i32, 0); - HEDLEY_DIAGNOSTIC_POP - #else - return a_.i32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_MMX_ENABLE_NATIVE_ALIASES) -# define _m_to_int(a) simde_m_to_int(a) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_MMX_H) */ diff --git a/ffi-deps/simde/simde/x86/sse.h b/ffi-deps/simde/simde/x86/sse.h deleted file mode 100644 index a3e060a..0000000 --- a/ffi-deps/simde/simde/x86/sse.h +++ /dev/null @@ -1,4830 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the 
Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2017-2020 Evan Nemerson - * 2015-2017 John W. Ratcliff - * 2015 Brandon Rowlett - * 2015 Ken Fast - */ - -#if !defined(SIMDE_X86_SSE_H) -#define SIMDE_X86_SSE_H - -#include "mmx.h" -#include "../simde-f16.h" - -#if defined(_WIN32) && !defined(SIMDE_X86_SSE_NATIVE) && defined(_MSC_VER) - #define NOMINMAX - #include -#endif - -#if defined(__ARM_ACLE) - #include -#endif - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_ALIGN_TO_16 int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_16 simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #endif - #if defined(SIMDE_FLOAT16_VECTOR) - SIMDE_ALIGN_TO_16 simde_float16 f16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_16 simde_float16 f16[8]; - #endif - SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_16 int8_t i8[16]; - SIMDE_ALIGN_TO_16 int16_t i16[8]; - SIMDE_ALIGN_TO_16 int32_t i32[4]; - SIMDE_ALIGN_TO_16 int64_t i64[2]; - SIMDE_ALIGN_TO_16 uint8_t u8[16]; - SIMDE_ALIGN_TO_16 uint16_t u16[8]; - SIMDE_ALIGN_TO_16 uint32_t u32[4]; - SIMDE_ALIGN_TO_16 uint64_t u64[2]; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_16 simde_int128 i128[1]; - SIMDE_ALIGN_TO_16 simde_uint128 u128[1]; - #endif - SIMDE_ALIGN_TO_16 simde_float16 f16[8]; - SIMDE_ALIGN_TO_16 simde_float32 f32[4]; - SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)]; - SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)]; - #endif - - SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2]; - SIMDE_ALIGN_TO_16 simde__m64 m64[2]; - - #if defined(SIMDE_X86_SSE_NATIVE) - SIMDE_ALIGN_TO_16 __m128 n; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_TO_16 int8x16_t neon_i8; - SIMDE_ALIGN_TO_16 int16x8_t neon_i16; - SIMDE_ALIGN_TO_16 int32x4_t neon_i32; - SIMDE_ALIGN_TO_16 int64x2_t neon_i64; - SIMDE_ALIGN_TO_16 uint8x16_t neon_u8; - SIMDE_ALIGN_TO_16 uint16x8_t neon_u16; - SIMDE_ALIGN_TO_16 uint32x4_t neon_u32; - SIMDE_ALIGN_TO_16 uint64x2_t neon_u64; - SIMDE_ALIGN_TO_16 
float32x4_t neon_f32; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - SIMDE_ALIGN_TO_16 float64x2_t neon_f64; - #endif - #elif defined(SIMDE_MIPS_MSA_NATIVE) - v16i8 msa_i8; - v8i16 msa_i16; - v4i32 msa_i32; - v2i64 msa_i64; - v16u8 msa_u8; - v8u16 msa_u16; - v4u32 msa_u32; - v2u64 msa_u64; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - SIMDE_ALIGN_TO_16 v128_t wasm_v128; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32; - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64; - #endif - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - v16i8 lsx_i8; - v8i16 lsx_i16; - v4i32 lsx_i32; - v2i64 lsx_i64; - v16u8 lsx_u8; - v8u16 lsx_u16; - v4u32 lsx_u32; - v2u64 lsx_u64; - v4f32 lsx_f32; - v2f64 lsx_f64; - #endif -} simde__m128_private; - -#if defined(SIMDE_X86_SSE_NATIVE) - typedef __m128 simde__m128; -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - typedef float32x4_t simde__m128; -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - typedef v128_t simde__m128; -#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128; -#elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - typedef v4f32 simde__m128; -#elif defined(SIMDE_VECTOR_SUBSCRIPT) - typedef simde_float32 simde__m128 SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; -#else - typedef simde__m128_private simde__m128; -#endif - -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - typedef simde__m128 __m128; -#endif - -HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128), "simde__m128 size incorrect"); -HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128_private), "simde__m128_private size incorrect"); -#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF) -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128) == 16, "simde__m128 is not 16-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128_private) == 16, "simde__m128_private is not 16-byte aligned"); -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde__m128_from_private(simde__m128_private v) { - simde__m128 r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128_private -simde__m128_to_private(simde__m128 v) { - simde__m128_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int8x16_t, neon, i8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int16x8_t, neon, i16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int32x4_t, neon, i32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, int64x2_t, neon, i64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint8x16_t, neon, u8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint16x8_t, neon, u16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint32x4_t, neon, 
u32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, uint64x2_t, neon, u64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, float32x4_t, neon, f32) - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, float64x2_t, neon, f64) - #endif -#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ - -#if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32) - - #if defined(SIMDE_BUG_GCC_95782) - SIMDE_FUNCTION_ATTRIBUTES - SIMDE_POWER_ALTIVEC_VECTOR(float) - simde__m128_to_altivec_f32(simde__m128 value) { - simde__m128_private r_ = simde__m128_to_private(value); - return r_.altivec_f32; - } - - SIMDE_FUNCTION_ATTRIBUTES - simde__m128 - simde__m128_from_altivec_f32(SIMDE_POWER_ALTIVEC_VECTOR(float) value) { - simde__m128_private r_; - r_.altivec_f32 = value; - return simde__m128_from_private(r_); - } - #else - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(float), altivec, f32) - #endif - - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64) - #endif -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v128_t, wasm, v128); -#endif /* defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) */ - -#if defined(SIMDE_LOONGARCH_LSX_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v16i8, lsx, i8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v8i16, lsx, i16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v4i32, lsx, i32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v2i64, lsx, i64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v16u8, lsx, u8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v8u16, lsx, u16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v4u32, lsx, u32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v2u64, lsx, u64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v4f32, lsx, f32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128, v2f64, lsx, f64) -#endif /* defined(SIMDE_LOONGARCH_LSX_NATIVE) */ - -enum { - #if defined(SIMDE_X86_SSE_NATIVE) - SIMDE_MM_ROUND_NEAREST = _MM_ROUND_NEAREST, - SIMDE_MM_ROUND_DOWN = _MM_ROUND_DOWN, - SIMDE_MM_ROUND_UP = _MM_ROUND_UP, - SIMDE_MM_ROUND_TOWARD_ZERO = _MM_ROUND_TOWARD_ZERO - #else - SIMDE_MM_ROUND_NEAREST = 0x0000, - SIMDE_MM_ROUND_DOWN = 0x2000, - SIMDE_MM_ROUND_UP = 0x4000, - SIMDE_MM_ROUND_TOWARD_ZERO = 0x6000 - #endif -}; -#if defined(_MM_ROUND_MASK) -# define SIMDE_MM_ROUND_MASK _MM_ROUND_MASK -#else -# define SIMDE_MM_ROUND_MASK (0x6000) -#endif -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _MM_ROUND_MASK SIMDE_MM_ROUND_MASK -#endif - -#if defined(_MM_FROUND_TO_NEAREST_INT) -# define SIMDE_MM_FROUND_TO_NEAREST_INT _MM_FROUND_TO_NEAREST_INT -# define SIMDE_MM_FROUND_TO_NEG_INF 
_MM_FROUND_TO_NEG_INF -# define SIMDE_MM_FROUND_TO_POS_INF _MM_FROUND_TO_POS_INF -# define SIMDE_MM_FROUND_TO_ZERO _MM_FROUND_TO_ZERO -# define SIMDE_MM_FROUND_CUR_DIRECTION _MM_FROUND_CUR_DIRECTION - -# define SIMDE_MM_FROUND_RAISE_EXC _MM_FROUND_RAISE_EXC -# define SIMDE_MM_FROUND_NO_EXC _MM_FROUND_NO_EXC -#else -# define SIMDE_MM_FROUND_TO_NEAREST_INT 0x00 -# define SIMDE_MM_FROUND_TO_NEG_INF 0x01 -# define SIMDE_MM_FROUND_TO_POS_INF 0x02 -# define SIMDE_MM_FROUND_TO_ZERO 0x03 -# define SIMDE_MM_FROUND_CUR_DIRECTION 0x04 - -# define SIMDE_MM_FROUND_RAISE_EXC 0x00 -# define SIMDE_MM_FROUND_NO_EXC 0x08 -#endif - -#define SIMDE_MM_FROUND_NINT \ - (SIMDE_MM_FROUND_TO_NEAREST_INT | SIMDE_MM_FROUND_RAISE_EXC) -#define SIMDE_MM_FROUND_FLOOR \ - (SIMDE_MM_FROUND_TO_NEG_INF | SIMDE_MM_FROUND_RAISE_EXC) -#define SIMDE_MM_FROUND_CEIL \ - (SIMDE_MM_FROUND_TO_POS_INF | SIMDE_MM_FROUND_RAISE_EXC) -#define SIMDE_MM_FROUND_TRUNC \ - (SIMDE_MM_FROUND_TO_ZERO | SIMDE_MM_FROUND_RAISE_EXC) -#define SIMDE_MM_FROUND_RINT \ - (SIMDE_MM_FROUND_CUR_DIRECTION | SIMDE_MM_FROUND_RAISE_EXC) -#define SIMDE_MM_FROUND_NEARBYINT \ - (SIMDE_MM_FROUND_CUR_DIRECTION | SIMDE_MM_FROUND_NO_EXC) - -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) && !defined(_MM_FROUND_TO_NEAREST_INT) -# define _MM_FROUND_TO_NEAREST_INT SIMDE_MM_FROUND_TO_NEAREST_INT -# define _MM_FROUND_TO_NEG_INF SIMDE_MM_FROUND_TO_NEG_INF -# define _MM_FROUND_TO_POS_INF SIMDE_MM_FROUND_TO_POS_INF -# define _MM_FROUND_TO_ZERO SIMDE_MM_FROUND_TO_ZERO -# define _MM_FROUND_CUR_DIRECTION SIMDE_MM_FROUND_CUR_DIRECTION -# define _MM_FROUND_RAISE_EXC SIMDE_MM_FROUND_RAISE_EXC -# define _MM_FROUND_NINT SIMDE_MM_FROUND_NINT -# define _MM_FROUND_FLOOR SIMDE_MM_FROUND_FLOOR -# define _MM_FROUND_CEIL SIMDE_MM_FROUND_CEIL -# define _MM_FROUND_TRUNC SIMDE_MM_FROUND_TRUNC -# define _MM_FROUND_RINT SIMDE_MM_FROUND_RINT -# define _MM_FROUND_NEARBYINT SIMDE_MM_FROUND_NEARBYINT -#endif - -#if defined(_MM_EXCEPT_INVALID) -# define SIMDE_MM_EXCEPT_INVALID _MM_EXCEPT_INVALID -#else -# define SIMDE_MM_EXCEPT_INVALID (0x0001) -#endif -#if defined(_MM_EXCEPT_DENORM) -# define SIMDE_MM_EXCEPT_DENORM _MM_EXCEPT_DENORM -#else -# define SIMDE_MM_EXCEPT_DENORM (0x0002) -#endif -#if defined(_MM_EXCEPT_DIV_ZERO) -# define SIMDE_MM_EXCEPT_DIV_ZERO _MM_EXCEPT_DIV_ZERO -#else -# define SIMDE_MM_EXCEPT_DIV_ZERO (0x0004) -#endif -#if defined(_MM_EXCEPT_OVERFLOW) -# define SIMDE_MM_EXCEPT_OVERFLOW _MM_EXCEPT_OVERFLOW -#else -# define SIMDE_MM_EXCEPT_OVERFLOW (0x0008) -#endif -#if defined(_MM_EXCEPT_UNDERFLOW) -# define SIMDE_MM_EXCEPT_UNDERFLOW _MM_EXCEPT_UNDERFLOW -#else -# define SIMDE_MM_EXCEPT_UNDERFLOW (0x0010) -#endif -#if defined(_MM_EXCEPT_INEXACT) -# define SIMDE_MM_EXCEPT_INEXACT _MM_EXCEPT_INEXACT -#else -# define SIMDE_MM_EXCEPT_INEXACT (0x0020) -#endif -#if defined(_MM_EXCEPT_MASK) -# define SIMDE_MM_EXCEPT_MASK _MM_EXCEPT_MASK -#else -# define SIMDE_MM_EXCEPT_MASK \ - (SIMDE_MM_EXCEPT_INVALID | SIMDE_MM_EXCEPT_DENORM | \ - SIMDE_MM_EXCEPT_DIV_ZERO | SIMDE_MM_EXCEPT_OVERFLOW | \ - SIMDE_MM_EXCEPT_UNDERFLOW | SIMDE_MM_EXCEPT_INEXACT) -#endif -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _MM_EXCEPT_INVALID SIMDE_MM_EXCEPT_INVALID - #define _MM_EXCEPT_DENORM SIMDE_MM_EXCEPT_DENORM - #define _MM_EXCEPT_DIV_ZERO SIMDE_MM_EXCEPT_DIV_ZERO - #define _MM_EXCEPT_OVERFLOW SIMDE_MM_EXCEPT_OVERFLOW - #define _MM_EXCEPT_UNDERFLOW SIMDE_MM_EXCEPT_UNDERFLOW - #define _MM_EXCEPT_INEXACT SIMDE_MM_EXCEPT_INEXACT - #define _MM_EXCEPT_MASK SIMDE_MM_EXCEPT_MASK -#endif - -#if 
defined(_MM_MASK_INVALID) -# define SIMDE_MM_MASK_INVALID _MM_MASK_INVALID -#else -# define SIMDE_MM_MASK_INVALID (0x0080) -#endif -#if defined(_MM_MASK_DENORM) -# define SIMDE_MM_MASK_DENORM _MM_MASK_DENORM -#else -# define SIMDE_MM_MASK_DENORM (0x0100) -#endif -#if defined(_MM_MASK_DIV_ZERO) -# define SIMDE_MM_MASK_DIV_ZERO _MM_MASK_DIV_ZERO -#else -# define SIMDE_MM_MASK_DIV_ZERO (0x0200) -#endif -#if defined(_MM_MASK_OVERFLOW) -# define SIMDE_MM_MASK_OVERFLOW _MM_MASK_OVERFLOW -#else -# define SIMDE_MM_MASK_OVERFLOW (0x0400) -#endif -#if defined(_MM_MASK_UNDERFLOW) -# define SIMDE_MM_MASK_UNDERFLOW _MM_MASK_UNDERFLOW -#else -# define SIMDE_MM_MASK_UNDERFLOW (0x0800) -#endif -#if defined(_MM_MASK_INEXACT) -# define SIMDE_MM_MASK_INEXACT _MM_MASK_INEXACT -#else -# define SIMDE_MM_MASK_INEXACT (0x1000) -#endif -#if defined(_MM_MASK_MASK) -# define SIMDE_MM_MASK_MASK _MM_MASK_MASK -#else -# define SIMDE_MM_MASK_MASK \ - (SIMDE_MM_MASK_INVALID | SIMDE_MM_MASK_DENORM | \ - SIMDE_MM_MASK_DIV_ZERO | SIMDE_MM_MASK_OVERFLOW | \ - SIMDE_MM_MASK_UNDERFLOW | SIMDE_MM_MASK_INEXACT) -#endif -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _MM_MASK_INVALID SIMDE_MM_MASK_INVALID - #define _MM_MASK_DENORM SIMDE_MM_MASK_DENORM - #define _MM_MASK_DIV_ZERO SIMDE_MM_MASK_DIV_ZERO - #define _MM_MASK_OVERFLOW SIMDE_MM_MASK_OVERFLOW - #define _MM_MASK_UNDERFLOW SIMDE_MM_MASK_UNDERFLOW - #define _MM_MASK_INEXACT SIMDE_MM_MASK_INEXACT - #define _MM_MASK_MASK SIMDE_MM_MASK_MASK -#endif - -#if defined(_MM_FLUSH_ZERO_MASK) -# define SIMDE_MM_FLUSH_ZERO_MASK _MM_FLUSH_ZERO_MASK -#else -# define SIMDE_MM_FLUSH_ZERO_MASK (0x8000) -#endif -#if defined(_MM_FLUSH_ZERO_ON) -# define SIMDE_MM_FLUSH_ZERO_ON _MM_FLUSH_ZERO_ON -#else -# define SIMDE_MM_FLUSH_ZERO_ON (0x8000) -#endif -#if defined(_MM_FLUSH_ZERO_OFF) -# define SIMDE_MM_FLUSH_ZERO_OFF _MM_FLUSH_ZERO_OFF -#else -# define SIMDE_MM_FLUSH_ZERO_OFF (0x0000) -#endif -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _MM_FLUSH_ZERO_MASK SIMDE_MM_FLUSH_ZERO_MASK - #define _MM_FLUSH_ZERO_ON SIMDE_MM_FLUSH_ZERO_ON - #define _MM_FLUSH_ZERO_OFF SIMDE_MM_FLUSH_ZERO_OFF -#endif - -SIMDE_FUNCTION_ATTRIBUTES -uint32_t -SIMDE_MM_GET_ROUNDING_MODE(void) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _MM_GET_ROUNDING_MODE(); - #elif defined(SIMDE_HAVE_FENV_H) - unsigned int vfe_mode; - - switch (fegetround()) { - #if defined(FE_TONEAREST) - case FE_TONEAREST: - vfe_mode = SIMDE_MM_ROUND_NEAREST; - break; - #endif - - #if defined(FE_TOWARDZERO) - case FE_TOWARDZERO: - vfe_mode = SIMDE_MM_ROUND_DOWN; - break; - #endif - - #if defined(FE_UPWARD) - case FE_UPWARD: - vfe_mode = SIMDE_MM_ROUND_UP; - break; - #endif - - #if defined(FE_DOWNWARD) - case FE_DOWNWARD: - vfe_mode = SIMDE_MM_ROUND_TOWARD_ZERO; - break; - #endif - - default: - vfe_mode = SIMDE_MM_ROUND_NEAREST; - break; - } - - return vfe_mode; - #else - return SIMDE_MM_ROUND_NEAREST; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _MM_GET_ROUNDING_MODE() SIMDE_MM_GET_ROUNDING_MODE() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -SIMDE_MM_SET_ROUNDING_MODE(uint32_t a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _MM_SET_ROUNDING_MODE(a); - #elif defined(SIMDE_HAVE_FENV_H) - int fe_mode = FE_TONEAREST; - - switch (a) { - #if defined(FE_TONEAREST) - case SIMDE_MM_ROUND_NEAREST: - fe_mode = FE_TONEAREST; - break; - #endif - - #if defined(FE_TOWARDZERO) - case SIMDE_MM_ROUND_TOWARD_ZERO: - fe_mode = FE_TOWARDZERO; - break; - #endif - - #if defined(FE_DOWNWARD) - case SIMDE_MM_ROUND_DOWN: - 
fe_mode = FE_DOWNWARD; - break; - #endif - - #if defined(FE_UPWARD) - case SIMDE_MM_ROUND_UP: - fe_mode = FE_UPWARD; - break; - #endif - - default: - return; - } - - fesetround(fe_mode); - #else - (void) a; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _MM_SET_ROUNDING_MODE(a) SIMDE_MM_SET_ROUNDING_MODE(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -uint32_t -SIMDE_MM_GET_FLUSH_ZERO_MODE (void) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_getcsr() & _MM_FLUSH_ZERO_MASK; - #else - return SIMDE_MM_FLUSH_ZERO_OFF; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _MM_GET_FLUSH_ZERO_MODE(a) SIMDE_MM_GET_FLUSH_ZERO_MODE(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -SIMDE_MM_SET_FLUSH_ZERO_MODE (uint32_t a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _MM_SET_FLUSH_ZERO_MODE(a); - #else - (void) a; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _MM_SET_FLUSH_ZERO_MODE(a) SIMDE_MM_SET_FLUSH_ZERO_MODE(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -uint32_t -simde_mm_getcsr (void) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_getcsr(); - #else - return SIMDE_MM_GET_ROUNDING_MODE(); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _mm_getcsr() simde_mm_getcsr() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_setcsr (uint32_t a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_setcsr(a); - #else - SIMDE_MM_SET_ROUNDING_MODE(HEDLEY_STATIC_CAST(uint32_t, a & SIMDE_MM_ROUND_MASK)); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _mm_setcsr(a) simde_mm_setcsr(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_round_ps (simde__m128 a, int rounding, int lax_rounding) - SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) - SIMDE_REQUIRE_CONSTANT_RANGE(lax_rounding, 0, 1) { - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - (void) lax_rounding; - - /* For architectures which lack a current direction SIMD instruction. - * - * Note that NEON actually has a current rounding mode instruction, - * but in ARMv8+ the rounding mode is ignored and nearest is always - * used, so we treat ARMv7 as having a rounding mode but ARMv8 as - * not. 
*/ - #if \ - defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ - defined(SIMDE_ARM_NEON_A32V8) - if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION) - rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) << 13; - #endif - - switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { - case SIMDE_MM_FROUND_CUR_DIRECTION: - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_round(a_.altivec_f32)); - #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399) - r_.neon_f32 = vrndiq_f32(a_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_nearest(a_.wasm_v128); - #elif defined(simde_math_nearbyintf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_nearbyintf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_NEAREST_INT: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_rint(a_.altivec_f32)); - #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) - r_.neon_f32 = vrndnq_f32(a_.neon_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfrintrne_s(a_.lsx_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_nearest(a_.wasm_v128); - #elif defined(simde_math_roundevenf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_roundevenf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_NEG_INF: - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_floor(a_.altivec_f32)); - #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) - r_.neon_f32 = vrndmq_f32(a_.neon_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfrintrm_s(a_.lsx_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_floor(a_.wasm_v128); - #elif defined(simde_math_floorf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_floorf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_POS_INF: - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_ceil(a_.altivec_f32)); - #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) - r_.neon_f32 = vrndpq_f32(a_.neon_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfrintrp_s(a_.lsx_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_ceil(a_.wasm_v128); - #elif defined(simde_math_ceilf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_ceilf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_ZERO: - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_trunc(a_.altivec_f32)); - #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) - r_.neon_f32 = vrndq_f32(a_.neon_f32); 
- #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfrintrz_s(a_.lsx_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_trunc(a_.wasm_v128); - #elif defined(simde_math_truncf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_truncf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - #define simde_mm_round_ps(a, rounding) _mm_round_ps((a), (rounding)) -#else - #define simde_mm_round_ps(a, rounding) simde_x_mm_round_ps((a), (rounding), 0) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #define _mm_round_ps(a, rounding) simde_mm_round_ps((a), (rounding)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_set_ps (simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_set_ps(e3, e2, e1, e0); - #else - simde__m128_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_TO_16 simde_float32 data[4] = { e0, e1, e2, e3 }; - r_.neon_f32 = vld1q_f32(data); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_make(e0, e1, e2, e3); - #else - r_.f32[0] = e0; - r_.f32[1] = e1; - r_.f32[2] = e2; - r_.f32[3] = e3; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_set_ps(e3, e2, e1, e0) simde_mm_set_ps(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_set_ps1 (simde_float32 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_set_ps1(a); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vdupq_n_f32(a); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - (void) a; - return vec_splats(a); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - return (simde__m128)__lsx_vldrepl_w(&a, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_splat(a); - #else - return simde_mm_set_ps(a, a, a, a); - #endif -} -#define simde_mm_set1_ps(a) simde_mm_set_ps1(a) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_set_ps1(a) simde_mm_set_ps1(a) -# define _mm_set1_ps(a) simde_mm_set1_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_move_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_move_ss(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vsetq_lane_f32(vgetq_lane_f32(b_.neon_f32, 0), a_.neon_f32, 0); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) m = { ~0U, 0U, 0U, 0U }; - r_.altivec_f32 = vec_sel(a_.altivec_f32, b_.altivec_f32, m); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_shuffle(b_.wasm_v128, a_.wasm_v128, 0, 1, 2, 3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vextrins_w(a_.lsx_i64, b_.lsx_i64, 0); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 4, 1, 2, 3); - #else - r_.f32[0] = b_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if 
defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_move_ss(a, b) simde_mm_move_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_broadcastlow_ps(simde__m128 a) { - /* This function broadcasts the first element in the input vector to - * all lanes. It is used to avoid generating spurious exceptions in - * *_ss functions since there may be garbage in the upper lanes. */ - - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_shuffle_ps(a, a, 0); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vdupq_laneq_f32(a_.neon_f32, 0); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = vec_splat(a_.altivec_f32, 0); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vreplvei_w(a_.lsx_i64, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_splat(a_.f32[0]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 0, 0); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[0]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_add_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_add_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vaddq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_add(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = vec_add(a_.altivec_f32, b_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f32 = __lsx_vfadd_s(a_.lsx_f32, b_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 + b_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i] + b_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_add_ps(a, b) simde_mm_add_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_add_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_add_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_add_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_add_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32_t b0 = vgetq_lane_f32(b_.neon_f32, 0); - float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); - // the upper values in the result must be the remnants of <a>.
- r_.neon_f32 = vaddq_f32(a_.neon_f32, value); - #else - r_.f32[0] = a_.f32[0] + b_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_add_ss(a, b) simde_mm_add_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_and_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_and_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vand_v(a_.lsx_i64, b_.lsx_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 & b_.i32; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = vec_and(a_.altivec_f32, b_.altivec_f32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] & b_.i32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_and_ps(a, b) simde_mm_and_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_andnot_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_andnot_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = vec_andc(b_.altivec_f32, a_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vandn_v(a_.lsx_i64, b_.lsx_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = ~a_.i32 & b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ~(a_.i32[i]) & b_.i32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_andnot_ps(a, b) simde_mm_andnot_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_xor_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_xor_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vxor_v(a_.lsx_i64, b_.lsx_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] ^ b_.u32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_xor_ps(a, b) simde_mm_xor_ps((a), (b)) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_or_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_or_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vor_v(a_.lsx_i64, b_.lsx_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f | b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] | b_.u32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_or_ps(a, b) simde_mm_or_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_not_ps(simde__m128 a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - __m128i ai = _mm_castps_si128(a); - return _mm_castsi128_ps(_mm_ternarylogic_epi32(ai, ai, ai, 0x55)); - #elif defined(SIMDE_X86_SSE2_NATIVE) - /* Note: we use ints instead of floats because we don't want cmpeq - * to return false for (NaN, NaN) */ - __m128i ai = _mm_castps_si128(a); - return _mm_castsi128_ps(_mm_andnot_si128(ai, _mm_cmpeq_epi32(ai, ai))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vmvnq_s32(a_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_not(a_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vnor_v(a_.lsx_i64, a_.lsx_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = ~a_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = ~(a_.i32[i]); - } - #endif - - return simde__m128_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_select_ps(simde__m128 a, simde__m128 b, simde__m128 mask) { - /* This function is for when you want to blend two elements together - * according to a mask. It is similar to _mm_blendv_ps, except that - * it is undefined whether the blend is based on the highest bit in - * each lane (like blendv) or just bitwise operations. This allows - * us to implement the function efficiently everywhere. - * - * Basically, you promise that all the lanes in mask are either 0 or - * ~0. 
*/ - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_blendv_ps(a, b, mask); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b), - mask_ = simde__m128_to_private(mask); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vbslq_s32(mask_.neon_u32, b_.neon_i32, a_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_bitselect(b_.wasm_v128, a_.wasm_v128, mask_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, mask_.altivec_u32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vbitsel_v(a_.lsx_i64, b_.lsx_i64, mask_.lsx_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 ^ ((a_.i32 ^ b_.i32) & mask_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] ^ ((a_.i32[i] ^ b_.i32[i]) & mask_.i32[i]); - } - #endif - - return simde__m128_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_avg_pu16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_avg_pu16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vrhadd_u16(b_.neon_u16, a_.neon_u16); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) - uint32_t wa SIMDE_VECTOR(16); - uint32_t wb SIMDE_VECTOR(16); - uint32_t wr SIMDE_VECTOR(16); - SIMDE_CONVERT_VECTOR_(wa, a_.u16); - SIMDE_CONVERT_VECTOR_(wb, b_.u16); - wr = (wa + wb + 1) >> 1; - SIMDE_CONVERT_VECTOR_(r_.u16, wr); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pavgw(a, b) simde_mm_avg_pu16(a, b) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_avg_pu16(a, b) simde_mm_avg_pu16(a, b) -# define _m_pavgw(a, b) simde_mm_avg_pu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_avg_pu8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_avg_pu8(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vrhadd_u8(b_.neon_u8, a_.neon_u8); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_GCC_100761) - uint16_t wa SIMDE_VECTOR(16); - uint16_t wb SIMDE_VECTOR(16); - uint16_t wr SIMDE_VECTOR(16); - SIMDE_CONVERT_VECTOR_(wa, a_.u8); - SIMDE_CONVERT_VECTOR_(wb, b_.u8); - wr = (wa + wb + 1) >> 1; - SIMDE_CONVERT_VECTOR_(r_.u8, wr); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pavgb(a, b) simde_mm_avg_pu8(a, b) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_avg_pu8(a, b) simde_mm_avg_pu8(a, b) -# define _m_pavgb(a, b) simde_mm_avg_pu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_abs_ps(simde__m128 a) { - #if 
defined(SIMDE_X86_SSE_NATIVE) - simde_float32 mask_; - uint32_t u32_ = UINT32_C(0x7FFFFFFF); - simde_memcpy(&mask_, &u32_, sizeof(u32_)); - return _mm_and_ps(_mm_set1_ps(mask_), a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vabsq_f32(a_.neon_f32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = vec_abs(a_.altivec_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_abs(a_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_fabsf(a_.f32[i]); - } - #endif - - return simde__m128_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpeq_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmpeq_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vceqq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_eq(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpeq(a_.altivec_f32, b_.altivec_f32)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcmp_ceq_s(a_.lsx_f32, b_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.f32 == b_.f32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (a_.f32[i] == b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpeq_ps(a, b) simde_mm_cmpeq_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpeq_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmpeq_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_cmpeq_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_cmpeq_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - r_.u32[0] = (a_.f32[0] == b_.f32[0]) ? 
~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpeq_ss(a, b) simde_mm_cmpeq_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpge_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmpge_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcgeq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_ge(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpge(a_.altivec_f32, b_.altivec_f32)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcmp_cle_s(b_.lsx_f32, a_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 >= b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (a_.f32[i] >= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpge_ps(a, b) simde_mm_cmpge_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpge_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI) - return _mm_cmpge_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_cmpge_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_cmpge_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - r_.u32[0] = (a_.f32[0] >= b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpge_ss(a, b) simde_mm_cmpge_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpgt_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmpgt_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcgtq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_gt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpgt(a_.altivec_f32, b_.altivec_f32)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcmp_clt_s(b_.lsx_f32, a_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 > b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (a_.f32[i] > b_.f32[i]) ? 
~UINT32_C(0) : UINT32_C(0); - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpgt_ps(a, b) simde_mm_cmpgt_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpgt_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI) - return _mm_cmpgt_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_cmpgt_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_cmpgt_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - r_.u32[0] = (a_.f32[0] > b_.f32[0]) ? ~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpgt_ss(a, b) simde_mm_cmpgt_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmple_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmple_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcleq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_le(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmple(a_.altivec_f32, b_.altivec_f32)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcmp_cle_s(a_.lsx_f32, b_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 <= b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (a_.f32[i] <= b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmple_ps(a, b) simde_mm_cmple_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmple_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmple_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_cmple_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_cmple_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - r_.u32[0] = (a_.f32[0] <= b_.f32[0]) ? 
~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmple_ss(a, b) simde_mm_cmple_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmplt_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmplt_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcltq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_lt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmplt(a_.altivec_f32, b_.altivec_f32)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcmp_clt_s(a_.lsx_f32, b_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 < b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (a_.f32[i] < b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmplt_ps(a, b) simde_mm_cmplt_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmplt_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmplt_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_cmplt_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_cmplt_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - r_.u32[0] = (a_.f32[0] < b_.f32[0]) ? 
~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmplt_ss(a, b) simde_mm_cmplt_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpneq_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmpneq_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_ne(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_cmpeq(a_.altivec_f32, b_.altivec_f32)); - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), vec_nor(r_.altivec_f32, r_.altivec_f32)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcmp_cune_s(a_.lsx_f32, b_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.f32 != b_.f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (a_.f32[i] != b_.f32[i]) ? ~UINT32_C(0) : UINT32_C(0); - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpneq_ps(a, b) simde_mm_cmpneq_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpneq_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmpneq_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_cmpneq_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_cmpneq_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - r_.u32[0] = (a_.f32[0] != b_.f32[0]) ? 
~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpneq_ss(a, b) simde_mm_cmpneq_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpnge_ps (simde__m128 a, simde__m128 b) { - return simde_mm_cmplt_ps(a, b); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpnge_ps(a, b) simde_mm_cmpnge_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpnge_ss (simde__m128 a, simde__m128 b) { - return simde_mm_cmplt_ss(a, b); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpnge_ss(a, b) simde_mm_cmpnge_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpngt_ps (simde__m128 a, simde__m128 b) { - return simde_mm_cmple_ps(a, b); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpngt_ps(a, b) simde_mm_cmpngt_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpngt_ss (simde__m128 a, simde__m128 b) { - return simde_mm_cmple_ss(a, b); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpngt_ss(a, b) simde_mm_cmpngt_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpnle_ps (simde__m128 a, simde__m128 b) { - return simde_mm_cmpgt_ps(a, b); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpnle_ps(a, b) simde_mm_cmpnle_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpnle_ss (simde__m128 a, simde__m128 b) { - return simde_mm_cmpgt_ss(a, b); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpnle_ss(a, b) simde_mm_cmpnle_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpnlt_ps (simde__m128 a, simde__m128 b) { - return simde_mm_cmpge_ps(a, b); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpnlt_ps(a, b) simde_mm_cmpnlt_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpnlt_ss (simde__m128 a, simde__m128 b) { - return simde_mm_cmpge_ss(a, b); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpnlt_ss(a, b) simde_mm_cmpnlt_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpord_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmpord_ps(a, b); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_v128_and(wasm_f32x4_eq(a, a), wasm_f32x4_eq(b, b)); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - /* Note: NEON does not have ordered compare builtin - Need to compare a eq a and b eq b to check for NaN - Do AND of results to get final */ - uint32x4_t ceqaa = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t ceqbb = vceqq_f32(b_.neon_f32, b_.neon_f32); - r_.neon_u32 = vandq_u32(ceqaa, ceqbb); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_and(wasm_f32x4_eq(a_.wasm_v128, a_.wasm_v128), wasm_f32x4_eq(b_.wasm_v128, b_.wasm_v128)); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), - vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32), vec_cmpeq(b_.altivec_f32, b_.altivec_f32))); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcmp_cun_s(a_.lsx_f32, b_.lsx_f32); - r_.lsx_i64 = __lsx_vnor_v(r_.lsx_i64, 
r_.lsx_i64); - #elif defined(simde_math_isnanf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (simde_math_isnanf(a_.f32[i]) || simde_math_isnanf(b_.f32[i])) ? UINT32_C(0) : ~UINT32_C(0); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpord_ps(a, b) simde_mm_cmpord_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpunord_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmpunord_ps(a, b); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_v128_or(wasm_f32x4_ne(a, a), wasm_f32x4_ne(b, b)); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t ceqaa = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t ceqbb = vceqq_f32(b_.neon_f32, b_.neon_f32); - r_.neon_u32 = vmvnq_u32(vandq_u32(ceqaa, ceqbb)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_or(wasm_f32x4_ne(a_.wasm_v128, a_.wasm_v128), wasm_f32x4_ne(b_.wasm_v128, b_.wasm_v128)); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), - vec_nand(vec_cmpeq(a_.altivec_f32, a_.altivec_f32), vec_cmpeq(b_.altivec_f32, b_.altivec_f32))); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), - vec_and(vec_cmpeq(a_.altivec_f32, a_.altivec_f32), vec_cmpeq(b_.altivec_f32, b_.altivec_f32))); - r_.altivec_f32 = vec_nor(r_.altivec_f32, r_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vfcmp_cun_s(a_.lsx_f32, b_.lsx_f32); - #elif defined(simde_math_isnanf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = (simde_math_isnanf(a_.f32[i]) || simde_math_isnanf(b_.f32[i])) ? ~UINT32_C(0) : UINT32_C(0); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpunord_ps(a, b) simde_mm_cmpunord_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpunord_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI) - return _mm_cmpunord_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_cmpunord_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_cmpunord_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(simde_math_isnanf) - r_.u32[0] = (simde_math_isnanf(a_.f32[0]) || simde_math_isnanf(b_.f32[0])) ? 
~UINT32_C(0) : UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpunord_ss(a, b) simde_mm_cmpunord_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comieq_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_comieq_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, b_.neon_f32); - return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_extract_lane(a_.wasm_v128, 0) == wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #else - return a_.f32[0] == b_.f32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_comieq_ss(a, b) simde_mm_comieq_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comige_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_comige_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32); - return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_extract_lane(a_.wasm_v128, 0) >= wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #else - return a_.f32[0] >= b_.f32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_comige_ss(a, b) simde_mm_comige_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comigt_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_comigt_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32); - return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_extract_lane(a_.wasm_v128, 0) > wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #else - return a_.f32[0] > b_.f32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_comigt_ss(a, b) simde_mm_comigt_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comile_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_comile_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = 
vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32); - return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_extract_lane(a_.wasm_v128, 0) <= wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #else - return a_.f32[0] <= b_.f32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_comile_ss(a, b) simde_mm_comile_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comilt_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_comilt_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32); - return !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_extract_lane(a_.wasm_v128, 0) < wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #else - return a_.f32[0] < b_.f32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_comilt_ss(a, b) simde_mm_comilt_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comineq_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_comineq_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32)); - return !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_extract_lane(a_.wasm_v128, 0) != wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #else - return a_.f32[0] != b_.f32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_comineq_ss(a, b) simde_mm_comineq_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_copysign_ps(simde__m128 dest, simde__m128 src) { - simde__m128_private - r_, - dest_ = simde__m128_to_private(dest), - src_ = simde__m128_to_private(src); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const uint32x4_t sign_pos = vreinterpretq_u32_f32(vdupq_n_f32(-SIMDE_FLOAT32_C(0.0))); - r_.neon_u32 = vbslq_u32(sign_pos, src_.neon_u32, dest_.neon_u32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - const v128_t sign_pos = wasm_f32x4_splat(-0.0f); - r_.wasm_v128 = wasm_v128_bitselect(src_.wasm_v128, dest_.wasm_v128, sign_pos); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #if defined(SIMDE_BUG_VEC_CPSGN_REVERSED_ARGS) - r_.altivec_f32 = vec_cpsgn(dest_.altivec_f32, src_.altivec_f32); - #else - r_.altivec_f32 = vec_cpsgn(src_.altivec_f32, dest_.altivec_f32); - #endif - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - const SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) sign_pos = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), vec_splats(-0.0f)); - r_.altivec_f32 = vec_sel(dest_.altivec_f32, 
src_.altivec_f32, sign_pos); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - const v4f32 sign_pos = {-0.0f, -0.0f, -0.0f, -0.0f}; - r_.lsx_i64 = __lsx_vbitsel_v(dest_.lsx_i64, src_.lsx_i64, (v2i64)sign_pos); - #elif defined(SIMDE_IEEE754_STORAGE) - (void) src_; - (void) dest_; - simde__m128 sign_pos = simde_mm_set1_ps(-0.0f); - r_ = simde__m128_to_private(simde_mm_xor_ps(dest, simde_mm_and_ps(simde_mm_xor_ps(dest, src), sign_pos))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_copysignf(dest_.f32[i], src_.f32[i]); - } - #endif - - return simde__m128_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_xorsign_ps(simde__m128 dest, simde__m128 src) { - return simde_mm_xor_ps(simde_mm_and_ps(simde_mm_set1_ps(-0.0f), src), dest); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvt_pi2ps (simde__m128 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvt_pi2ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32), vget_high_f32(a_.neon_f32)); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, b_.i32); - r_.m64_private[1] = a_.m64_private[1]; - #else - r_.f32[0] = (simde_float32) b_.i32[0]; - r_.f32[1] = (simde_float32) b_.i32[1]; - r_.i32[2] = a_.i32[2]; - r_.i32[3] = a_.i32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvt_pi2ps(a, b) simde_mm_cvt_pi2ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cvt_ps2pi (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvt_ps2pi(a); - #else - simde__m64_private r_; - simde__m128_private a_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - a_ = simde__m128_to_private(simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION)); - r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32)); - #elif defined(SIMDE_CONVERT_VECTOR_) && SIMDE_NATURAL_VECTOR_SIZE_GE(128) && !defined(SIMDE_BUG_GCC_100761) - a_ = simde__m128_to_private(simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION)); - SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].f32); - #else - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, simde_math_nearbyintf(a_.f32[i])); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvt_ps2pi(a) simde_mm_cvt_ps2pi((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvt_si2ss (simde__m128 a, int32_t b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cvt_si2ss(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float, b), a_.neon_f32, 0); - #else - r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b); - r_.i32[1] = a_.i32[1]; - r_.i32[2] = a_.i32[2]; - r_.i32[3] = a_.i32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvt_si2ss(a, b) simde_mm_cvt_si2ss((a), b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_cvt_ss2si (simde__m128 a) { - #if 
defined(SIMDE_X86_SSE_NATIVE) - return _mm_cvt_ss2si(a); - #elif defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_BUG_GCC_95399) - return vgetq_lane_s32(vcvtnq_s32_f32(simde__m128_to_neon_f32(a)), 0); - #else - simde__m128_private a_ = simde__m128_to_private(simde_mm_round_ps(a, SIMDE_MM_FROUND_CUR_DIRECTION)); - #if !defined(SIMDE_FAST_CONVERSION_RANGE) - return ((a_.f32[0] > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && - (a_.f32[0] < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ? - SIMDE_CONVERT_FTOI(int32_t, a_.f32[0]) : INT32_MIN; - #else - return SIMDE_CONVERT_FTOI(int32_t, a_.f32[0]); - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvt_ss2si(a) simde_mm_cvt_ss2si((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtpi16_ps (simde__m64 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtpi16_ps(a); - #else - simde__m128_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcvtq_f32_s32(vmovl_s16(a_.neon_i16)); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.f32, a_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - simde_float32 v = a_.i16[i]; - r_.f32[i] = v; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtpi16_ps(a) simde_mm_cvtpi16_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtpi32_ps (simde__m128 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtpi32_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - simde__m64_private b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcombine_f32(vcvt_f32_s32(b_.neon_i32), vget_high_f32(a_.neon_f32)); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, b_.i32); - r_.m64_private[1] = a_.m64_private[1]; - #else - r_.f32[0] = (simde_float32) b_.i32[0]; - r_.f32[1] = (simde_float32) b_.i32[1]; - r_.i32[2] = a_.i32[2]; - r_.i32[3] = a_.i32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtpi32_ps(a, b) simde_mm_cvtpi32_ps((a), b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtpi32x2_ps (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtpi32x2_ps(a, b); - #else - simde__m128_private r_; - simde__m64_private - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcvtq_f32_s32(vcombine_s32(a_.neon_i32, b_.neon_i32)); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.m64_private[0].f32, a_.i32); - SIMDE_CONVERT_VECTOR_(r_.m64_private[1].f32, b_.i32); - #else - r_.f32[0] = (simde_float32) a_.i32[0]; - r_.f32[1] = (simde_float32) a_.i32[1]; - r_.f32[2] = (simde_float32) b_.i32[0]; - r_.f32[3] = (simde_float32) b_.i32[1]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtpi32x2_ps(a, b) simde_mm_cvtpi32x2_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtpi8_ps (simde__m64 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return 
_mm_cvtpi8_ps(a); - #else - simde__m128_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(a_.neon_i8)))); - #else - r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[0]); - r_.f32[1] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[1]); - r_.f32[2] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[2]); - r_.f32[3] = HEDLEY_STATIC_CAST(simde_float32, a_.i8[3]); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtpi8_ps(a) simde_mm_cvtpi8_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cvtps_pi16 (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtps_pi16(a); - #else - simde__m64_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95399) - r_.neon_i16 = vmovn_s32(vcvtq_s32_f32(vrndiq_f32(a_.neon_f32))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = SIMDE_CONVERT_FTOI(int16_t, simde_math_roundf(a_.f32[i])); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtps_pi16(a) simde_mm_cvtps_pi16((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cvtps_pi32 (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtps_pi32(a); - #else - simde__m64_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_BUG_GCC_95399) - r_.neon_i32 = vcvt_s32_f32(vget_low_f32(vrndiq_f32(a_.neon_f32))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - simde_float32 v = simde_math_roundf(a_.f32[i]); - #if !defined(SIMDE_FAST_CONVERSION_RANGE) - r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ? - SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; - #else - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v); - #endif - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtps_pi32(a) simde_mm_cvtps_pi32((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cvtps_pi8 (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtps_pi8(a); - #else - simde__m64_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && !defined(SIMDE_BUG_GCC_95471) - /* Clamp the input to [INT8_MIN, INT8_MAX], round, convert to i32, narrow to - * i16, combine with an all-zero vector of i16 (which will become the upper - * half), narrow to i8. 
*/ - float32x4_t max = vdupq_n_f32(HEDLEY_STATIC_CAST(simde_float32, INT8_MAX)); - float32x4_t min = vdupq_n_f32(HEDLEY_STATIC_CAST(simde_float32, INT8_MIN)); - float32x4_t values = vrndnq_f32(vmaxq_f32(vminq_f32(max, a_.neon_f32), min)); - r_.neon_i8 = vmovn_s16(vcombine_s16(vmovn_s32(vcvtq_s32_f32(values)), vdup_n_s16(0))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.f32) / sizeof(a_.f32[0])) ; i++) { - if (a_.f32[i] > HEDLEY_STATIC_CAST(simde_float32, INT8_MAX)) - r_.i8[i] = INT8_MAX; - else if (a_.f32[i] < HEDLEY_STATIC_CAST(simde_float32, INT8_MIN)) - r_.i8[i] = INT8_MIN; - else - r_.i8[i] = SIMDE_CONVERT_FTOI(int8_t, simde_math_roundf(a_.f32[i])); - } - /* Note: the upper half is undefined */ - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtps_pi8(a) simde_mm_cvtps_pi8((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtpu16_ps (simde__m64 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtpu16_ps(a); - #else - simde__m128_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcvtq_f32_u32(vmovl_u16(a_.neon_u16)); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.f32, a_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (simde_float32) a_.u16[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtpu16_ps(a) simde_mm_cvtpu16_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtpu8_ps (simde__m64 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtpu8_ps(a); - #else - simde__m128_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(a_.neon_u8)))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = HEDLEY_STATIC_CAST(simde_float32, a_.u8[i]); - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtpu8_ps(a) simde_mm_cvtpu8_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtsi32_ss (simde__m128 a, int32_t b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cvtsi32_ss(a, b); - #else - simde__m128_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float32_t, b), a_.neon_f32, 0); - #else - r_ = a_; - r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtsi32_ss(a, b) simde_mm_cvtsi32_ss((a), b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtsi64_ss (simde__m128 a, int64_t b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) - #if !defined(__PGI) - return _mm_cvtsi64_ss(a, b); - #else - return _mm_cvtsi64x_ss(a, b); - #endif - #else - simde__m128_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vsetq_lane_f32(HEDLEY_STATIC_CAST(float32_t, b), a_.neon_f32, 0); - #else - r_ = a_; - r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b); - #endif - - 
return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) -# define _mm_cvtsi64_ss(a, b) simde_mm_cvtsi64_ss((a), b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde_float32 -simde_mm_cvtss_f32 (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cvtss_f32(a); - #else - simde__m128_private a_ = simde__m128_to_private(a); - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vgetq_lane_f32(a_.neon_f32, 0); - #else - return a_.f32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtss_f32(a) simde_mm_cvtss_f32((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_cvtss_si32 (simde__m128 a) { - return simde_mm_cvt_ss2si(a); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtss_si32(a) simde_mm_cvtss_si32((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int64_t -simde_mm_cvtss_si64 (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) - #if !defined(__PGI) - return _mm_cvtss_si64(a); - #else - return _mm_cvtss_si64x(a); - #endif - #else - simde__m128_private a_ = simde__m128_to_private(a); - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return SIMDE_CONVERT_FTOI(int64_t, simde_math_roundf(vgetq_lane_f32(a_.neon_f32, 0))); - #else - return SIMDE_CONVERT_FTOI(int64_t, simde_math_roundf(a_.f32[0])); - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) -# define _mm_cvtss_si64(a) simde_mm_cvtss_si64((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cvtt_ps2pi (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtt_ps2pi(a); - #else - simde__m64_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) - r_.neon_i32 = vcvt_s32_f32(vget_low_f32(a_.neon_f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - simde_float32 v = a_.f32[i]; - #if !defined(SIMDE_FAST_CONVERSION_RANGE) - r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ? - SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; - #else - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v); - #endif - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_mm_cvttps_pi32(a) simde_mm_cvtt_ps2pi(a) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtt_ps2pi(a) simde_mm_cvtt_ps2pi((a)) -# define _mm_cvttps_pi32(a) simde_mm_cvttps_pi32((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_cvtt_ss2si (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cvtt_ss2si(a); - #else - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) - return SIMDE_CONVERT_FTOI(int32_t, vgetq_lane_f32(a_.neon_f32, 0)); - #else - simde_float32 v = a_.f32[0]; - #if !defined(SIMDE_FAST_CONVERSION_RANGE) - return ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ? 
- SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; - #else - return SIMDE_CONVERT_FTOI(int32_t, v); - #endif - #endif - #endif -} -#define simde_mm_cvttss_si32(a) simde_mm_cvtt_ss2si((a)) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cvtt_ss2si(a) simde_mm_cvtt_ss2si((a)) -# define _mm_cvttss_si32(a) simde_mm_cvtt_ss2si((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int64_t -simde_mm_cvttss_si64 (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(_MSC_VER) - #if defined(__PGI) - return _mm_cvttss_si64x(a); - #else - return _mm_cvttss_si64(a); - #endif - #else - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return SIMDE_CONVERT_FTOI(int64_t, vgetq_lane_f32(a_.neon_f32, 0)); - #else - return SIMDE_CONVERT_FTOI(int64_t, a_.f32[0]); - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) -# define _mm_cvttss_si64(a) simde_mm_cvttss_si64((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cmpord_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_cmpord_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_cmpord_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_cmpord_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(simde_math_isnanf) - r_.u32[0] = (simde_math_isnanf(simde_mm_cvtss_f32(a)) || simde_math_isnanf(simde_mm_cvtss_f32(b))) ? UINT32_C(0) : ~UINT32_C(0); - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.u32[i] = a_.u32[i]; - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_cmpord_ss(a, b) simde_mm_cmpord_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_div_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_div_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vdivq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x4_t recip0 = vrecpeq_f32(b_.neon_f32); - float32x4_t recip1 = vmulq_f32(recip0, vrecpsq_f32(recip0, b_.neon_f32)); - r_.neon_f32 = vmulq_f32(a_.neon_f32, recip1); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_div(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f32 = vec_div(a_.altivec_f32, b_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LASX_NATIVE) - r_.lsx_f32 = __lsx_vfdiv_s(a_.lsx_f32, b_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 / b_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i] / b_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_div_ps(a, b) simde_mm_div_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_div_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_div_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && 
defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_div_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_div_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32_t value = - vgetq_lane_f32(simde__m128_to_private(simde_mm_div_ps(a, b)).neon_f32, 0); - r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0); - #else - r_.f32[0] = a_.f32[0] / b_.f32[0]; - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_div_ss(a, b) simde_mm_div_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int16_t -simde_mm_extract_pi16 (simde__m64 a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m64_private a_ = simde__m64_to_private(a); - return a_.i16[imm8]; -} -#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(HEDLEY_PGI_VERSION) && !defined(SIMDE_BUG_CLANG_44589) - #define simde_mm_extract_pi16(a, imm8) HEDLEY_STATIC_CAST(int16_t, _mm_extract_pi16(a, imm8)) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_extract_pi16(a, imm8) vget_lane_s16(simde__m64_to_private(a).neon_i16, imm8) -#endif -#define simde_m_pextrw(a, imm8) simde_mm_extract_pi16(a, imm8) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_extract_pi16(a, imm8) simde_mm_extract_pi16((a), (imm8)) -# define _m_pextrw(a, imm8) simde_mm_extract_pi16((a), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_insert_pi16 (simde__m64 a, int16_t i, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m64_private - a_ = simde__m64_to_private(a); - - a_.i16[imm8] = i; - - return simde__m64_from_private(a_); -} -#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) && !defined(SIMDE_BUG_CLANG_44589) - #define simde_mm_insert_pi16(a, i, imm8) _mm_insert_pi16(a, i, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_insert_pi16(a, i, imm8) simde__m64_from_neon_i16(vset_lane_s16((i), simde__m64_to_neon_i16(a), (imm8))) -#endif -#define simde_m_pinsrw(a, i, imm8) (simde_mm_insert_pi16(a, i, imm8)) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_insert_pi16(a, i, imm8) simde_mm_insert_pi16(a, i, imm8) -# define _m_pinsrw(a, i, imm8) simde_mm_insert_pi16(a, i, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_load_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) { -#if defined(SIMDE_X86_SSE_NATIVE) - return _mm_load_ps(mem_addr); -#else - simde__m128_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vld1q_f32(mem_addr); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f32 = vec_vsx_ld(0, mem_addr); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = vec_ld(0, mem_addr); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vld(mem_addr, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load(mem_addr); - #else - simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128), sizeof(r_)); - #endif - - return simde__m128_from_private(r_); -#endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_load_ps(mem_addr) simde_mm_load_ps(mem_addr) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_load1_ps (simde_float32 const* mem_addr) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_load_ps1(mem_addr); - #else - simde__m128_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vld1q_dup_f32(mem_addr); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vldrepl_w(mem_addr, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load32_splat(mem_addr); - #else - r_ = simde__m128_to_private(simde_mm_set1_ps(*mem_addr)); - #endif - - return simde__m128_from_private(r_); - #endif -} -#define simde_mm_load_ps1(mem_addr) simde_mm_load1_ps(mem_addr) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_load_ps1(mem_addr) simde_mm_load1_ps(mem_addr) -# define _mm_load1_ps(mem_addr) simde_mm_load1_ps(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_load_ss (simde_float32 const* mem_addr) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_load_ss(mem_addr); - #else - simde__m128_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load32_zero(mem_addr); - #else - r_.f32[0] = *mem_addr; - r_.i32[1] = 0; - r_.i32[2] = 0; - r_.i32[3] = 0; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_load_ss(mem_addr) simde_mm_load_ss(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_loadh_pi (simde__m128 a, simde__m64 const* mem_addr) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_loadh_pi(a, HEDLEY_REINTERPRET_CAST(__m64 const*, mem_addr)); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcombine_f32(vget_low_f32(a_.neon_f32), vld1_f32(HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load64_lane(mem_addr, a_.wasm_v128, 1); - #else - simde__m64_private b_ = *HEDLEY_REINTERPRET_CAST(simde__m64_private const*, mem_addr); - r_.f32[0] = a_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = b_.f32[0]; - r_.f32[3] = b_.f32[1]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #if HEDLEY_HAS_WARNING("-Wold-style-cast") - #define _mm_loadh_pi(a, mem_addr) simde_mm_loadh_pi((a), HEDLEY_REINTERPRET_CAST(simde__m64 const*, (mem_addr))) - #else - #define _mm_loadh_pi(a, mem_addr) simde_mm_loadh_pi((a), (simde__m64 const*) (mem_addr)) - #endif -#endif - -/* The SSE documentation says that there are no alignment requirements - for mem_addr. Unfortunately they used the __m64 type for the argument - which is supposed to be 8-byte aligned, so some compilers (like clang - with -Wcast-align) will generate a warning if you try to cast, say, - a simde_float32* to a simde__m64* for this function. - - I think the choice of argument type is unfortunate, but I do think we - need to stick to it here. 
If there is demand I can always add something - like simde_x_mm_loadl_f32(simde__m128, simde_float32 mem_addr[2]) */ -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_loadl_pi (simde__m128 a, simde__m64 const* mem_addr) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_loadl_pi(a, HEDLEY_REINTERPRET_CAST(__m64 const*, mem_addr)); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcombine_f32(vld1_f32( - HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr)), vget_high_f32(a_.neon_f32)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load64_lane(mem_addr, a_.wasm_v128, 0); - #else - simde__m64_private b_; - simde_memcpy(&b_, mem_addr, sizeof(b_)); - r_.i32[0] = b_.i32[0]; - r_.i32[1] = b_.i32[1]; - r_.i32[2] = a_.i32[2]; - r_.i32[3] = a_.i32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #if HEDLEY_HAS_WARNING("-Wold-style-cast") - #define _mm_loadl_pi(a, mem_addr) simde_mm_loadl_pi((a), HEDLEY_REINTERPRET_CAST(simde__m64 const*, (mem_addr))) - #else - #define _mm_loadl_pi(a, mem_addr) simde_mm_loadl_pi((a), (simde__m64 const*) (mem_addr)) - #endif -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_loadr_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_loadr_ps(mem_addr); - #else - simde__m128_private - r_, - v_ = simde__m128_to_private(simde_mm_load_ps(mem_addr)); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vrev64q_f32(v_.neon_f32); - r_.neon_f32 = vextq_f32(r_.neon_f32, r_.neon_f32, 2); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && defined(__PPC64__) - r_.altivec_f32 = vec_reve(v_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vshuf4i_w(v_.lsx_i64, 0x1b); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, v_.f32, v_.f32, 3, 2, 1, 0); - #else - r_.f32[0] = v_.f32[3]; - r_.f32[1] = v_.f32[2]; - r_.f32[2] = v_.f32[1]; - r_.f32[3] = v_.f32[0]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_loadr_ps(mem_addr) simde_mm_loadr_ps(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_loadu_ps (simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)]) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_loadu_ps(mem_addr); - #else - simde__m128_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vld1q_f32(HEDLEY_REINTERPRET_CAST(const float32_t*, mem_addr)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load(mem_addr); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && defined(__PPC64__) - r_.altivec_f32 = vec_vsx_ld(0, mem_addr); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vld(mem_addr, 0); - #else - simde_memcpy(&r_, mem_addr, sizeof(r_)); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_loadu_ps(mem_addr) simde_mm_loadu_ps(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_maskmove_si64 (simde__m64 a, simde__m64 mask, int8_t* mem_addr) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - _mm_maskmove_si64(a, mask, HEDLEY_REINTERPRET_CAST(char*, mem_addr)); - #else - simde__m64_private - a_ = simde__m64_to_private(a), - mask_ = simde__m64_to_private(mask); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(a_.i8) / 
sizeof(a_.i8[0])) ; i++) - if (mask_.i8[i] < 0) - mem_addr[i] = a_.i8[i]; - #endif -} -#define simde_m_maskmovq(a, mask, mem_addr) simde_mm_maskmove_si64(a, mask, mem_addr) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_maskmove_si64(a, mask, mem_addr) simde_mm_maskmove_si64((a), (mask), SIMDE_CHECKED_REINTERPRET_CAST(int8_t*, char*, (mem_addr))) -# define _m_maskmovq(a, mask, mem_addr) simde_mm_maskmove_si64((a), (mask), SIMDE_CHECKED_REINTERPRET_CAST(int8_t*, char*, (mem_addr))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_max_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_max_pi16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vmax_s16(a_.neon_i16, b_.neon_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? a_.i16[i] : b_.i16[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pmaxsw(a, b) simde_mm_max_pi16(a, b) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_max_pi16(a, b) simde_mm_max_pi16(a, b) -# define _m_pmaxsw(a, b) simde_mm_max_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_max_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_max_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_FAST_NANS) - r_.neon_f32 = vmaxq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vbslq_f32(vcgtq_f32(a_.neon_f32, b_.neon_f32), a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) && defined(SIMDE_FAST_NANS) - r_.wasm_v128 = wasm_f32x4_max(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_bitselect(a_.wasm_v128, b_.wasm_v128, wasm_f32x4_gt(a_.wasm_v128, b_.wasm_v128)); - #elif (defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE)) && defined(SIMDE_FAST_NANS) - r_.altivec_f32 = vec_max(a_.altivec_f32, b_.altivec_f32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32, vec_cmpgt(a_.altivec_f32, b_.altivec_f32)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) && defined(SIMDE_FAST_NANS) - r_.lsx_f32 = __lsx_vfmax_s(a_.lsx_f32, b_.lsx_f32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (a_.f32[i] > b_.f32[i]) ? a_.f32[i] : b_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_max_ps(a, b) simde_mm_max_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_max_pu8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_max_pu8(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vmax_u8(a_.neon_u8, b_.neon_u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? 
a_.u8[i] : b_.u8[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pmaxub(a, b) simde_mm_max_pu8(a, b) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_max_pu8(a, b) simde_mm_max_pu8(a, b) -# define _m_pmaxub(a, b) simde_mm_max_pu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_max_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_max_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_max_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_max_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32_t value = vgetq_lane_f32(vmaxq_f32(a_.neon_f32, b_.neon_f32), 0); - r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0); - #else - r_.f32[0] = (a_.f32[0] > b_.f32[0]) ? a_.f32[0] : b_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_max_ss(a, b) simde_mm_max_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_min_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_min_pi16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vmin_s16(a_.neon_i16, b_.neon_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pminsw(a, b) simde_mm_min_pi16(a, b) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_min_pi16(a, b) simde_mm_min_pi16(a, b) -# define _m_pminsw(a, b) simde_mm_min_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_min_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_min_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_FAST_NANS) && defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vminq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_pmin(b_.wasm_v128, a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - #if defined(SIMDE_FAST_NANS) - r_.altivec_f32 = vec_min(a_.altivec_f32, b_.altivec_f32); - #else - r_.altivec_f32 = vec_sel(b_.altivec_f32, a_.altivec_f32, vec_cmpgt(b_.altivec_f32, a_.altivec_f32)); - #endif - #elif defined(SIMDE_FAST_NANS) && defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f32 = __lsx_vfmin_s(a_.lsx_f32, b_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - uint32_t SIMDE_VECTOR(16) m = HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f32 < b_.f32); - r_.f32 = - HEDLEY_REINTERPRET_CAST( - __typeof__(r_.f32), - ( (HEDLEY_REINTERPRET_CAST(__typeof__(m), a_.f32) & m) | - (HEDLEY_REINTERPRET_CAST(__typeof__(m), b_.f32) & ~m) - ) - ); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (a_.f32[i] < b_.f32[i]) ? 
a_.f32[i] : b_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_min_ps(a, b) simde_mm_min_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_min_pu8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_min_pu8(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vmin_u8(a_.neon_u8, b_.neon_u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? a_.u8[i] : b_.u8[i]; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pminub(a, b) simde_mm_min_pu8(a, b) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_min_pu8(a, b) simde_mm_min_pu8(a, b) -# define _m_pminub(a, b) simde_mm_min_pu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_min_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_min_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_min_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_min_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32_t value = vgetq_lane_f32(vminq_f32(a_.neon_f32, b_.neon_f32), 0); - r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0); - #else - r_.f32[0] = (a_.f32[0] < b_.f32[0]) ? a_.f32[0] : b_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_min_ss(a, b) simde_mm_min_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_movehl_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_movehl_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vzip2q_u64(b_.neon_u64, a_.neon_u64); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x2_t a32 = vget_high_f32(a_.neon_f32); - float32x2_t b32 = vget_high_f32(b_.neon_f32); - r_.neon_f32 = vcombine_f32(b32, a32); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), - vec_mergel(b_.altivec_i64, a_.altivec_i64)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vilvh_d(a_.lsx_i64, b_.lsx_i64); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 6, 7, 2, 3); - #else - r_.f32[0] = b_.f32[2]; - r_.f32[1] = b_.f32[3]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_movehl_ps(a, b) simde_mm_movehl_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_movelh_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_movelh_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = 
simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x2_t a10 = vget_low_f32(a_.neon_f32); - float32x2_t b10 = vget_low_f32(b_.neon_f32); - r_.neon_f32 = vcombine_f32(a10, b10); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), - vec_mergeh(a_.altivec_i64, b_.altivec_i64)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vilvl_d(b_.lsx_i64, a_.lsx_i64); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 1, 4, 5); - #else - r_.f32[0] = a_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = b_.f32[0]; - r_.f32[3] = b_.f32[1]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_movelh_ps(a, b) simde_mm_movelh_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_movemask_pi8 (simde__m64 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_movemask_pi8(a); - #else - simde__m64_private a_ = simde__m64_to_private(a); - int r = 0; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint8x8_t input = a_.neon_u8; - const int8_t xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0}; - const uint8x8_t mask_and = vdup_n_u8(0x80); - const int8x8_t mask_shift = vld1_s8(xr); - const uint8x8_t mask_result = vshl_u8(vand_u8(input, mask_and), mask_shift); - uint8x8_t lo = mask_result; - r = vaddv_u8(lo); - #else - const size_t nmemb = sizeof(a_.i8) / sizeof(a_.i8[0]); - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < nmemb ; i++) { - r |= (a_.u8[nmemb - 1 - i] >> 7) << (nmemb - 1 - i); - } - #endif - - return r; - #endif -} -#define simde_m_pmovmskb(a) simde_mm_movemask_pi8(a) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_movemask_pi8(a) simde_mm_movemask_pi8(a) -# define _m_pmovmskb(a) simde_mm_movemask_pi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_movemask_ps (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_movemask_ps(a); - #else - int r = 0; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - static const int32_t shift[4] = {0, 1, 2, 3}; - uint32x4_t tmp = vshrq_n_u32(a_.neon_u32, 31); - return HEDLEY_STATIC_CAST(int32_t, vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)))); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - // Shift out everything but the sign bits with a 32-bit unsigned shift right. - uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(a_.neon_u32, 31)); - // Merge the two pairs together with a 64-bit unsigned shift right + add. - uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); - // Extract the result. 
- return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && defined(SIMDE_BUG_CLANG_50932) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) idx = { 96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) res = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_bperm(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned __int128), a_.altivec_u64), idx)); - return HEDLEY_STATIC_CAST(int32_t, vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), res), 2)); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) idx = { 96, 64, 32, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) res = vec_bperm(a_.altivec_u8, idx); - return HEDLEY_STATIC_CAST(int32_t, vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), res), 2)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - v2i64 t64 = __lsx_vmskltz_w(a_.lsx_i64); - r = __lsx_vpickve2gr_wu(t64, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return HEDLEY_STATIC_CAST(int32_t, wasm_i32x4_bitmask(a_.wasm_v128)); - #else - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < sizeof(a_.u32) / sizeof(a_.u32[0]) ; i++) { - r |= (a_.u32[i] >> ((sizeof(a_.u32[i]) * CHAR_BIT) - 1)) << i; - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_movemask_ps(a) simde_mm_movemask_ps((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mul_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_mul_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vmulq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_mul(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 * b_.f32; - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f32 = vec_mul(a_.altivec_f32, b_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f32 = __lsx_vfmul_s(a_.lsx_f32, b_.lsx_f32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i] * b_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_mul_ps(a, b) simde_mm_mul_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_mul_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_mul_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_mul_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_mul_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - r_.f32[0] = a_.f32[0] * b_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_mul_ss(a, b) simde_mm_mul_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_mulhi_pu16 (simde__m64 a, simde__m64 b) { - #if 
defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_mulhi_pu16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const uint32x4_t t1 = vmull_u16(a_.neon_u16, b_.neon_u16); - const uint32x4_t t2 = vshrq_n_u32(t1, 16); - const uint16x4_t t3 = vmovn_u32(t2); - r_.neon_u16 = t3; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, ((HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i])) >> UINT32_C(16))); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_mulhi_pu16(a, b) simde_mm_mulhi_pu16(a, b) -# define _m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b) -#endif - -#if defined(SIMDE_X86_SSE_NATIVE) && defined(HEDLEY_GCC_VERSION) - #define SIMDE_MM_HINT_NTA HEDLEY_STATIC_CAST(enum _mm_hint, 0) - #define SIMDE_MM_HINT_T0 HEDLEY_STATIC_CAST(enum _mm_hint, 1) - #define SIMDE_MM_HINT_T1 HEDLEY_STATIC_CAST(enum _mm_hint, 2) - #define SIMDE_MM_HINT_T2 HEDLEY_STATIC_CAST(enum _mm_hint, 3) - #define SIMDE_MM_HINT_ENTA HEDLEY_STATIC_CAST(enum _mm_hint, 4) - #define SIMDE_MM_HINT_ET0 HEDLEY_STATIC_CAST(enum _mm_hint, 5) - #define SIMDE_MM_HINT_ET1 HEDLEY_STATIC_CAST(enum _mm_hint, 6) - #define SIMDE_MM_HINT_ET2 HEDLEY_STATIC_CAST(enum _mm_hint, 7) -#else - #define SIMDE_MM_HINT_NTA 0 - #define SIMDE_MM_HINT_T0 1 - #define SIMDE_MM_HINT_T1 2 - #define SIMDE_MM_HINT_T2 3 - #define SIMDE_MM_HINT_ENTA 4 - #define SIMDE_MM_HINT_ET0 5 - #define SIMDE_MM_HINT_ET1 6 - #define SIMDE_MM_HINT_ET2 7 -#endif - -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - HEDLEY_DIAGNOSTIC_PUSH - #if HEDLEY_HAS_WARNING("-Wreserved-id-macro") - _Pragma("clang diagnostic ignored \"-Wreserved-id-macro\"") - #endif - #undef _MM_HINT_NTA - #define _MM_HINT_NTA SIMDE_MM_HINT_NTA - #undef _MM_HINT_T0 - #define _MM_HINT_T0 SIMDE_MM_HINT_T0 - #undef _MM_HINT_T1 - #define _MM_HINT_T1 SIMDE_MM_HINT_T1 - #undef _MM_HINT_T2 - #define _MM_HINT_T2 SIMDE_MM_HINT_T2 - #undef _MM_HINT_ENTA - #define _MM_HINT_ETNA SIMDE_MM_HINT_ENTA - #undef _MM_HINT_ET0 - #define _MM_HINT_ET0 SIMDE_MM_HINT_ET0 - #undef _MM_HINT_ET1 - #define _MM_HINT_ET1 SIMDE_MM_HINT_ET1 - #undef _MM_HINT_ET1 - #define _MM_HINT_ET2 SIMDE_MM_HINT_ET2 - HEDLEY_DIAGNOSTIC_POP -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_prefetch (const void* p, int i) { - #if \ - HEDLEY_HAS_BUILTIN(__builtin_prefetch) || \ - HEDLEY_GCC_VERSION_CHECK(3,4,0) || \ - HEDLEY_INTEL_VERSION_CHECK(13,0,0) - switch(i) { - case SIMDE_MM_HINT_NTA: - __builtin_prefetch(p, 0, 0); - break; - case SIMDE_MM_HINT_T0: - __builtin_prefetch(p, 0, 3); - break; - case SIMDE_MM_HINT_T1: - __builtin_prefetch(p, 0, 2); - break; - case SIMDE_MM_HINT_T2: - __builtin_prefetch(p, 0, 1); - break; - case SIMDE_MM_HINT_ENTA: - __builtin_prefetch(p, 1, 0); - break; - case SIMDE_MM_HINT_ET0: - __builtin_prefetch(p, 1, 3); - break; - case SIMDE_MM_HINT_ET1: - __builtin_prefetch(p, 1, 2); - break; - case SIMDE_MM_HINT_ET2: - __builtin_prefetch(p, 0, 1); - break; - } - #elif defined(__ARM_ACLE) - #if (__ARM_ACLE >= 101) - switch(i) { - case SIMDE_MM_HINT_NTA: - __pldx(0, 0, 1, p); - break; - case SIMDE_MM_HINT_T0: - __pldx(0, 0, 0, p); - break; - case SIMDE_MM_HINT_T1: - __pldx(0, 1, 0, p); - break; - case SIMDE_MM_HINT_T2: - __pldx(0, 2, 0, p); - break; - 
case SIMDE_MM_HINT_ENTA: - __pldx(1, 0, 1, p); - break; - case SIMDE_MM_HINT_ET0: - __pldx(1, 0, 0, p); - break; - case SIMDE_MM_HINT_ET1: - __pldx(1, 1, 0, p); - break; - case SIMDE_MM_HINT_ET2: - __pldx(1, 2, 0, p); - break; - } - #else - (void) i; - __pld(p) - #endif - #elif HEDLEY_PGI_VERSION_CHECK(10,0,0) - (void) i; - #pragma mem prefetch p - #elif HEDLEY_CRAY_VERSION_CHECK(8,1,0) - switch (i) { - case SIMDE_MM_HINT_NTA: - #pragma _CRI prefetch (nt) p - break; - case SIMDE_MM_HINT_T0: - case SIMDE_MM_HINT_T1: - case SIMDE_MM_HINT_T2: - #pragma _CRI prefetch p - break; - case SIMDE_MM_HINT_ENTA: - #pragma _CRI prefetch (write, nt) p - break; - case SIMDE_MM_HINT_ET0: - case SIMDE_MM_HINT_ET1: - case SIMDE_MM_HINT_ET2: - #pragma _CRI prefetch (write) p - break; - } - #elif HEDLEY_IBM_VERSION_CHECK(11,0,0) - switch(i) { - case SIMDE_MM_HINT_NTA: - __prefetch_by_load(p, 0, 0); - break; - case SIMDE_MM_HINT_T0: - __prefetch_by_load(p, 0, 3); - break; - case SIMDE_MM_HINT_T1: - __prefetch_by_load(p, 0, 2); - break; - case SIMDE_MM_HINT_T2: - __prefetch_by_load(p, 0, 1); - break; - case SIMDE_MM_HINT_ENTA: - __prefetch_by_load(p, 1, 0); - break; - case SIMDE_MM_HINT_ET0: - __prefetch_by_load(p, 1, 3); - break; - case SIMDE_MM_HINT_ET1: - __prefetch_by_load(p, 1, 2); - break; - case SIMDE_MM_HINT_ET2: - __prefetch_by_load(p, 0, 1); - break; - } - #elif HEDLEY_MSVC_VERSION - (void) i; - (void) p; - #endif -} -#if defined(SIMDE_X86_SSE_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(10,0,0) /* https://reviews.llvm.org/D71718 */ - #define simde_mm_prefetch(p, i) \ - (__extension__({ \ - HEDLEY_DIAGNOSTIC_PUSH \ - HEDLEY_DIAGNOSTIC_DISABLE_CAST_QUAL \ - _mm_prefetch((p), (i)); \ - HEDLEY_DIAGNOSTIC_POP \ - })) - #else - #define simde_mm_prefetch(p, i) _mm_prefetch(p, i) - #endif -#endif -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) - #define _mm_prefetch(p, i) simde_mm_prefetch(p, i) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_negate_ps(simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return simde_mm_xor_ps(a, _mm_set1_ps(SIMDE_FLOAT32_C(-0.0))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vnegq_f32(a_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_neg(a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_f32 = vec_neg(a_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - const v4f32 f32 = {0.0f, 0.0f, 0.0f, 0.0f}; - r_.lsx_f32 = __lsx_vfsub_s(f32, a_.lsx_f32); - #elif defined(SIMDE_VECTOR_NEGATE) - r_.f32 = -a_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = -a_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_rcp_ps (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_rcp_ps(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x4_t recip = vrecpeq_f32(a_.neon_f32); - - #if SIMDE_ACCURACY_PREFERENCE > 0 - for (int i = 0; i < SIMDE_ACCURACY_PREFERENCE ; ++i) { - recip = vmulq_f32(recip, vrecpsq_f32(recip, a_.neon_f32)); - } - #endif - - r_.neon_f32 = recip; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_div(simde_mm_set1_ps(1.0f), a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = vec_re(a_.altivec_f32); - #elif 
defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f32 = __lsx_vfrecip_s(a_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.f32 = 1.0f / a_.f32; - #elif defined(SIMDE_IEEE754_STORAGE) - /* https://stackoverflow.com/questions/12227126/division-as-multiply-and-lut-fast-float-division-reciprocal/12228234#12228234 */ - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - int32_t ix; - simde_float32 fx = a_.f32[i]; - simde_memcpy(&ix, &fx, sizeof(ix)); - int32_t x = INT32_C(0x7EF311C3) - ix; - simde_float32 temp; - simde_memcpy(&temp, &x, sizeof(temp)); - r_.f32[i] = temp * (SIMDE_FLOAT32_C(2.0) - temp * fx); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = 1.0f / a_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_rcp_ps(a) simde_mm_rcp_ps((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_rcp_ss (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_rcp_ss(a); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_rcp_ps(a)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_rcp_ps(simde_x_mm_broadcastlow_ps(a))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - r_.f32[0] = 1.0f / a_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_rcp_ss(a) simde_mm_rcp_ss((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_rsqrt_ps (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_rsqrt_ps(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vrsqrteq_f32(a_.neon_f32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = vec_rsqrte(a_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f32 = __lsx_vfrsqrt_s(a_.lsx_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_div(simde_mm_set1_ps(1.0f), wasm_f32x4_sqrt(a_.wasm_v128)); - #elif defined(SIMDE_IEEE754_STORAGE) - /* https://basesandframes.files.wordpress.com/2020/04/even_faster_math_functions_green_2020.pdf - Pages 100 - 103 */ - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - #if SIMDE_ACCURACY_PREFERENCE <= 0 - r_.i32[i] = INT32_C(0x5F37624F) - (a_.i32[i] >> 1); - #else - simde_float32 x = a_.f32[i]; - simde_float32 xhalf = SIMDE_FLOAT32_C(0.5) * x; - int32_t ix; - - simde_memcpy(&ix, &x, sizeof(ix)); - - #if SIMDE_ACCURACY_PREFERENCE == 1 - ix = INT32_C(0x5F375A82) - (ix >> 1); - #else - ix = INT32_C(0x5F37599E) - (ix >> 1); - #endif - - simde_memcpy(&x, &ix, sizeof(x)); - - #if SIMDE_ACCURACY_PREFERENCE >= 2 - x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x); - #endif - x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x); - - r_.f32[i] = x; - #endif - } - #elif defined(simde_math_sqrtf) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = 1.0f / simde_math_sqrtf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_rsqrt_ps(a) simde_mm_rsqrt_ps((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 
-simde_mm_rsqrt_ss (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_rsqrt_ss(a); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_rsqrt_ps(a)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_rsqrt_ps(simde_x_mm_broadcastlow_ps(a))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vsetq_lane_f32(vgetq_lane_f32(simde_mm_rsqrt_ps(a).neon_f32, 0), a_.neon_f32, 0); - #elif defined(SIMDE_IEEE754_STORAGE) - { - #if SIMDE_ACCURACY_PREFERENCE <= 0 - r_.i32[0] = INT32_C(0x5F37624F) - (a_.i32[0] >> 1); - #else - simde_float32 x = a_.f32[0]; - simde_float32 xhalf = SIMDE_FLOAT32_C(0.5) * x; - int32_t ix; - - simde_memcpy(&ix, &x, sizeof(ix)); - - #if SIMDE_ACCURACY_PREFERENCE == 1 - ix = INT32_C(0x5F375A82) - (ix >> 1); - #else - ix = INT32_C(0x5F37599E) - (ix >> 1); - #endif - - simde_memcpy(&x, &ix, sizeof(x)); - - #if SIMDE_ACCURACY_PREFERENCE >= 2 - x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x); - #endif - x = x * (SIMDE_FLOAT32_C(1.5008909) - xhalf * x * x); - - r_.f32[0] = x; - #endif - } - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - #elif defined(simde_math_sqrtf) - r_.f32[0] = 1.0f / simde_math_sqrtf(a_.f32[0]); - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_rsqrt_ss(a) simde_mm_rsqrt_ss((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sad_pu8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_sad_pu8(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint64x1_t t = vpaddl_u32(vpaddl_u16(vpaddl_u8(vabd_u8(a_.neon_u8, b_.neon_u8)))); - r_.neon_u16 = vset_lane_u16(HEDLEY_STATIC_CAST(uint64_t, vget_lane_u64(t, 0)), vdup_n_u16(0), 0); - #else - uint16_t sum = 0; - - SIMDE_VECTORIZE_REDUCTION(+:sum) - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - sum += HEDLEY_STATIC_CAST(uint8_t, simde_math_abs(a_.u8[i] - b_.u8[i])); - } - - r_.i16[0] = HEDLEY_STATIC_CAST(int16_t, sum); - r_.i16[1] = 0; - r_.i16[2] = 0; - r_.i16[3] = 0; - #endif - - return simde__m64_from_private(r_); - #endif -} -#define simde_m_psadbw(a, b) simde_mm_sad_pu8(a, b) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_sad_pu8(a, b) simde_mm_sad_pu8(a, b) -# define _m_psadbw(a, b) simde_mm_sad_pu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_set_ss (simde_float32 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_set_ss(a); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vsetq_lane_f32(a, vdupq_n_f32(SIMDE_FLOAT32_C(0.0)), 0); - #else - return simde_mm_set_ps(SIMDE_FLOAT32_C(0.0), SIMDE_FLOAT32_C(0.0), SIMDE_FLOAT32_C(0.0), a); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_set_ss(a) simde_mm_set_ss(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_setr_ps (simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_setr_ps(e3, e2, e1, e0); - #else - return simde_mm_set_ps(e0, e1, e2, e3); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_setr_ps(e3, 
e2, e1, e0) simde_mm_setr_ps(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_setzero_ps (void) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_setzero_ps(); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vdupq_n_f32(SIMDE_FLOAT32_C(0.0)); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - return vec_splats(SIMDE_FLOAT32_C(0.0)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f32x4_const(0.f, 0.f, 0.f, 0.f); - #else - simde__m128 r; - simde_memset(&r, 0, sizeof(r)); - return r; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_setzero_ps() simde_mm_setzero_ps() -#endif - -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_undefined_ps (void) { - simde__m128_private r_; - - #if defined(SIMDE_HAVE_UNDEFINED128) - r_.n = _mm_undefined_ps(); - #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - r_ = simde__m128_to_private(simde_mm_setzero_ps()); - #endif - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_undefined_ps() simde_mm_undefined_ps() -#endif - -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) -HEDLEY_DIAGNOSTIC_POP -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_setone_ps (void) { - simde__m128 t = simde_mm_setzero_ps(); - return simde_mm_cmpeq_ps(t, t); -} - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_sfence (void) { - /* TODO: Use Hedley. */ - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_sfence(); - #elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)) - __atomic_thread_fence(__ATOMIC_SEQ_CST); - #elif !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) - #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ < 9) - __atomic_thread_fence(__ATOMIC_SEQ_CST); - #else - atomic_thread_fence(memory_order_seq_cst); - #endif - #elif defined(_MSC_VER) - MemoryBarrier(); - #elif HEDLEY_HAS_EXTENSION(c_atomic) - __c11_atomic_thread_fence(__ATOMIC_SEQ_CST); - #elif defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) - __sync_synchronize(); - #elif defined(_OPENMP) - #pragma omp critical(simde_mm_sfence_) - { } - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_sfence() simde_mm_sfence() -#endif - -#define SIMDE_MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _MM_SHUFFLE(z, y, x, w) SIMDE_MM_SHUFFLE(z, y, x, w) -#endif - -#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) -# define simde_mm_shuffle_pi16(a, imm8) _mm_shuffle_pi16(a, imm8) -#elif defined(SIMDE_SHUFFLE_VECTOR_) -# define simde_mm_shuffle_pi16(a, imm8) (__extension__ ({ \ - const simde__m64_private simde_tmp_a_ = simde__m64_to_private(a); \ - simde__m64_from_private((simde__m64_private) { .i16 = \ - SIMDE_SHUFFLE_VECTOR_(16, 8, \ - (simde_tmp_a_).i16, \ - (simde_tmp_a_).i16, \ - (((imm8) ) & 3), \ - (((imm8) >> 2) & 3), \ - (((imm8) >> 4) & 3), \ - (((imm8) >> 6) & 3)) }); })) -#else -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_shuffle_pi16 (simde__m64 a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m64_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - for (size_t i = 0 ; i < sizeof(r_.i16) / sizeof(r_.i16[0]) ; i++) { - r_.i16[i] = a_.i16[(imm8 
>> (i * 2)) & 3]; - } - -HEDLEY_DIAGNOSTIC_PUSH -#if HEDLEY_HAS_WARNING("-Wconditional-uninitialized") -# pragma clang diagnostic ignored "-Wconditional-uninitialized" -#endif - return simde__m64_from_private(r_); -HEDLEY_DIAGNOSTIC_POP -} -#endif -#if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) -# define simde_m_pshufw(a, imm8) _m_pshufw(a, imm8) -#else -# define simde_m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8) -#endif -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_shuffle_pi16(a, imm8) simde_mm_shuffle_pi16(a, imm8) -# define _m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_shuffle_ps (simde__m128 a, simde__m128 b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - r_.f32[0] = a_.f32[(imm8 >> 0) & 3]; - r_.f32[1] = a_.f32[(imm8 >> 2) & 3]; - r_.f32[2] = b_.f32[(imm8 >> 4) & 3]; - r_.f32[3] = b_.f32[(imm8 >> 6) & 3]; - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_SSE_NATIVE) && !defined(__PGI) -# define simde_mm_shuffle_ps(a, b, imm8) _mm_shuffle_ps(a, b, imm8) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_shuffle_ps(a, b, imm8) (__extension__ ({ \ - simde__m128_from_private((simde__m128_private) { .wasm_v128 = \ - wasm_i32x4_shuffle( \ - simde__m128_to_private(a).wasm_v128, \ - simde__m128_to_private(b).wasm_v128, \ - (((imm8) ) & 3), \ - (((imm8) >> 2) & 3), \ - (((imm8) >> 4) & 3) + 4, \ - (((imm8) >> 6) & 3) + 4) }); })) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm_shuffle_ps(a, b, imm8) \ - (__extension__({ \ - float32x4_t simde_mm_shuffle_ps_a_ = simde__m128_to_neon_f32(a); \ - float32x4_t simde_mm_shuffle_ps_b_ = simde__m128_to_neon_f32(b); \ - float32x4_t simde_mm_shuffle_ps_r_; \ - \ - simde_mm_shuffle_ps_r_ = vmovq_n_f32(vgetq_lane_f32(simde_mm_shuffle_ps_a_, (imm8) & (0x3))); \ - simde_mm_shuffle_ps_r_ = vsetq_lane_f32(vgetq_lane_f32(simde_mm_shuffle_ps_a_, ((imm8) >> 2) & 0x3), simde_mm_shuffle_ps_r_, 1); \ - simde_mm_shuffle_ps_r_ = vsetq_lane_f32(vgetq_lane_f32(simde_mm_shuffle_ps_b_, ((imm8) >> 4) & 0x3), simde_mm_shuffle_ps_r_, 2); \ - vsetq_lane_f32(vgetq_lane_f32(simde_mm_shuffle_ps_b_, ((imm8) >> 6) & 0x3), simde_mm_shuffle_ps_r_, 3); \ - })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_shuffle_ps(a, b, imm8) (__extension__ ({ \ - simde__m128_from_private((simde__m128_private) { .f32 = \ - SIMDE_SHUFFLE_VECTOR_(32, 16, \ - simde__m128_to_private(a).f32, \ - simde__m128_to_private(b).f32, \ - (((imm8) ) & 3), \ - (((imm8) >> 2) & 3), \ - (((imm8) >> 4) & 3) + 4, \ - (((imm8) >> 6) & 3) + 4) }); })) -#endif -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_shuffle_ps(a, b, imm8) simde_mm_shuffle_ps((a), (b), imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_sqrt_ps (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_sqrt_ps(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vsqrtq_f32(a_.neon_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x4_t est = vrsqrteq_f32(a_.neon_f32); - for (int i = 0 ; i <= SIMDE_ACCURACY_PREFERENCE ; i++) { - est = vmulq_f32(vrsqrtsq_f32(vmulq_f32(a_.neon_f32, est), est), est); - } - r_.neon_f32 = vmulq_f32(a_.neon_f32, est); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - 
r_.wasm_v128 = wasm_f32x4_sqrt(a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_14_NATIVE) - r_.altivec_f32 = vec_sqrt(a_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f32 = __lsx_vfsqrt_s(a_.lsx_f32); - #elif defined(simde_math_sqrt) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < sizeof(r_.f32) / sizeof(r_.f32[0]) ; i++) { - r_.f32[i] = simde_math_sqrtf(a_.f32[i]); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_sqrt_ps(a) simde_mm_sqrt_ps((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_sqrt_ss (simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_sqrt_ss(a); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_sqrt_ps(a)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_sqrt_ps(simde_x_mm_broadcastlow_ps(a))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32_t value = - vgetq_lane_f32(simde__m128_to_private(simde_mm_sqrt_ps(a)).neon_f32, 0); - r_.neon_f32 = vsetq_lane_f32(value, a_.neon_f32, 0); - #elif defined(simde_math_sqrtf) - r_.f32[0] = simde_math_sqrtf(a_.f32[0]); - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_sqrt_ss(a) simde_mm_sqrt_ss((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_store_ps (simde_float32 mem_addr[4], simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_store_ps(mem_addr, a); - #else - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1q_f32(mem_addr, a_.neon_f32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - vec_st(a_.altivec_f32, 0, mem_addr); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - wasm_v128_store(mem_addr, a_.wasm_v128); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - __lsx_vst(a_.lsx_f32, mem_addr, 0); - #else - simde_memcpy(mem_addr, &a_, sizeof(a)); - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_store_ps(mem_addr, a) simde_mm_store_ps(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_store1_ps (simde_float32 mem_addr[4], simde__m128 a) { - simde_float32* mem_addr_ = SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128); - - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_store_ps1(mem_addr_, a); - #else - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1q_f32(mem_addr_, vdupq_lane_f32(vget_low_f32(a_.neon_f32), 0)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - wasm_v128_store(mem_addr_, wasm_i32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0, 0, 0)); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - vec_st(vec_splat(a_.altivec_f32, 0), 0, mem_addr_); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - __lsx_vst(__lsx_vreplvei_w(a_.lsx_f32, 0), mem_addr_, 0); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - simde__m128_private tmp_; - tmp_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 0, 0); - simde_mm_store_ps(mem_addr_, tmp_.f32); - #else - SIMDE_VECTORIZE_ALIGNED(mem_addr_:16) - for (size_t i = 0 ; i < sizeof(a_.f32) / sizeof(a_.f32[0]) ; i++) { - mem_addr_[i] = a_.f32[0]; - } - #endif - 
#endif -} -#define simde_mm_store_ps1(mem_addr, a) simde_mm_store1_ps(mem_addr, a) -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_store_ps1(mem_addr, a) simde_mm_store1_ps(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a)) -# define _mm_store1_ps(mem_addr, a) simde_mm_store1_ps(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_store_ss (simde_float32* mem_addr, simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_store_ss(mem_addr, a); - #else - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1q_lane_f32(mem_addr, a_.neon_f32, 0); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - __lsx_vstelm_w(a_.lsx_f32, mem_addr, 0, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - wasm_v128_store32_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 0); - #else - *mem_addr = a_.f32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_store_ss(mem_addr, a) simde_mm_store_ss(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storeh_pi (simde__m64* mem_addr, simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_storeh_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a); - #else - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1_f32(HEDLEY_REINTERPRET_CAST(float32_t*, mem_addr), vget_high_f32(a_.neon_f32)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 1); - #else - simde_memcpy(mem_addr, &(a_.m64[1]), sizeof(a_.m64[1])); - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_storeh_pi(mem_addr, a) simde_mm_storeh_pi(mem_addr, (a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storel_pi (simde__m64* mem_addr, simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_storel_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - simde__m128_private a_ = simde__m128_to_private(a); - wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 0); - #else - simde__m64_private* dest_ = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr); - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - dest_->neon_f32 = vget_low_f32(a_.neon_f32); - #else - dest_->f32[0] = a_.f32[0]; - dest_->f32[1] = a_.f32[1]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_storel_pi(mem_addr, a) simde_mm_storel_pi(mem_addr, (a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storer_ps (simde_float32 mem_addr[4], simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_storer_ps(mem_addr, a); - #else - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - vec_st(vec_reve(a_.altivec_f32), 0, mem_addr); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x4_t tmp = vrev64q_f32(a_.neon_f32); - vst1q_f32(mem_addr, vextq_f32(tmp, tmp, 2)); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - __lsx_vst(__lsx_vshuf4i_w(a_.lsx_f32, 0x1b), mem_addr, 0); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - a_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 3, 2, 1, 0); - simde_mm_store_ps(mem_addr, simde__m128_from_private(a_)); - #else - SIMDE_VECTORIZE_ALIGNED(mem_addr:16) - for (size_t i = 0 ; i < sizeof(a_.f32) / 
sizeof(a_.f32[0]) ; i++) { - mem_addr[i] = a_.f32[((sizeof(a_.f32) / sizeof(a_.f32[0])) - 1) - i]; - } - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_storer_ps(mem_addr, a) simde_mm_storer_ps(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storeu_ps (simde_float32 mem_addr[4], simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_storeu_ps(mem_addr, a); - #else - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1q_f32(mem_addr, a_.neon_f32); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - vec_vsx_st(a_.altivec_f32, 0, mem_addr); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - __lsx_vst(a_.lsx_f32, mem_addr, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - wasm_v128_store(mem_addr, a_.wasm_v128); - #else - simde_memcpy(mem_addr, &a_, sizeof(a_)); - #endif - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_storeu_ps(mem_addr, a) simde_mm_storeu_ps(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_sub_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_sub_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vsubq_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_sub(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = vec_sub(a_.altivec_f32, b_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_f32 = __lsx_vfsub_s(a_.lsx_f32, b_.lsx_f32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f32 = a_.f32 - b_.f32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i] - b_.f32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_sub_ps(a, b) simde_mm_sub_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_sub_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_sub_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_sub_ps(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_sub_ps(simde_x_mm_broadcastlow_ps(a), simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - r_.f32[0] = a_.f32[0] - b_.f32[0]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[3]; - - return simde__m128_from_private(r_); - #endif -} - -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_sub_ss(a, b) simde_mm_sub_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomieq_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_ucomieq_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_eq_b = vceqq_f32(a_.neon_f32, 
b_.neon_f32); - r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) == wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f32[0] == b_.f32[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f32[0] == b_.f32[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_ucomieq_ss(a, b) simde_mm_ucomieq_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomige_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_ucomige_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_ge_b = vcgeq_f32(a_.neon_f32, b_.neon_f32); - r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) >= wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f32[0] >= b_.f32[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f32[0] >= b_.f32[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_ucomige_ss(a, b) simde_mm_ucomige_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomigt_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_ucomigt_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_gt_b = vcgtq_f32(a_.neon_f32, b_.neon_f32); - r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) > wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f32[0] > b_.f32[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f32[0] > b_.f32[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_ucomigt_ss(a, b) simde_mm_ucomigt_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomile_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_ucomile_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_le_b = vcleq_f32(a_.neon_f32, b_.neon_f32); - r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) <= wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #elif 
defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f32[0] <= b_.f32[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f32[0] <= b_.f32[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_ucomile_ss(a, b) simde_mm_ucomile_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomilt_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_ucomilt_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); - uint32x4_t a_lt_b = vcltq_f32(a_.neon_f32, b_.neon_f32); - r = !!(vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) < wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f32[0] < b_.f32[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f32[0] < b_.f32[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_ucomilt_ss(a, b) simde_mm_ucomilt_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomineq_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_ucomineq_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t a_not_nan = vceqq_f32(a_.neon_f32, a_.neon_f32); - uint32x4_t b_not_nan = vceqq_f32(b_.neon_f32, b_.neon_f32); - uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); - uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a_.neon_f32, b_.neon_f32)); - r = !!(vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = wasm_f32x4_extract_lane(a_.wasm_v128, 0) != wasm_f32x4_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f32[0] != b_.f32[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f32[0] != b_.f32[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_ucomineq_ss(a, b) simde_mm_ucomineq_ss((a), (b)) -#endif - -#if defined(SIMDE_X86_SSE_NATIVE) -# if defined(__has_builtin) -# if __has_builtin(__builtin_ia32_undef128) -# define SIMDE_HAVE_UNDEFINED128 -# endif -# elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793) && !defined(_MSC_VER) -# define SIMDE_HAVE_UNDEFINED128 -# endif -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_unpackhi_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_unpackhi_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vzip2q_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x2_t a1 = vget_high_f32(a_.neon_f32); - float32x2_t b1 = vget_high_f32(b_.neon_f32); - float32x2x2_t result = vzip_f32(a1, b1); - r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = 
__lsx_vilvh_w(b_.lsx_i64, a_.lsx_i64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 6, 3, 7); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 2, 6, 3, 7); - #else - r_.f32[0] = a_.f32[2]; - r_.f32[1] = b_.f32[2]; - r_.f32[2] = a_.f32[3]; - r_.f32[3] = b_.f32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_unpackhi_ps(a, b) simde_mm_unpackhi_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_unpacklo_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE_NATIVE) - return _mm_unpacklo_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vzip1q_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_f32 = vec_mergeh(a_.altivec_f32, b_.altivec_f32); - #elif defined(SIMDE_LOONGARCH_LSX_NATIVE) - r_.lsx_i64 = __lsx_vilvl_w(b_.lsx_i64, a_.lsx_i64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 4, 1, 5); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x2_t a1 = vget_low_f32(a_.neon_f32); - float32x2_t b1 = vget_low_f32(b_.neon_f32); - float32x2x2_t result = vzip_f32(a1, b1); - r_.neon_f32 = vcombine_f32(result.val[0], result.val[1]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 4, 1, 5); - #else - r_.f32[0] = a_.f32[0]; - r_.f32[1] = b_.f32[0]; - r_.f32[2] = a_.f32[1]; - r_.f32[3] = b_.f32[1]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_unpacklo_ps(a, b) simde_mm_unpacklo_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_stream_pi (simde__m64* mem_addr, simde__m64 a) { - #if defined(SIMDE_X86_SSE_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - _mm_stream_pi(HEDLEY_REINTERPRET_CAST(__m64*, mem_addr), a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ - defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_MIPS_LOONGSON_MMI_NATIVE) || \ - defined(SIMDE_VECTOR_SUBSCRIPT)) - __builtin_nontemporal_store(a, mem_addr); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - simde__m64_private a_ = simde__m64_to_private(a); - vst1_s64(HEDLEY_REINTERPRET_CAST(int64_t *, mem_addr), a_.neon_i64); - #else - simde__m64_private* - dest = HEDLEY_REINTERPRET_CAST(simde__m64_private*, mem_addr), - a_ = simde__m64_to_private(a); - - dest->i64[0] = a_.i64[0]; - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_stream_pi(mem_addr, a) simde_mm_stream_pi(mem_addr, (a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_stream_ps (simde_float32 mem_addr[4], simde__m128 a) { - #if defined(SIMDE_X86_SSE_NATIVE) - _mm_stream_ps(mem_addr, a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ - defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_VECTOR_SUBSCRIPT) || \ - defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ - defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) || defined(SIMDE_LOONGARCH_LSX_NATIVE)) - __builtin_nontemporal_store(a, SIMDE_ALIGN_ASSUME_CAST(__typeof__(a)*, mem_addr)); - #else - simde_mm_store_ps(mem_addr, a); - #endif -} -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _mm_stream_ps(mem_addr, a) 
simde_mm_stream_ps(SIMDE_CHECKED_REINTERPRET_CAST(float*, simde_float32*, mem_addr), (a)) -#endif - -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(SIMDE_ARM_NEON_A64V8_NATIVE) - #define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ - do { \ - float32x4x2_t SIMDE_MM_TRANSPOSE4_PS_ROW01 = vtrnq_f32(row0, row1); \ - float32x4x2_t SIMDE_MM_TRANSPOSE4_PS_ROW23 = vtrnq_f32(row2, row3); \ - row0 = vcombine_f32(vget_low_f32(SIMDE_MM_TRANSPOSE4_PS_ROW01.val[0]), \ - vget_low_f32(SIMDE_MM_TRANSPOSE4_PS_ROW23.val[0])); \ - row1 = vcombine_f32(vget_low_f32(SIMDE_MM_TRANSPOSE4_PS_ROW01.val[1]), \ - vget_low_f32(SIMDE_MM_TRANSPOSE4_PS_ROW23.val[1])); \ - row2 = vcombine_f32(vget_high_f32(SIMDE_MM_TRANSPOSE4_PS_ROW01.val[0]), \ - vget_high_f32(SIMDE_MM_TRANSPOSE4_PS_ROW23.val[0])); \ - row3 = vcombine_f32(vget_high_f32(SIMDE_MM_TRANSPOSE4_PS_ROW01.val[1]), \ - vget_high_f32(SIMDE_MM_TRANSPOSE4_PS_ROW23.val[1])); \ - } while (0) -#else - #define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ - do { \ - simde__m128 SIMDE_MM_TRANSPOSE4_PS_tmp3, SIMDE_MM_TRANSPOSE4_PS_tmp2, SIMDE_MM_TRANSPOSE4_PS_tmp1, SIMDE_MM_TRANSPOSE4_PS_tmp0; \ - SIMDE_MM_TRANSPOSE4_PS_tmp0 = simde_mm_unpacklo_ps((row0), (row1)); \ - SIMDE_MM_TRANSPOSE4_PS_tmp2 = simde_mm_unpacklo_ps((row2), (row3)); \ - SIMDE_MM_TRANSPOSE4_PS_tmp1 = simde_mm_unpackhi_ps((row0), (row1)); \ - SIMDE_MM_TRANSPOSE4_PS_tmp3 = simde_mm_unpackhi_ps((row2), (row3)); \ - row0 = simde_mm_movelh_ps(SIMDE_MM_TRANSPOSE4_PS_tmp0, SIMDE_MM_TRANSPOSE4_PS_tmp2); \ - row1 = simde_mm_movehl_ps(SIMDE_MM_TRANSPOSE4_PS_tmp2, SIMDE_MM_TRANSPOSE4_PS_tmp0); \ - row2 = simde_mm_movelh_ps(SIMDE_MM_TRANSPOSE4_PS_tmp1, SIMDE_MM_TRANSPOSE4_PS_tmp3); \ - row3 = simde_mm_movehl_ps(SIMDE_MM_TRANSPOSE4_PS_tmp3, SIMDE_MM_TRANSPOSE4_PS_tmp1); \ - } while (0) -#endif -#if defined(SIMDE_X86_SSE_ENABLE_NATIVE_ALIASES) -# define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_SSE_H) */ diff --git a/ffi-deps/simde/simde/x86/sse2.h b/ffi-deps/simde/simde/x86/sse2.h deleted file mode 100644 index 024fe26..0000000 --- a/ffi-deps/simde/simde/x86/sse2.h +++ /dev/null @@ -1,7737 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2017-2020 Evan Nemerson - * 2015-2017 John W. 
Ratcliff - * 2015 Brandon Rowlett - * 2015 Ken Fast - * 2017 Hasindu Gamaarachchi - * 2018 Jeff Daily - */ - -#if !defined(SIMDE_X86_SSE2_H) -#define SIMDE_X86_SSE2_H - -#include "sse.h" -#include "../simde-f16.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_ALIGN_TO_16 int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_16 simde_int128 i128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 simde_uint128 u128 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #endif - #if defined(SIMDE_FLOAT16_VECTOR) - SIMDE_ALIGN_TO_16 simde_float16 f16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_16 simde_float16 f16[8]; - #endif - SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - - SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_16 int8_t i8[16]; - SIMDE_ALIGN_TO_16 int16_t i16[8]; - SIMDE_ALIGN_TO_16 int32_t i32[4]; - SIMDE_ALIGN_TO_16 int64_t i64[2]; - SIMDE_ALIGN_TO_16 uint8_t u8[16]; - SIMDE_ALIGN_TO_16 uint16_t u16[8]; - SIMDE_ALIGN_TO_16 uint32_t u32[4]; - SIMDE_ALIGN_TO_16 uint64_t u64[2]; - #if defined(SIMDE_HAVE_INT128_) - SIMDE_ALIGN_TO_16 simde_int128 i128[1]; - SIMDE_ALIGN_TO_16 simde_uint128 u128[1]; - #endif - SIMDE_ALIGN_TO_16 simde_float16 f16[8]; - SIMDE_ALIGN_TO_16 simde_float32 f32[4]; - SIMDE_ALIGN_TO_16 simde_float64 f64[2]; - - SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)]; - SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)]; - #endif - - SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2]; - SIMDE_ALIGN_TO_16 simde__m64 m64[2]; - - #if defined(SIMDE_X86_SSE2_NATIVE) - SIMDE_ALIGN_TO_16 __m128i n; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_TO_16 int8x16_t neon_i8; - SIMDE_ALIGN_TO_16 int16x8_t neon_i16; - SIMDE_ALIGN_TO_16 int32x4_t neon_i32; - SIMDE_ALIGN_TO_16 int64x2_t neon_i64; - SIMDE_ALIGN_TO_16 uint8x16_t neon_u8; - SIMDE_ALIGN_TO_16 uint16x8_t neon_u16; - SIMDE_ALIGN_TO_16 uint32x4_t neon_u32; - SIMDE_ALIGN_TO_16 uint64x2_t neon_u64; - #if defined(__ARM_FP16_FORMAT_IEEE) - SIMDE_ALIGN_TO_16 float16x8_t neon_f16; - #endif - SIMDE_ALIGN_TO_16 float32x4_t neon_f32; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - SIMDE_ALIGN_TO_16 float64x2_t neon_f64; - #endif - #elif defined(SIMDE_MIPS_MSA_NATIVE) - v16i8 msa_i8; - v8i16 msa_i16; - v4i32 msa_i32; - v2i64 msa_i64; - v16u8 msa_u8; - v8u16 msa_u16; - v4u32 msa_u32; - v2u64 msa_u64; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - SIMDE_ALIGN_TO_16 v128_t wasm_v128; - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32; - #if 
defined(__UINT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__) altivec_i32f; - #else - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32f; - #endif - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32; - #if defined(__UINT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f; - #else - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32f; - #endif - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32; - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64; - #endif - #endif -} simde__m128i_private; - -typedef union { - #if defined(SIMDE_VECTOR_SUBSCRIPT) - SIMDE_ALIGN_TO_16 int8_t i8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int16_t i16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int32_t i32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int64_t i64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint8_t u8 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint16_t u16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint32_t u32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint64_t u64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 simde_float32 f32 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 simde_float64 f64 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 int_fast32_t i32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - SIMDE_ALIGN_TO_16 uint_fast32_t u32f SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - #else - SIMDE_ALIGN_TO_16 int8_t i8[16]; - SIMDE_ALIGN_TO_16 int16_t i16[8]; - SIMDE_ALIGN_TO_16 int32_t i32[4]; - SIMDE_ALIGN_TO_16 int64_t i64[2]; - SIMDE_ALIGN_TO_16 uint8_t u8[16]; - SIMDE_ALIGN_TO_16 uint16_t u16[8]; - SIMDE_ALIGN_TO_16 uint32_t u32[4]; - SIMDE_ALIGN_TO_16 uint64_t u64[2]; - SIMDE_ALIGN_TO_16 simde_float32 f32[4]; - SIMDE_ALIGN_TO_16 simde_float64 f64[2]; - SIMDE_ALIGN_TO_16 int_fast32_t i32f[16 / sizeof(int_fast32_t)]; - SIMDE_ALIGN_TO_16 uint_fast32_t u32f[16 / sizeof(uint_fast32_t)]; - #endif - - SIMDE_ALIGN_TO_16 simde__m64_private m64_private[2]; - SIMDE_ALIGN_TO_16 simde__m64 m64[2]; - - #if defined(SIMDE_X86_SSE2_NATIVE) - SIMDE_ALIGN_TO_16 __m128d n; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_TO_16 int8x16_t neon_i8; - SIMDE_ALIGN_TO_16 int16x8_t neon_i16; - SIMDE_ALIGN_TO_16 int32x4_t neon_i32; - SIMDE_ALIGN_TO_16 int64x2_t neon_i64; - SIMDE_ALIGN_TO_16 uint8x16_t neon_u8; - SIMDE_ALIGN_TO_16 uint16x8_t neon_u16; - SIMDE_ALIGN_TO_16 uint32x4_t neon_u32; - SIMDE_ALIGN_TO_16 uint64x2_t neon_u64; - SIMDE_ALIGN_TO_16 float32x4_t neon_f32; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - SIMDE_ALIGN_TO_16 float64x2_t neon_f64; - #endif - #elif defined(SIMDE_MIPS_MSA_NATIVE) - v16i8 msa_i8; - v8i16 msa_i16; - v4i32 msa_i32; - v2i64 msa_i64; - v16u8 msa_u8; - v8u16 msa_u16; - v4u32 msa_u32; - v2u64 msa_u64; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - SIMDE_ALIGN_TO_16 v128_t wasm_v128; 
- #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed char) altivec_i8; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed short) altivec_i16; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32; - #if defined(__INT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__INT_FAST32_TYPE__) altivec_i32f; - #else - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed int) altivec_i32f; - #endif - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) altivec_u8; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) altivec_u16; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32; - #if defined(__UINT_FAST32_TYPE__) && (defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(__UINT_FAST32_TYPE__) altivec_u32f; - #else - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned int) altivec_u32f; - #endif - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(float) altivec_f32; - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(signed long long) altivec_i64; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long) altivec_u64; - SIMDE_ALIGN_TO_16 SIMDE_POWER_ALTIVEC_VECTOR(double) altivec_f64; - #endif - #endif -} simde__m128d_private; - -#if defined(SIMDE_X86_SSE2_NATIVE) - typedef __m128i simde__m128i; - typedef __m128d simde__m128d; -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - typedef int64x2_t simde__m128i; -# if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - typedef float64x2_t simde__m128d; -# elif defined(SIMDE_VECTOR_SUBSCRIPT) - typedef simde_float64 simde__m128d SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; -# else - typedef simde__m128d_private simde__m128d; -# endif -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - typedef v128_t simde__m128i; - typedef v128_t simde__m128d; -#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - typedef SIMDE_POWER_ALTIVEC_VECTOR(float) simde__m128i; - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - typedef SIMDE_POWER_ALTIVEC_VECTOR(double) simde__m128d; - #else - typedef simde__m128d_private simde__m128d; - #endif -#elif defined(SIMDE_VECTOR_SUBSCRIPT) - typedef int64_t simde__m128i SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; - typedef simde_float64 simde__m128d SIMDE_ALIGN_TO_16 SIMDE_VECTOR(16) SIMDE_MAY_ALIAS; -#else - typedef simde__m128i_private simde__m128i; - typedef simde__m128d_private simde__m128d; -#endif - -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - typedef simde__m128i __m128i; - typedef simde__m128d __m128d; -#endif - -HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect"); -HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i_private), "simde__m128i_private size incorrect"); -HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect"); -HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d_private), "simde__m128d_private size incorrect"); -#if defined(SIMDE_CHECK_ALIGNMENT) && defined(SIMDE_ALIGN_OF) -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i) == 16, "simde__m128i is not 16-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128i_private) == 16, "simde__m128i_private is not 16-byte aligned"); 
-HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d) == 16, "simde__m128d is not 16-byte aligned"); -HEDLEY_STATIC_ASSERT(SIMDE_ALIGN_OF(simde__m128d_private) == 16, "simde__m128d_private is not 16-byte aligned"); -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde__m128i_from_private(simde__m128i_private v) { - simde__m128i r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i_private -simde__m128i_to_private(simde__m128i v) { - simde__m128i_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde__m128d_from_private(simde__m128d_private v) { - simde__m128d r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d_private -simde__m128d_to_private(simde__m128d v) { - simde__m128d_private r; - simde_memcpy(&r, &v, sizeof(r)); - return r; -} - -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int8x16_t, neon, i8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int16x8_t, neon, i16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int32x4_t, neon, i32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, int64x2_t, neon, i64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint8x16_t, neon, u8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint16x8_t, neon, u16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint32x4_t, neon, u32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, uint64x2_t, neon, u64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float32x4_t, neon, f32) - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, float64x2_t, neon, f64) - #endif -#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32) - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64) - #endif -#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ - -#if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int8x16_t, neon, i8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int16x8_t, neon, i16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int32x4_t, neon, i32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, int64x2_t, neon, i64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint8x16_t, neon, u8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint16x8_t, neon, u16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint32x4_t, neon, u32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, uint64x2_t, neon, u64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float32x4_t, neon, f32) - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, float64x2_t, neon, 
f64) - #endif -#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed char), altivec, i8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed short), altivec, i16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed int), altivec, i32) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), altivec, u8) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned short), altivec, u16) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), altivec, u32) - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), altivec, u64) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(signed long long), altivec, i64) - #if defined(SIMDE_BUG_GCC_95782) - SIMDE_FUNCTION_ATTRIBUTES - SIMDE_POWER_ALTIVEC_VECTOR(double) - simde__m128d_to_altivec_f64(simde__m128d value) { - simde__m128d_private r_ = simde__m128d_to_private(value); - return r_.altivec_f64; - } - - SIMDE_FUNCTION_ATTRIBUTES - simde__m128d - simde__m128d_from_altivec_f64(SIMDE_POWER_ALTIVEC_VECTOR(double) value) { - simde__m128d_private r_; - r_.altivec_f64 = value; - return simde__m128d_from_private(r_); - } - #else - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, SIMDE_POWER_ALTIVEC_VECTOR(double), altivec, f64) - #endif - #endif -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128d, v128_t, wasm, v128); - SIMDE_X86_GENERATE_CONVERSION_FUNCTION(m128i, v128_t, wasm, v128); -#endif /* defined(SIMDE_ARM_NEON_A32V7_NATIVE) */ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_set_pd (simde_float64 e1, simde_float64 e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set_pd(e1, e0); - #else - simde__m128d_private r_; - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_make(e0, e1); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - SIMDE_ALIGN_TO_16 simde_float64 data[2] = { e0, e1 }; - r_.neon_f64 = vld1q_f64(data); - #else - r_.f64[0] = e0; - r_.f64[1] = e1; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set_pd(e1, e0) simde_mm_set_pd(e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_set1_pd (simde_float64 a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set1_pd(a); - #else - simde__m128d_private r_; - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_splat(a); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vdupq_n_f64(a); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_splats(HEDLEY_STATIC_CAST(double, a)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.f64[i] = a; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#define simde_mm_set_pd1(a) simde_mm_set1_pd(a) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set1_pd(a) simde_mm_set1_pd(a) - #define _mm_set_pd1(a) simde_mm_set1_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_abs_pd(simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - simde_float64 mask_; - uint64_t u64_ = UINT64_C(0x7FFFFFFFFFFFFFFF); - 
simde_memcpy(&mask_, &u64_, sizeof(u64_)); - return _mm_and_pd(_mm_set1_pd(mask_), a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vabsq_f64(a_.neon_f64); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_abs(a_.altivec_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_abs(a_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_fabs(a_.f64[i]); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_not_pd(simde__m128d a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - __m128i ai = _mm_castpd_si128(a); - return _mm_castsi128_pd(_mm_ternarylogic_epi64(ai, ai, ai, 0x55)); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vmvnq_s32(a_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f64 = vec_nor(a_.altivec_f64, a_.altivec_f64); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_not(a_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = ~a_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = ~(a_.i32f[i]); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_select_pd(simde__m128d a, simde__m128d b, simde__m128d mask) { - /* This function is for when you want to blend two elements together - * according to a mask. It is similar to _mm_blendv_pd, except that - * it is undefined whether the blend is based on the highest bit in - * each lane (like blendv) or just bitwise operations. This allows - * us to implement the function efficiently everywhere. - * - * Basically, you promise that all the lanes in mask are either 0 or - * ~0. 
*/ - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_blendv_pd(a, b, mask); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b), - mask_ = simde__m128d_to_private(mask); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 ^ ((a_.i64 ^ b_.i64) & mask_.i64); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] ^ ((a_.i64[i] ^ b_.i64[i]) & mask_.i64[i]); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_add_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_add_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vaddq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i8 = vec_add(a_.altivec_i8, b_.altivec_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_add(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = a_.i8 + b_.i8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] + b_.i8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_add_epi8(a, b) simde_mm_add_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_add_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_add_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vaddq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i16 = vec_add(a_.altivec_i16, b_.altivec_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_add(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 + b_.i16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] + b_.i16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_add_epi16(a, b) simde_mm_add_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_add_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_add_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vaddq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_add(a_.altivec_i32, b_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_add(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 + b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] + b_.i32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_add_epi32(a, b) simde_mm_add_epi32(a, b) 
-#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_add_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_add_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vaddq_s64(a_.neon_i64, b_.neon_i64); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_i64 = vec_add(a_.altivec_i64, b_.altivec_i64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_add(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 + b_.i64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] + b_.i64[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_add_epi64(a, b) simde_mm_add_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_add_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_add_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vaddq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f64 = vec_add(a_.altivec_f64, b_.altivec_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_add(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 + b_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[i] + b_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_add_pd(a, b) simde_mm_add_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_move_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_move_sd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vsetq_lane_f64(vgetq_lane_f64(b_.neon_f64, 0), a_.neon_f64, 0); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - #if defined(HEDLEY_IBM_VERSION) - r_.altivec_f64 = vec_xxpermdi(a_.altivec_f64, b_.altivec_f64, 1); - #else - r_.altivec_f64 = vec_xxpermdi(b_.altivec_f64, a_.altivec_f64, 1); - #endif - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 1); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 2, 1); - #else - r_.f64[0] = b_.f64[0]; - r_.f64[1] = a_.f64[1]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_move_sd(a, b) simde_mm_move_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_broadcastlow_pd(simde__m128d a) { - /* This function broadcasts the first element in the input vector to - * all lanes. It is used to avoid generating spurious exceptions in - * *_sd functions since there may be garbage in the upper lanes. 
*/ - - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_castsi128_pd(_mm_shuffle_epi32(_mm_castpd_si128(a), 0x44)); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vdupq_laneq_f64(a_.neon_f64, 0); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f64 = vec_splat(a_.altivec_f64, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_splat(a_.f64[0]); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 0, 0); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[0]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_add_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_add_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_add_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_add_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - r_.f64[0] = a_.f64[0] + b_.f64[0]; - r_.f64[1] = a_.f64[1]; - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_add_sd(a, b) simde_mm_add_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_add_si64 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_add_si64(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vadd_s64(a_.neon_i64, b_.neon_i64); - #else - r_.i64[0] = a_.i64[0] + b_.i64[0]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_add_si64(a, b) simde_mm_add_si64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_adds_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_adds_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vqaddq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_add_sat(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i8 = vec_adds(a_.altivec_i8, b_.altivec_i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = simde_math_adds_i8(a_.i8[i], b_.i8[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_adds_epi8(a, b) simde_mm_adds_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_adds_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_adds_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vqaddq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_add_sat(a_.wasm_v128, b_.wasm_v128); - #elif 
defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i16 = vec_adds(a_.altivec_i16, b_.altivec_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = simde_math_adds_i16(a_.i16[i], b_.i16[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_adds_epi16(a, b) simde_mm_adds_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_adds_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_adds_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vqaddq_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u8x16_add_sat(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_u8 = vec_adds(a_.altivec_u8, b_.altivec_u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = simde_math_adds_u8(a_.u8[i], b_.u8[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_adds_epu8(a, b) simde_mm_adds_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_adds_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_adds_epu16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vqaddq_u16(a_.neon_u16, b_.neon_u16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_add_sat(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_u16 = vec_adds(a_.altivec_u16, b_.altivec_u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = simde_math_adds_u16(a_.u16[i], b_.u16[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_adds_epu16(a, b) simde_mm_adds_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_and_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_and_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vandq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f64 = vec_and(a_.altivec_f64, b_.altivec_f64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f & b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] & b_.i32f[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_and_pd(a, b) simde_mm_and_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_and_si128 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_and_si128(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vandq_s32(b_.neon_i32, a_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_u32f = vec_and(a_.altivec_u32f, b_.altivec_u32f); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f & b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] & b_.i32f[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_and_si128(a, b) simde_mm_and_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_andnot_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_andnot_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_andc(b_.altivec_f64, a_.altivec_f64); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32f = vec_andc(b_.altivec_i32f, a_.altivec_i32f); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = ~a_.i32f & b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = ~a_.u64[i] & b_.u64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_andnot_pd(a, b) simde_mm_andnot_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_andnot_si128 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_andnot_si128(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vbicq_s32(b_.neon_i32, a_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = vec_andc(b_.altivec_i32, a_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = ~a_.i32f & b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = ~(a_.i32f[i]) & b_.i32f[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_andnot_si128(a, b) simde_mm_andnot_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_xor_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_xor_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_xor(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = veorq_s64(a_.neon_i64, b_.neon_i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i]; - } - 
#endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_xor_pd(a, b) simde_mm_xor_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_avg_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_avg_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vrhaddq_u8(b_.neon_u8, a_.neon_u8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u8x16_avgr(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u8 = vec_avg(a_.altivec_u8, b_.altivec_u8); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) - uint16_t wa SIMDE_VECTOR(32); - uint16_t wb SIMDE_VECTOR(32); - uint16_t wr SIMDE_VECTOR(32); - SIMDE_CONVERT_VECTOR_(wa, a_.u8); - SIMDE_CONVERT_VECTOR_(wb, b_.u8); - wr = (wa + wb + 1) >> 1; - SIMDE_CONVERT_VECTOR_(r_.u8, wr); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] + b_.u8[i] + 1) >> 1; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_avg_epu8(a, b) simde_mm_avg_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_avg_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_avg_epu16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vrhaddq_u16(b_.neon_u16, a_.neon_u16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_avgr(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u16 = vec_avg(a_.altivec_u16, b_.altivec_u16); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && defined(SIMDE_CONVERT_VECTOR_) - uint32_t wa SIMDE_VECTOR(32); - uint32_t wb SIMDE_VECTOR(32); - uint32_t wr SIMDE_VECTOR(32); - SIMDE_CONVERT_VECTOR_(wa, a_.u16); - SIMDE_CONVERT_VECTOR_(wb, b_.u16); - wr = (wa + wb + 1) >> 1; - SIMDE_CONVERT_VECTOR_(r_.u16, wr); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] + b_.u16[i] + 1) >> 1; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_avg_epu16(a, b) simde_mm_avg_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_setzero_si128 (void) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_setzero_si128(); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vdupq_n_s32(0); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, 0)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_splat(INT32_C(0)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT) - r_.i32 = __extension__ (__typeof__(r_.i32)) { 0, 0, 0, 0 }; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = 0; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} 
-#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_setzero_si128() (simde_mm_setzero_si128()) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_bslli_si128 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - if (HEDLEY_UNLIKELY((imm8 & ~15))) { - return simde_mm_setzero_si128(); - } - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_ENDIAN_ORDER) - r_.altivec_i8 = - #if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - vec_slo - #else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */ - vec_sro - #endif - (a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8))); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = vec_srb(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, (imm8 & 15) << 3))); - #elif defined(SIMDE_HAVE_INT128_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - r_.u128[0] = a_.u128[0] << (imm8 * 8); - #else - r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - for (int i = imm8 ; i < HEDLEY_STATIC_CAST(int, sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i - imm8]; - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - #define simde_mm_bslli_si128(a, imm8) _mm_slli_si128(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) - #define simde_mm_bslli_si128(a, imm8) \ - simde__m128i_from_neon_i8(((imm8) <= 0) ? simde__m128i_to_neon_i8(a) : (((imm8) > 15) ? (vdupq_n_s8(0)) : (vextq_s8(vdupq_n_s8(0), simde__m128i_to_neon_i8(a), 16 - (imm8))))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_bslli_si128(a, imm8) __extension__ ({ \ - simde__m128i_from_wasm_v128( \ - wasm_i8x16_shuffle(wasm_i32x4_splat(INT32_C(0)), \ - simde__m128i_to_wasm_v128((a)), \ - ((imm8)&0xF0) ? 0 : 16 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 17 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 18 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 19 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 20 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 21 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 22 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 23 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 24 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 25 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 26 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 27 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 28 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 29 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 0 : 30 - ((imm8)&0xF), \ - ((imm8)&0xF0) ? 
0 : 31 - ((imm8)&0xF))); }) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - #define simde_mm_bslli_si128(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ - const simde__m128i_private simde_tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ - simde__m128i_private simde_tmp_r_; \ - if (HEDLEY_UNLIKELY(imm8 > 15)) { \ - simde_tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ - } else { \ - simde_tmp_r_.i8 = \ - SIMDE_SHUFFLE_VECTOR_(8, 16, \ - simde_tmp_z_.i8, \ - (simde_tmp_a_).i8, \ - HEDLEY_STATIC_CAST(int8_t, (16 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (17 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (18 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (19 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (20 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (21 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (22 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (23 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (24 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (25 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (26 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (27 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (28 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (29 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (30 - imm8) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (31 - imm8) & 31)); \ - } \ - simde__m128i_from_private(simde_tmp_r_); })) -#endif -#define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_bslli_si128(a, imm8) simde_mm_bslli_si128(a, imm8) - #define _mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_bsrli_si128 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - if (HEDLEY_UNLIKELY((imm8 & ~15))) { - return simde_mm_setzero_si128(); - } - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_ENDIAN_ORDER) - r_.altivec_i8 = - #if (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - vec_sro - #else /* SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG */ - vec_slo - #endif - (a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, imm8 * 8))); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = vec_slb(a_.altivec_i8, vec_splats(HEDLEY_STATIC_CAST(unsigned char, (imm8 & 15) << 3))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - const int e = HEDLEY_STATIC_CAST(int, i) + imm8; - r_.i8[i] = (e < 16) ? a_.i8[e] : 0; - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - #define simde_mm_bsrli_si128(a, imm8) _mm_srli_si128(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && !defined(__clang__) - #define simde_mm_bsrli_si128(a, imm8) \ - simde__m128i_from_neon_i8(((imm8 < 0) || (imm8 > 15)) ? vdupq_n_s8(0) : (vextq_s8(simde__m128i_to_private(a).neon_i8, vdupq_n_s8(0), ((imm8 & 15) != 0) ? 
imm8 : (imm8 & 15)))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_bsrli_si128(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ - const simde__m128i_private simde_tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ - simde__m128i_private simde_tmp_r_ = simde__m128i_to_private(a); \ - if (HEDLEY_UNLIKELY(imm8 > 15)) { \ - simde_tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ - } else { \ - simde_tmp_r_.wasm_v128 = \ - wasm_i8x16_shuffle( \ - simde_tmp_z_.wasm_v128, \ - simde_tmp_a_.wasm_v128, \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 19) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 20) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 21) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 22) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 23) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 24) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 25) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 26) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 27) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 28) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 29) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \ - } \ - simde__m128i_from_private(simde_tmp_r_); })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) && !defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && !defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - #define simde_mm_bsrli_si128(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ - const simde__m128i_private simde_tmp_z_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ - simde__m128i_private simde_tmp_r_ = simde__m128i_to_private(a); \ - if (HEDLEY_UNLIKELY(imm8 > 15)) { \ - simde_tmp_r_ = simde__m128i_to_private(simde_mm_setzero_si128()); \ - } else { \ - simde_tmp_r_.i8 = \ - SIMDE_SHUFFLE_VECTOR_(8, 16, \ - simde_tmp_z_.i8, \ - (simde_tmp_a_).i8, \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 16) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 17) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 18) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 19) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 20) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 21) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 22) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 23) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 24) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 25) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 26) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 27) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 28) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 29) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 30) & 31), \ - HEDLEY_STATIC_CAST(int8_t, (imm8 + 31) & 31)); \ - } \ - simde__m128i_from_private(simde_tmp_r_); })) -#endif -#define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8)) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_bsrli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8)) - #define _mm_srli_si128(a, imm8) simde_mm_bsrli_si128((a), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_clflush (void const* p) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_clflush(p); - #else - (void) p; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_clflush(p) simde_mm_clflush(p) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comieq_sd (simde__m128d a, 
simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_comieq_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return !!vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) == wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #else - return a_.f64[0] == b_.f64[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_comieq_sd(a, b) simde_mm_comieq_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comige_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_comige_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return !!vgetq_lane_u64(vcgeq_f64(a_.neon_f64, b_.neon_f64), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >= wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #else - return a_.f64[0] >= b_.f64[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_comige_sd(a, b) simde_mm_comige_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comigt_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_comigt_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return !!vgetq_lane_u64(vcgtq_f64(a_.neon_f64, b_.neon_f64), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) > wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #else - return a_.f64[0] > b_.f64[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_comigt_sd(a, b) simde_mm_comigt_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comile_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_comile_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return !!vgetq_lane_u64(vcleq_f64(a_.neon_f64, b_.neon_f64), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <= wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #else - return a_.f64[0] <= b_.f64[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_comile_sd(a, b) simde_mm_comile_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comilt_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_comilt_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return !!vgetq_lane_u64(vcltq_f64(a_.neon_f64, b_.neon_f64), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) < wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #else - return a_.f64[0] < b_.f64[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_comilt_sd(a, b) simde_mm_comilt_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_comineq_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_comineq_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), 
- b_ = simde__m128d_to_private(b); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return !vgetq_lane_u64(vceqq_f64(a_.neon_f64, b_.neon_f64), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) != wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #else - return a_.f64[0] != b_.f64[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_comineq_sd(a, b) simde_mm_comineq_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_copysign_pd(simde__m128d dest, simde__m128d src) { - simde__m128d_private - r_, - dest_ = simde__m128d_to_private(dest), - src_ = simde__m128d_to_private(src); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint64x2_t sign_pos = vreinterpretq_u64_f64(vdupq_n_f64(-SIMDE_FLOAT64_C(0.0))); - #else - simde_float64 dbl_nz = -SIMDE_FLOAT64_C(0.0); - uint64_t u64_nz; - simde_memcpy(&u64_nz, &dbl_nz, sizeof(u64_nz)); - uint64x2_t sign_pos = vdupq_n_u64(u64_nz); - #endif - r_.neon_u64 = vbslq_u64(sign_pos, src_.neon_u64, dest_.neon_u64); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - #if defined(SIMDE_BUG_VEC_CPSGN_REVERSED_ARGS) - r_.altivec_f64 = vec_cpsgn(dest_.altivec_f64, src_.altivec_f64); - #else - r_.altivec_f64 = vec_cpsgn(src_.altivec_f64, dest_.altivec_f64); - #endif - #elif defined(simde_math_copysign) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_copysign(dest_.f64[i], src_.f64[i]); - } - #else - simde__m128d sgnbit = simde_mm_set1_pd(-SIMDE_FLOAT64_C(0.0)); - return simde_mm_xor_pd(simde_mm_and_pd(sgnbit, src), simde_mm_andnot_pd(sgnbit, dest)); - #endif - - return simde__m128d_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_xorsign_pd(simde__m128d dest, simde__m128d src) { - return simde_mm_xor_pd(simde_mm_and_pd(simde_mm_set1_pd(-0.0), src), dest); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_castpd_ps (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_castpd_ps(a); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vreinterpretq_f32_f64(a); - #else - simde__m128 r; - simde_memcpy(&r, &a, sizeof(a)); - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_castpd_ps(a) simde_mm_castpd_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_castpd_si128 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_castpd_si128(a); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vreinterpretq_s64_f64(a); - #else - simde__m128i r; - simde_memcpy(&r, &a, sizeof(a)); - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_castpd_si128(a) simde_mm_castpd_si128(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_castps_pd (simde__m128 a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_castps_pd(a); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vreinterpretq_f64_f32(a); - #else - simde__m128d r; - simde_memcpy(&r, &a, sizeof(a)); - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_castps_pd(a) simde_mm_castps_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_castps_si128 (simde__m128 a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_castps_si128(a); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return simde__m128i_from_neon_i32(simde__m128_to_private(a).neon_i32); - #else - simde__m128i r; - simde_memcpy(&r, &a, sizeof(a)); - return r; - #endif 
-} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_castps_si128(a) simde_mm_castps_si128(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_castsi128_pd (simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_castsi128_pd(a); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vreinterpretq_f64_s64(a); - #else - simde__m128d r; - simde_memcpy(&r, &a, sizeof(a)); - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_castsi128_pd(a) simde_mm_castsi128_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_castsi128_ps (simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_castsi128_ps(a); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - return HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(float), a); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return simde__m128_from_neon_i32(simde__m128i_to_private(a).neon_i32); - #else - simde__m128 r; - simde_memcpy(&r, &a, sizeof(a)); - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_castsi128_ps(a) simde_mm_castsi128_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmpeq_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpeq_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vceqq_s8(b_.neon_i8, a_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_eq(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpeq(a_.altivec_i8, b_.altivec_i8)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 == b_.i8)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpeq_epi8(a, b) simde_mm_cmpeq_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmpeq_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpeq_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vceqq_s16(b_.neon_i16, a_.neon_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_eq(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpeq(a_.altivec_i16, b_.altivec_i16)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = (a_.i16 == b_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpeq_epi16(a, b) simde_mm_cmpeq_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmpeq_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpeq_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vceqq_s32(b_.neon_i32, a_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_eq(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpeq(a_.altivec_i32, b_.altivec_i32)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 == b_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpeq_epi32(a, b) simde_mm_cmpeq_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpeq_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpeq_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vceqq_f64(b_.neon_f64, a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_eq(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpeq(a_.altivec_f64, b_.altivec_f64)); - #elif defined(SIMDE_MIPS_MSA_NATIVE) - r_.msa_i32 = __msa_addv_w(a_.msa_i32, b_.msa_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 == b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (a_.f64[i] == b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpeq_pd(a, b) simde_mm_cmpeq_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpeq_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpeq_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_cmpeq_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_cmpeq_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - r_.u64[0] = (a_.u64[0] == b_.u64[0]) ? 
~UINT64_C(0) : 0; - r_.u64[1] = a_.u64[1]; - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpeq_sd(a, b) simde_mm_cmpeq_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpneq_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpneq_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u32 = vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(b_.neon_f64, a_.neon_f64))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_ne(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 != b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (a_.f64[i] != b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpneq_pd(a, b) simde_mm_cmpneq_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpneq_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpneq_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_cmpneq_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_cmpneq_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - r_.u64[0] = (a_.f64[0] != b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpneq_sd(a, b) simde_mm_cmpneq_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmplt_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmplt_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vcltq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char),vec_cmplt(a_.altivec_i8, b_.altivec_i8)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_lt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 < b_.i8)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? 
~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmplt_epi8(a, b) simde_mm_cmplt_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmplt_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmplt_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vcltq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmplt(a_.altivec_i16, b_.altivec_i16)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_lt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 < b_.i16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? ~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmplt_epi16(a, b) simde_mm_cmplt_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmplt_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmplt_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcltq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmplt(a_.altivec_i32, b_.altivec_i32)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_lt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 < b_.i32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmplt_epi32(a, b) simde_mm_cmplt_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmplt_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmplt_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcltq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmplt(a_.altivec_f64, b_.altivec_f64)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_lt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 < b_.f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (a_.f64[i] < b_.f64[i]) ? 
~UINT64_C(0) : UINT64_C(0); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmplt_pd(a, b) simde_mm_cmplt_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmplt_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmplt_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_cmplt_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_cmplt_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - r_.u64[0] = (a_.f64[0] < b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmplt_sd(a, b) simde_mm_cmplt_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmple_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmple_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 <= b_.f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcleq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_le(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmple(a_.altivec_f64, b_.altivec_f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (a_.f64[i] <= b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmple_pd(a, b) simde_mm_cmple_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmple_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmple_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_cmple_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_cmple_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - r_.u64[0] = (a_.f64[0] <= b_.f64[0]) ? 
~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmple_sd(a, b) simde_mm_cmple_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmpgt_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpgt_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vcgtq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_gt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmpgt(a_.altivec_i8, b_.altivec_i8)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), (a_.i8 > b_.i8)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpgt_epi8(a, b) simde_mm_cmpgt_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmpgt_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpgt_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vcgtq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_gt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i16 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed short), vec_cmpgt(a_.altivec_i16, b_.altivec_i16)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), (a_.i16 > b_.i16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpgt_epi16(a, b) simde_mm_cmpgt_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmpgt_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpgt_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcgtq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_gt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), vec_cmpgt(a_.altivec_i32, b_.altivec_i32)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), (a_.i32 > b_.i32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpgt_epi32(a, b) simde_mm_cmpgt_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpgt_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpgt_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 > b_.f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcgtq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_gt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpgt(a_.altivec_f64, b_.altivec_f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (a_.f64[i] > b_.f64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpgt_pd(a, b) simde_mm_cmpgt_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpgt_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - return _mm_cmpgt_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_cmpgt_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_cmpgt_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - r_.u64[0] = (a_.f64[0] > b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpgt_sd(a, b) simde_mm_cmpgt_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpge_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpge_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), (a_.f64 >= b_.f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcgeq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_ge(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_cmpge(a_.altivec_f64, b_.altivec_f64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (a_.f64[i] >= b_.f64[i]) ? 
~UINT64_C(0) : UINT64_C(0); - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpge_pd(a, b) simde_mm_cmpge_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpge_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - return _mm_cmpge_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_cmpge_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_cmpge_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - r_.u64[0] = (a_.f64[0] >= b_.f64[0]) ? ~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpge_sd(a, b) simde_mm_cmpge_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpngt_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpngt_pd(a, b); - #else - return simde_mm_cmple_pd(a, b); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpngt_pd(a, b) simde_mm_cmpngt_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpngt_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - return _mm_cmpngt_sd(a, b); - #else - return simde_mm_cmple_sd(a, b); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpngt_sd(a, b) simde_mm_cmpngt_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpnge_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpnge_pd(a, b); - #else - return simde_mm_cmplt_pd(a, b); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpnge_pd(a, b) simde_mm_cmpnge_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpnge_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - return _mm_cmpnge_sd(a, b); - #else - return simde_mm_cmplt_sd(a, b); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpnge_sd(a, b) simde_mm_cmpnge_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpnlt_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpnlt_pd(a, b); - #else - return simde_mm_cmpge_pd(a, b); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpnlt_pd(a, b) simde_mm_cmpnlt_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpnlt_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpnlt_sd(a, b); - #else - return simde_mm_cmpge_sd(a, b); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpnlt_sd(a, b) simde_mm_cmpnlt_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpnle_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpnle_pd(a, b); - #else - return simde_mm_cmpgt_pd(a, b); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpnle_pd(a, b) simde_mm_cmpnle_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpnle_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return 
_mm_cmpnle_sd(a, b); - #else - return simde_mm_cmpgt_sd(a, b); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpnle_sd(a, b) simde_mm_cmpnle_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpord_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpord_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - /* Note: NEON does not have ordered compare builtin - Need to compare a eq a and b eq b to check for NaN - Do AND of results to get final */ - uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64); - uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64); - r_.neon_u64 = vandq_u64(ceqaa, ceqbb); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_and(wasm_f64x2_eq(a_.wasm_v128, a_.wasm_v128), - wasm_f64x2_eq(b_.wasm_v128, b_.wasm_v128)); - #elif defined(simde_math_isnan) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (!simde_math_isnan(a_.f64[i]) && !simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpord_pd(a, b) simde_mm_cmpord_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde_float64 -simde_mm_cvtsd_f64 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - return _mm_cvtsd_f64(a); - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return HEDLEY_STATIC_CAST(simde_float64, vgetq_lane_f64(a_.neon_f64, 0)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return HEDLEY_STATIC_CAST(simde_float64, wasm_f64x2_extract_lane(a_.wasm_v128, 0)); - #else - return a_.f64[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtsd_f64(a) simde_mm_cvtsd_f64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpord_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpord_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_cmpord_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_cmpord_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(simde_math_isnan) - r_.u64[0] = (!simde_math_isnan(a_.f64[0]) && !simde_math_isnan(b_.f64[0])) ? 
~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpord_sd(a, b) simde_mm_cmpord_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpunord_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpunord_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint64x2_t ceqaa = vceqq_f64(a_.neon_f64, a_.neon_f64); - uint64x2_t ceqbb = vceqq_f64(b_.neon_f64, b_.neon_f64); - r_.neon_u64 = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(ceqaa, ceqbb)))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_or(wasm_f64x2_ne(a_.wasm_v128, a_.wasm_v128), - wasm_f64x2_ne(b_.wasm_v128, b_.wasm_v128)); - #elif defined(simde_math_isnan) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.u64[i] = (simde_math_isnan(a_.f64[i]) || simde_math_isnan(b_.f64[i])) ? ~UINT64_C(0) : UINT64_C(0); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpunord_pd(a, b) simde_mm_cmpunord_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cmpunord_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cmpunord_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_cmpunord_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_cmpunord_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(simde_math_isnan) - r_.u64[0] = (simde_math_isnan(a_.f64[0]) || simde_math_isnan(b_.f64[0])) ? 
~UINT64_C(0) : UINT64_C(0); - r_.u64[1] = a_.u64[1]; - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cmpunord_sd(a, b) simde_mm_cmpunord_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cvtepi32_pd (simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtepi32_pd(a); - #else - simde__m128d_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_convert_low_i32x4(a_.wasm_v128); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (simde_float64) a_.i32[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtepi32_pd(a) simde_mm_cvtepi32_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtepi32_ps (simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtepi32_ps(a); - #else - simde__m128_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vcvtq_f32_s32(a_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_convert_i32x4(a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - HEDLEY_DIAGNOSTIC_PUSH - #if HEDLEY_HAS_WARNING("-Wc11-extensions") - #pragma clang diagnostic ignored "-Wc11-extensions" - #endif - r_.altivec_f32 = vec_ctf(a_.altivec_i32, 0); - HEDLEY_DIAGNOSTIC_POP - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.f32, a_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = (simde_float32) a_.i32[i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtepi32_ps(a) simde_mm_cvtepi32_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cvtpd_pi32 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtpd_pi32(a); - #else - simde__m64_private r_; - simde__m128d_private a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - simde_float64 v = simde_math_round(a_.f64[i]); - #if defined(SIMDE_FAST_CONVERSION_RANGE) - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v); - #else - r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ? 
- SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; - #endif - } - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtpd_pi32(a) simde_mm_cvtpd_pi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtpd_epi32 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_BUG_PGI_30107) - return _mm_cvtpd_epi32(a); - #else - simde__m128i_private r_; - - r_.m64[0] = simde_mm_cvtpd_pi32(a); - r_.m64[1] = simde_mm_setzero_si64(); - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtpd_epi32(a) simde_mm_cvtpd_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtpd_ps (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtpd_ps(a); - #else - simde__m128_private r_; - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vcombine_f32(vcvt_f32_f64(a_.neon_f64), vdup_n_f32(0.0f)); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_f32 = vec_float2(a_.altivec_f64, vec_splats(0.0)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f32x4_demote_f64x2_zero(a_.wasm_v128); - #elif HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && HEDLEY_HAS_BUILTIN(__builtin_convertvector) - float __attribute__((__vector_size__(8))) z = { 0.0f, 0.0f }; - r_.f32 = - __builtin_shufflevector( - __builtin_convertvector(__builtin_shufflevector(a_.f64, a_.f64, 0, 1), __typeof__(z)), z, - 0, 1, 2, 3 - ); - #else - r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, a_.f64[0]); - r_.f32[1] = HEDLEY_STATIC_CAST(simde_float32, a_.f64[1]); - r_.f32[2] = SIMDE_FLOAT32_C(0.0); - r_.f32[3] = SIMDE_FLOAT32_C(0.0); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtpd_ps(a) simde_mm_cvtpd_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cvtpi32_pd (simde__m64 a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvtpi32_pd(a); - #else - simde__m128d_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.f64, a_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (simde_float64) a_.i32[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtpi32_pd(a) simde_mm_cvtpi32_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtps_epi32 (simde__m128 a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtps_epi32(a); - #else - simde__m128i_private r_; - simde__m128_private a_; - - #if defined(SIMDE_ARM_NEON_A32V8_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES) && !defined(SIMDE_BUG_GCC_95399) - a_ = simde__m128_to_private(a); - r_.neon_i32 = vcvtnq_s32_f32(a_.neon_f32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES) - a_ = simde__m128_to_private(a); - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_C11_EXTENSIONS_ - SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_ - r_.altivec_i32 = vec_cts(a_.altivec_f32, 1); - HEDLEY_DIAGNOSTIC_POP - #elif defined(SIMDE_WASM_SIMD128_NATIVE) && defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_ROUND_TIES) - a_ = simde__m128_to_private(a); - r_.wasm_v128 = 
wasm_i32x4_trunc_sat_f32x4(a_.wasm_v128); - #else - a_ = simde__m128_to_private(simde_x_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEAREST_INT, 1)); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - simde_float32 v = simde_math_roundf(a_.f32[i]); - #if defined(SIMDE_FAST_CONVERSION_RANGE) - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v); - #else - r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ? - SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; - #endif - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtps_epi32(a) simde_mm_cvtps_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cvtps_pd (simde__m128 a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtps_pd(a); - #else - simde__m128d_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_promote_low_f32x4(a_.wasm_v128); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.f64, a_.m64_private[0].f32); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vcvt_f64_f32(vget_low_f32(a_.neon_f32)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f32[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtps_pd(a) simde_mm_cvtps_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_cvtsd_si32 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtsd_si32(a); - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - - simde_float64 v = simde_math_round(a_.f64[0]); - #if defined(SIMDE_FAST_CONVERSION_RANGE) - return SIMDE_CONVERT_FTOI(int32_t, v); - #else - return ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ? 
- SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtsd_si32(a) simde_mm_cvtsd_si32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int64_t -simde_mm_cvtsd_si64 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) - #if defined(__PGI) - return _mm_cvtsd_si64x(a); - #else - return _mm_cvtsd_si64(a); - #endif - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - return SIMDE_CONVERT_FTOI(int64_t, simde_math_round(a_.f64[0])); - #endif -} -#define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #define _mm_cvtsd_si64(a) simde_mm_cvtsd_si64(a) - #define _mm_cvtsd_si64x(a) simde_mm_cvtsd_si64x(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cvtsd_ss (simde__m128 a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtsd_ss(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - simde__m128d_private b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vsetq_lane_f32(vcvtxd_f32_f64(vgetq_lane_f64(b_.neon_f64, 0)), a_.neon_f32, 0); - #else - r_.f32[0] = HEDLEY_STATIC_CAST(simde_float32, b_.f64[0]); - - SIMDE_VECTORIZE - for (size_t i = 1 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i]; - } - #endif - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtsd_ss(a, b) simde_mm_cvtsd_ss(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int16_t -simde_x_mm_cvtsi128_si16 (simde__m128i a) { - simde__m128i_private - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vgetq_lane_s16(a_.neon_i16, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return HEDLEY_STATIC_CAST(int16_t, wasm_i16x8_extract_lane(a_.wasm_v128, 0)); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #if defined(SIMDE_BUG_GCC_95227) - (void) a_; - #endif - return vec_extract(a_.altivec_i16, 0); - #else - return a_.i16[0]; - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_cvtsi128_si32 (simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtsi128_si32(a); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vgetq_lane_s32(a_.neon_i32, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return HEDLEY_STATIC_CAST(int32_t, wasm_i32x4_extract_lane(a_.wasm_v128, 0)); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #if defined(SIMDE_BUG_GCC_95227) - (void) a_; - #endif - return vec_extract(a_.altivec_i32, 0); - #else - return a_.i32[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtsi128_si32(a) simde_mm_cvtsi128_si32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int64_t -simde_mm_cvtsi128_si64 (simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) - #if defined(__PGI) - return _mm_cvtsi128_si64x(a); - #else - return _mm_cvtsi128_si64(a); - #endif - #else - simde__m128i_private a_ = simde__m128i_to_private(a); - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) && !defined(HEDLEY_IBM_VERSION) - return vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), a_.i64), 0); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vgetq_lane_s64(a_.neon_i64, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return 
HEDLEY_STATIC_CAST(int64_t, wasm_i64x2_extract_lane(a_.wasm_v128, 0)); - #endif - return a_.i64[0]; - #endif -} -#define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #define _mm_cvtsi128_si64(a) simde_mm_cvtsi128_si64(a) - #define _mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64x(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cvtsi32_sd (simde__m128d a, int32_t b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtsi32_sd(a, b); - #else - simde__m128d_private r_; - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0); - #else - r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b); - r_.i64[1] = a_.i64[1]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtsi32_sd(a, b) simde_mm_cvtsi32_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_cvtsi16_si128 (int16_t a) { - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vsetq_lane_s16(a, vdupq_n_s16(0), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_make(a, 0, 0, 0, 0, 0, 0, 0); - #else - r_.i16[0] = a; - r_.i16[1] = 0; - r_.i16[2] = 0; - r_.i16[3] = 0; - r_.i16[4] = 0; - r_.i16[5] = 0; - r_.i16[6] = 0; - r_.i16[7] = 0; - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtsi32_si128 (int32_t a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtsi32_si128(a); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_make(a, 0, 0, 0); - #else - r_.i32[0] = a; - r_.i32[1] = 0; - r_.i32[2] = 0; - r_.i32[3] = 0; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtsi32_si128(a) simde_mm_cvtsi32_si128(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cvtsi64_sd (simde__m128d a, int64_t b) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) - #if !defined(__PGI) - return _mm_cvtsi64_sd(a, b); - #else - return _mm_cvtsi64x_sd(a, b); - #endif - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vsetq_lane_f64(HEDLEY_STATIC_CAST(float64_t, b), a_.neon_f64, 0); - #else - r_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b); - r_.f64[1] = a_.f64[1]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #define _mm_cvtsi64_sd(a, b) simde_mm_cvtsi64_sd(a, b) - #define _mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64x_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtsi64_si128 (int64_t a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) - #if !defined(__PGI) - return _mm_cvtsi64_si128(a); - #else - return _mm_cvtsi64x_si128(a); - #endif - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vsetq_lane_s64(a, vdupq_n_s64(0), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - 
r_.wasm_v128 = wasm_i64x2_make(a, 0); - #else - r_.i64[0] = a; - r_.i64[1] = 0; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #define _mm_cvtsi64_si128(a) simde_mm_cvtsi64_si128(a) - #define _mm_cvtsi64x_si128(a) simde_mm_cvtsi64x_si128(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cvtss_sd (simde__m128d a, simde__m128 b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvtss_sd(a, b); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - float64x2_t temp = vcvt_f64_f32(vset_lane_f32(vgetq_lane_f32(simde__m128_to_private(b).neon_f32, 0), vdup_n_f32(0), 0)); - return vsetq_lane_f64(vgetq_lane_f64(simde__m128d_to_private(a).neon_f64, 1), temp, 1); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a); - simde__m128_private b_ = simde__m128_to_private(b); - - a_.f64[0] = HEDLEY_STATIC_CAST(simde_float64, b_.f32[0]); - - return simde__m128d_from_private(a_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvtss_sd(a, b) simde_mm_cvtss_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_cvttpd_pi32 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_cvttpd_pi32(a); - #else - simde__m64_private r_; - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_CONVERT_VECTOR_) && defined(SIMDE_FAST_CONVERSION_RANGE) - SIMDE_CONVERT_VECTOR_(r_.i32, a_.f64); - #else - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - simde_float64 v = a_.f64[i]; - #if defined(SIMDE_FAST_CONVERSION_RANGE) - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v); - #else - r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ? - SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; - #endif - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvttpd_pi32(a) simde_mm_cvttpd_pi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvttpd_epi32 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvttpd_epi32(a); - #else - simde__m128i_private r_; - - r_.m64[0] = simde_mm_cvttpd_pi32(a); - r_.m64[1] = simde_mm_setzero_si64(); - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvttpd_epi32(a) simde_mm_cvttpd_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvttps_epi32 (simde__m128 a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvttps_epi32(a); - #else - simde__m128i_private r_; - simde__m128_private a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vcvtq_s32_f32(a_.neon_f32); - - #if !defined(SIMDE_FAST_CONVERSION_RANGE) || !defined(SIMDE_FAST_NANS) - /* Values below INT32_MIN saturate anyways, so we don't need to - * test for that. 
*/ - #if !defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_FAST_NANS) - uint32x4_t valid_input = - vandq_u32( - vcltq_f32(a_.neon_f32, vdupq_n_f32(SIMDE_FLOAT32_C(2147483648.0))), - vceqq_f32(a_.neon_f32, a_.neon_f32) - ); - #elif !defined(SIMDE_FAST_CONVERSION_RANGE) - uint32x4_t valid_input = vcltq_f32(a_.neon_f32, vdupq_n_f32(SIMDE_FLOAT32_C(2147483648.0))); - #elif !defined(SIMDE_FAST_NANS) - uint32x4_t valid_input = vceqq_f32(a_.neon_f32, a_.neon_f32); - #endif - - r_.neon_i32 = vbslq_s32(valid_input, r_.neon_i32, vdupq_n_s32(INT32_MIN)); - #endif - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_trunc_sat_f32x4(a_.wasm_v128); - - #if !defined(SIMDE_FAST_CONVERSION_RANGE) || !defined(SIMDE_FAST_NANS) - #if !defined(SIMDE_FAST_CONVERSION_RANGE) && !defined(SIMDE_FAST_NANS) - v128_t valid_input = - wasm_v128_and( - wasm_f32x4_lt(a_.wasm_v128, wasm_f32x4_splat(SIMDE_FLOAT32_C(2147483648.0))), - wasm_f32x4_eq(a_.wasm_v128, a_.wasm_v128) - ); - #elif !defined(SIMDE_FAST_CONVERSION_RANGE) - v128_t valid_input = wasm_f32x4_lt(a_.wasm_v128, wasm_f32x4_splat(SIMDE_FLOAT32_C(2147483648.0))); - #elif !defined(SIMDE_FAST_NANS) - v128_t valid_input = wasm_f32x4_eq(a_.wasm_v128, a_.wasm_v128); - #endif - - r_.wasm_v128 = wasm_v128_bitselect(r_.wasm_v128, wasm_i32x4_splat(INT32_MIN), valid_input); - #endif - #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_ARCH_POWER) - SIMDE_CONVERT_VECTOR_(r_.i32, a_.f32); - - #if !defined(SIMDE_FAST_CONVERSION_RANGE) || !defined(SIMDE_FAST_NANS) - #if !defined(SIMDE_FAST_CONVERSION_RANGE) - static const simde_float32 SIMDE_VECTOR(16) first_too_high = { SIMDE_FLOAT32_C(2147483648.0), SIMDE_FLOAT32_C(2147483648.0), SIMDE_FLOAT32_C(2147483648.0), SIMDE_FLOAT32_C(2147483648.0) }; - - __typeof__(r_.i32) valid_input = - HEDLEY_REINTERPRET_CAST( - __typeof__(r_.i32), - (a_.f32 < first_too_high) & (a_.f32 >= -first_too_high) - ); - #elif !defined(SIMDE_FAST_NANS) - __typeof__(r_.i32) valid_input = HEDLEY_REINTERPRET_CAST( __typeof__(valid_input), a_.f32 == a_.f32); - #endif - - __typeof__(r_.i32) invalid_output = { INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN }; - r_.i32 = (r_.i32 & valid_input) | (invalid_output & ~valid_input); - #endif - #else - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - simde_float32 v = a_.f32[i]; - #if defined(SIMDE_FAST_CONVERSION_RANGE) && defined(SIMDE_FAST_NANS) - r_.i32[i] = SIMDE_CONVERT_FTOI(int32_t, v); - #else - r_.i32[i] = ((v > HEDLEY_STATIC_CAST(simde_float32, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float32, INT32_MAX))) ? - SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; - #endif - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvttps_epi32(a) simde_mm_cvttps_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_cvttsd_si32 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_cvttsd_si32(a); - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - simde_float64 v = a_.f64[0]; - #if defined(SIMDE_FAST_CONVERSION_RANGE) - return SIMDE_CONVERT_FTOI(int32_t, v); - #else - return ((v > HEDLEY_STATIC_CAST(simde_float64, INT32_MIN)) && (v < HEDLEY_STATIC_CAST(simde_float64, INT32_MAX))) ? 
- SIMDE_CONVERT_FTOI(int32_t, v) : INT32_MIN; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_cvttsd_si32(a) simde_mm_cvttsd_si32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int64_t -simde_mm_cvttsd_si64 (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) - #if !defined(__PGI) - return _mm_cvttsd_si64(a); - #else - return _mm_cvttsd_si64x(a); - #endif - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - return SIMDE_CONVERT_FTOI(int64_t, a_.f64[0]); - #endif -} -#define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #define _mm_cvttsd_si64(a) simde_mm_cvttsd_si64(a) - #define _mm_cvttsd_si64x(a) simde_mm_cvttsd_si64x(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_div_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_div_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 / b_.f64; - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vdivq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_div(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[i] / b_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_div_pd(a, b) simde_mm_div_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_div_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_div_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_div_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_div_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - float64x2_t temp = vdivq_f64(a_.neon_f64, b_.neon_f64); - r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1); - #else - r_.f64[0] = a_.f64[0] / b_.f64[0]; - r_.f64[1] = a_.f64[1]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_div_sd(a, b) simde_mm_div_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_extract_epi16 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - uint16_t r; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #if defined(SIMDE_BUG_GCC_95227) - (void) a_; - (void) imm8; - #endif - r = HEDLEY_STATIC_CAST(uint16_t, vec_extract(a_.altivec_i16, imm8)); - #else - r = a_.u16[imm8 & 7]; - #endif - - return HEDLEY_STATIC_CAST(int32_t, r); -} -#if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(4,6,0)) - #define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_extract_epi16(a, imm8) (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_s16(simde__m128i_to_private(a).neon_i16, (imm8))) & (INT32_C(0x0000ffff))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - 
#define simde_mm_extract_epi16(a, imm8) HEDLEY_STATIC_CAST(int32_t, wasm_u16x8_extract_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 7)) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_extract_epi16(a, imm8) simde_mm_extract_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_insert_epi16 (simde__m128i a, int16_t i, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - simde__m128i_private a_ = simde__m128i_to_private(a); - a_.i16[imm8 & 7] = i; - return simde__m128i_from_private(a_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - #define simde_mm_insert_epi16(a, i, imm8) _mm_insert_epi16((a), (i), (imm8)) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_insert_epi16(a, i, imm8) simde__m128i_from_neon_i16(vsetq_lane_s16((i), simde__m128i_to_neon_i16(a), (imm8))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_insert_epi16(a, i, imm8) wasm_i16x8_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 7, (i)) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_insert_epi16(a, i, imm8) simde_mm_insert_epi16(a, i, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_load_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_load_pd(mem_addr); - #else - simde__m128d_private r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vld1q_f64(mem_addr); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vld1q_u32(HEDLEY_REINTERPRET_CAST(uint32_t const*, mem_addr)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load(mem_addr); - #else - simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), sizeof(r_)); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_load_pd(mem_addr) simde_mm_load_pd(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_load1_pd (simde_float64 const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_load1_pd(mem_addr); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return simde__m128d_from_neon_f64(vld1q_dup_f64(mem_addr)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128d_from_wasm_v128(wasm_v128_load64_splat(mem_addr)); - #else - return simde_mm_set1_pd(*mem_addr); - #endif -} -#define simde_mm_load_pd1(mem_addr) simde_mm_load1_pd(mem_addr) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_load_pd1(mem_addr) simde_mm_load1_pd(mem_addr) - #define _mm_load1_pd(mem_addr) simde_mm_load1_pd(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_load_sd (simde_float64 const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_load_sd(mem_addr); - #else - simde__m128d_private r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vsetq_lane_f64(*mem_addr, vdupq_n_f64(0), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load64_zero(HEDLEY_REINTERPRET_CAST(const void*, mem_addr)); - #else - r_.f64[0] = *mem_addr; - r_.u64[1] = UINT64_C(0); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_load_sd(mem_addr) simde_mm_load_sd(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_load_si128 (simde__m128i const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_load_si128(HEDLEY_REINTERPRET_CAST(__m128i const*, mem_addr)); - #elif 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) - return vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr)); - #else - simde__m128i_private r_; - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_ld(0, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(int) const*, mem_addr)); - #else - simde_memcpy(&r_, SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), sizeof(simde__m128i)); - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_load_si128(mem_addr) simde_mm_load_si128(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_loadh_pd (simde__m128d a, simde_float64 const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadh_pd(a, mem_addr); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vcombine_f64(vget_low_f64(a_.neon_f64), vld1_f64(HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load64_lane(HEDLEY_REINTERPRET_CAST(const void*, mem_addr), a_.wasm_v128, 1); - #else - simde_float64 t; - - simde_memcpy(&t, mem_addr, sizeof(t)); - r_.f64[0] = a_.f64[0]; - r_.f64[1] = t; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_loadh_pd(a, mem_addr) simde_mm_loadh_pd(a, mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_loadl_epi64 (simde__m128i const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadl_epi64(mem_addr); - #else - simde__m128i_private r_; - - int64_t value; - simde_memcpy(&value, mem_addr, sizeof(value)); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vcombine_s64(vld1_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr)), vdup_n_s64(0)); - #else - r_.i64[0] = value; - r_.i64[1] = 0; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_loadl_epi64(mem_addr) simde_mm_loadl_epi64(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_loadl_pd (simde__m128d a, simde_float64 const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadl_pd(a, mem_addr); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vcombine_f64(vld1_f64( - HEDLEY_REINTERPRET_CAST(const float64_t*, mem_addr)), vget_high_f64(a_.neon_f64)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_load64_lane(HEDLEY_REINTERPRET_CAST(const void*, mem_addr), a_.wasm_v128, 0); - #else - r_.f64[0] = *mem_addr; - r_.u64[1] = a_.u64[1]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_loadl_pd(a, mem_addr) simde_mm_loadl_pd(a, mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_loadr_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadr_pd(mem_addr); - #else - simde__m128d_private - r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vld1q_f64(mem_addr); - r_.neon_f64 = vextq_f64(r_.neon_f64, r_.neon_f64, 1); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vld1q_s64(HEDLEY_REINTERPRET_CAST(int64_t const *, mem_addr)); - r_.neon_i64 = vextq_s64(r_.neon_i64, r_.neon_i64, 1); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t tmp = 
wasm_v128_load(mem_addr); - r_.wasm_v128 = wasm_i64x2_shuffle(tmp, tmp, 1, 0); - #else - r_.f64[0] = mem_addr[1]; - r_.f64[1] = mem_addr[0]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_loadr_pd(mem_addr) simde_mm_loadr_pd(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_loadu_pd (simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)]) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadu_pd(mem_addr); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vld1q_f64(mem_addr); - #else - simde__m128d_private r_; - - simde_memcpy(&r_, mem_addr, sizeof(r_)); - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_loadu_pd(mem_addr) simde_mm_loadu_pd(mem_addr) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) \ - && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - #define simde_mm_loadu_epi8(mem_addr) _mm_loadu_epi8(mem_addr) -#else -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_loadu_epi8(void const * mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr)); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr)); - #else - simde_memcpy(&r_, mem_addr, sizeof(r_)); - #endif - - return simde__m128i_from_private(r_); - #endif -} -#endif -#define simde_x_mm_loadu_epi8(mem_addr) simde_mm_loadu_epi8(mem_addr) -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) - #undef _mm_loadu_epi8 - #define _mm_loadu_epi8(a) simde_mm_loadu_epi8(a) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && defined(SIMDE_X86_AVX512BW_NATIVE) \ - && !defined(SIMDE_BUG_GCC_95483) && !defined(SIMDE_BUG_CLANG_REV_344862) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - #define simde_mm_loadu_epi16(mem_addr) _mm_loadu_epi16(mem_addr) -#else -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_loadu_epi16(void const * mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr)); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vreinterpretq_s16_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr))); - #else - simde_memcpy(&r_, mem_addr, sizeof(r_)); - #endif - - return simde__m128i_from_private(r_); - #endif -} -#endif -#define simde_x_mm_loadu_epi16(mem_addr) simde_mm_loadu_epi16(mem_addr) -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || defined(SIMDE_X86_AVX512BW_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) - #undef _mm_loadu_epi16 - #define _mm_loadu_epi16(a) simde_mm_loadu_epi16(a) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ - && !defined(SIMDE_BUG_CLANG_REV_344862) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - #define simde_mm_loadu_epi32(mem_addr) _mm_loadu_epi32(mem_addr) -#else -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_loadu_epi32(void const * mem_addr) { - #if 
defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr)); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vreinterpretq_s32_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr))); - #else - simde_memcpy(&r_, mem_addr, sizeof(r_)); - #endif - - return simde__m128i_from_private(r_); - #endif -} -#endif -#define simde_x_mm_loadu_epi32(mem_addr) simde_mm_loadu_epi32(mem_addr) -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) - #undef _mm_loadu_epi32 - #define _mm_loadu_epi32(a) simde_mm_loadu_epi32(a) -#endif - -#if defined(SIMDE_X86_AVX512VL_NATIVE) && !defined(SIMDE_BUG_GCC_95483) \ - && !defined(SIMDE_BUG_CLANG_REV_344862) \ - && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,20,0)) - #define simde_mm_loadu_epi64(mem_addr) _mm_loadu_epi64(mem_addr) -#else -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_loadu_epi64(void const * mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadu_si128(SIMDE_ALIGN_CAST(__m128i const *, mem_addr)); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vreinterpretq_s64_s8(vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr))); - #else - simde_memcpy(&r_, mem_addr, sizeof(r_)); - #endif - - return simde__m128i_from_private(r_); - #endif -} -#endif -#define simde_x_mm_loadu_epi64(mem_addr) simde_mm_loadu_epi64(mem_addr) -#if defined(SIMDE_X86_AVX512VL_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && (defined(SIMDE_BUG_GCC_95483) || defined(SIMDE_BUG_CLANG_REV_344862))) - #undef _mm_loadu_epi64 - #define _mm_loadu_epi64(a) simde_mm_loadu_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_loadu_si128 (void const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_loadu_si128(HEDLEY_STATIC_CAST(__m128i const*, mem_addr)); - #else - simde__m128i_private r_; - - #if HEDLEY_GNUC_HAS_ATTRIBUTE(may_alias,3,3,0) - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_PACKED_ - struct simde_mm_loadu_si128_s { - __typeof__(r_) v; - } __attribute__((__packed__, __may_alias__)); - r_ = HEDLEY_REINTERPRET_CAST(const struct simde_mm_loadu_si128_s *, mem_addr)->v; - HEDLEY_DIAGNOSTIC_POP - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vld1q_s8(HEDLEY_REINTERPRET_CAST(int8_t const*, mem_addr)); - #else - simde_memcpy(&r_, mem_addr, sizeof(r_)); - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_loadu_si128(mem_addr) simde_mm_loadu_si128(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_madd_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_madd_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - int32x4_t pl = vmull_s16(vget_low_s16(a_.neon_i16), vget_low_s16(b_.neon_i16)); - int32x4_t ph = vmull_high_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i32 = vpaddq_s32(pl, ph); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x4_t pl = vmull_s16(vget_low_s16(a_.neon_i16), vget_low_s16(b_.neon_i16)); - int32x4_t ph = vmull_s16(vget_high_s16(a_.neon_i16), vget_high_s16(b_.neon_i16)); - int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl)); - int32x2_t rh = 
vpadd_s32(vget_low_s32(ph), vget_high_s32(ph)); - r_.neon_i32 = vcombine_s32(rl, rh); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_msum(a_.altivec_i16, b_.altivec_i16, vec_splats(0)); - #elif defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = vec_mule(a_.altivec_i16, b_.altivec_i16) + vec_mulo(a_.altivec_i16, b_.altivec_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_dot_i16x8(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) - int32_t SIMDE_VECTOR(32) a32, b32, p32; - SIMDE_CONVERT_VECTOR_(a32, a_.i16); - SIMDE_CONVERT_VECTOR_(b32, b_.i16); - p32 = a32 * b32; - r_.i32 = - __builtin_shufflevector(p32, p32, 0, 2, 4, 6) + - __builtin_shufflevector(p32, p32, 1, 3, 5, 7); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i += 2) { - r_.i32[i / 2] = (a_.i16[i] * b_.i16[i]) + (a_.i16[i + 1] * b_.i16[i + 1]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_madd_epi16(a, b) simde_mm_madd_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_maskmoveu_si128 (simde__m128i a, simde__m128i mask, int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)]) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_maskmoveu_si128(a, mask, HEDLEY_REINTERPRET_CAST(char*, mem_addr)); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - mask_ = simde__m128i_to_private(mask); - - for (size_t i = 0 ; i < (sizeof(a_.i8) / sizeof(a_.i8[0])) ; i++) { - if (mask_.u8[i] & 0x80) { - mem_addr[i] = a_.i8[i]; - } - } - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_maskmoveu_si128(a, mask, mem_addr) simde_mm_maskmoveu_si128((a), (mask), SIMDE_CHECKED_REINTERPRET_CAST(int8_t*, char*, (mem_addr))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_movemask_epi8 (simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__INTEL_COMPILER) - /* ICC has trouble with _mm_movemask_epi8 at -O2 and above: */ - return _mm_movemask_epi8(a); - #else - int32_t r = 0; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - /* https://github.com/WebAssembly/simd/pull/201#issue-380682845 */ - static const uint8_t md[16] = { - 1 << 0, 1 << 1, 1 << 2, 1 << 3, - 1 << 4, 1 << 5, 1 << 6, 1 << 7, - 1 << 0, 1 << 1, 1 << 2, 1 << 3, - 1 << 4, 1 << 5, 1 << 6, 1 << 7, - }; - - /* Extend sign bit over entire lane */ - uint8x16_t extended = vreinterpretq_u8_s8(vshrq_n_s8(a_.neon_i8, 7)); - /* Clear all but the bit we're interested in. 
*/ - uint8x16_t masked = vandq_u8(vld1q_u8(md), extended); - /* Alternate bytes from low half and high half */ - uint8x8x2_t tmp = vzip_u8(vget_low_u8(masked), vget_high_u8(masked)); - uint16x8_t x = vreinterpretq_u16_u8(vcombine_u8(tmp.val[0], tmp.val[1])); - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r = vaddvq_u16(x); - #else - uint64x2_t t64 = vpaddlq_u32(vpaddlq_u16(x)); - r = - HEDLEY_STATIC_CAST(int32_t, vgetq_lane_u64(t64, 0)) + - HEDLEY_STATIC_CAST(int32_t, vgetq_lane_u64(t64, 1)); - #endif - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 }; - r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 1)); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(HEDLEY_IBM_VERSION) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_BIG) - static const SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) perm = { 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 0 }; - r = HEDLEY_STATIC_CAST(int32_t, vec_extract(vec_vbpermq(a_.altivec_u8, perm), 14)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = HEDLEY_STATIC_CAST(int32_t, wasm_i8x16_bitmask(a_.wasm_v128)); - #else - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u8) / sizeof(a_.u8[0])) ; i++) { - r |= (a_.u8[15 - i] >> 7) << (15 - i); - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_movemask_epi8(a) simde_mm_movemask_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_movemask_pd (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_movemask_pd(a); - #else - int32_t r = 0; - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - HEDLEY_DIAGNOSTIC_PUSH - SIMDE_DIAGNOSTIC_DISABLE_VECTOR_CONVERSION_ - uint64x2_t shifted = vshrq_n_u64(a_.neon_u64, 63); - r = - HEDLEY_STATIC_CAST(int32_t, vgetq_lane_u64(shifted, 0)) + - (HEDLEY_STATIC_CAST(int32_t, vgetq_lane_u64(shifted, 1)) << 1); - HEDLEY_DIAGNOSTIC_POP - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && defined(SIMDE_BUG_CLANG_50932) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) idx = { 64, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) res = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), vec_bperm(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned __int128), a_.altivec_u64), idx)); - r = HEDLEY_STATIC_CAST(int32_t, vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), res), 2)); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) idx = { 64, 0, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }; - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) res = vec_bperm(a_.altivec_u8, idx); - r = HEDLEY_STATIC_CAST(int32_t, vec_extract(HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed int), res), 2)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_bitmask(a_.wasm_v128)); - #else - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - r |= (a_.u64[i] >> 63) << i; - } - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_movemask_pd(a) simde_mm_movemask_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 
-simde_mm_movepi64_pi64 (simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_movepi64_pi64(a); - #else - simde__m64_private r_; - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i64 = vget_low_s64(a_.neon_i64); - #else - r_.i64[0] = a_.i64[0]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_movepi64_pi64(a) simde_mm_movepi64_pi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_movpi64_epi64 (simde__m64 a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_movpi64_epi64(a); - #else - simde__m128i_private r_; - simde__m64_private a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vcombine_s64(a_.neon_i64, vdup_n_s64(0)); - #else - r_.i64[0] = a_.i64[0]; - r_.i64[1] = 0; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_movpi64_epi64(a) simde_mm_movpi64_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_min_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_min_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vminq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_min(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i16 = vec_min(a_.altivec_i16, b_.altivec_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? a_.i16[i] : b_.i16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_min_epi16(a, b) simde_mm_min_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_min_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_min_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vminq_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u8x16_min(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u8 = vec_min(a_.altivec_u8, b_.altivec_u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? 
a_.u8[i] : b_.u8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_min_epu8(a, b) simde_mm_min_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_min_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_min_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_min(a_.altivec_f64, b_.altivec_f64); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vminq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_min(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (a_.f64[i] < b_.f64[i]) ? a_.f64[i] : b_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_min_pd(a, b) simde_mm_min_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_min_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_min_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_min_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_min_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - float64x2_t temp = vminq_f64(a_.neon_f64, b_.neon_f64); - r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1); - #else - r_.f64[0] = (a_.f64[0] < b_.f64[0]) ? a_.f64[0] : b_.f64[0]; - r_.f64[1] = a_.f64[1]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_min_sd(a, b) simde_mm_min_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_max_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_max_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vmaxq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_max(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i16 = vec_max(a_.altivec_i16, b_.altivec_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? 
a_.i16[i] : b_.i16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_max_epi16(a, b) simde_mm_max_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_max_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_max_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vmaxq_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u8x16_max(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u8 = vec_max(a_.altivec_u8, b_.altivec_u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? a_.u8[i] : b_.u8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_max_epu8(a, b) simde_mm_max_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_max_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_max_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_max(a_.altivec_f64, b_.altivec_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_max(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vmaxq_f64(a_.neon_f64, b_.neon_f64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = (a_.f64[i] > b_.f64[i]) ? a_.f64[i] : b_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_max_pd(a, b) simde_mm_max_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_max_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_max_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_max_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_max_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - float64x2_t temp = vmaxq_f64(a_.neon_f64, b_.neon_f64); - r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1); - #else - r_.f64[0] = (a_.f64[0] > b_.f64[0]) ? 
a_.f64[0] : b_.f64[0]; - r_.f64[1] = a_.f64[1]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_max_sd(a, b) simde_mm_max_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_move_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_move_epi64(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vsetq_lane_s64(0, a_.neon_i64, 1); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, wasm_i64x2_const(0, 0), 0, 2); - #else - r_.i64[0] = a_.i64[0]; - r_.i64[1] = 0; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_move_epi64(a) simde_mm_move_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mul_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_mul_epu32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x2_t a_lo = vmovn_u64(a_.neon_u64); - uint32x2_t b_lo = vmovn_u64(b_.neon_u64); - r_.neon_u64 = vmull_u32(a_lo, b_lo); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u64x2_extmul_low_u32x4( - wasm_i32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 2, 0, 2), - wasm_i32x4_shuffle(b_.wasm_v128, b_.wasm_v128, 0, 2, 0, 2)); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - __typeof__(a_.u32) z = { 0, }; - a_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 2, 6); - b_.u32 = SIMDE_SHUFFLE_VECTOR_(32, 16, b_.u32, z, 0, 4, 2, 6); - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u32) * - HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[i * 2]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[i * 2]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_mul_epu32(a, b) simde_mm_mul_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_mul_epi64 (simde__m128i a, simde__m128i b) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_mul(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 * b_.i64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] * b_.i64[i]; - } - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_mod_epi64 (simde__m128i a, simde__m128i b) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.i64 = a_.i64 % b_.i64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] % b_.i64[i]; - } - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mul_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_mul_pd(a, b); - #else - simde__m128d_private - r_, 
- a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 * b_.f64; - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vmulq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_mul(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[i] * b_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_mul_pd(a, b) simde_mm_mul_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_mul_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_mul_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_mul_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_mul_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - float64x2_t temp = vmulq_f64(a_.neon_f64, b_.neon_f64); - r_.neon_f64 = vsetq_lane_f64(vgetq_lane(a_.neon_f64, 1), temp, 1); - #else - r_.f64[0] = a_.f64[0] * b_.f64[0]; - r_.f64[1] = a_.f64[1]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_mul_sd(a, b) simde_mm_mul_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_mul_su32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) && !defined(__PGI) - return _mm_mul_su32(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.u64[0] = vget_lane_u64(vget_low_u64(vmull_u32(vreinterpret_u32_s64(a_.neon_i64), vreinterpret_u32_s64(b_.neon_i64))), 0); - #else - r_.u64[0] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[0]) * HEDLEY_STATIC_CAST(uint64_t, b_.u32[0]); - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_mul_su32(a, b) simde_mm_mul_su32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mulhi_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_mulhi_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x4_t a3210 = vget_low_s16(a_.neon_i16); - int16x4_t b3210 = vget_low_s16(b_.neon_i16); - int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - int32x4_t ab7654 = vmull_high_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i16 = vuzp2q_s16(vreinterpretq_s16_s32(ab3210), vreinterpretq_s16_s32(ab7654)); - #else - int16x4_t a7654 = vget_high_s16(a_.neon_i16); - int16x4_t b7654 = vget_high_s16(b_.neon_i16); - int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ - uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); - r_.neon_u16 = rv.val[1]; - #endif - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - const v128_t lo = wasm_i32x4_extmul_low_i16x8(a_.wasm_v128, b_.wasm_v128); - const v128_t hi = wasm_i32x4_extmul_high_i16x8(a_.wasm_v128, b_.wasm_v128); - 
r_.wasm_v128 = wasm_i16x8_shuffle(lo, hi, 1, 3, 5, 7, 9, 11, 13, 15); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (HEDLEY_STATIC_CAST(uint32_t, HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) >> 16)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_mulhi_epi16(a, b) simde_mm_mulhi_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mulhi_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - return _mm_mulhi_epu16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint16x4_t a3210 = vget_low_u16(a_.neon_u16); - uint16x4_t b3210 = vget_low_u16(b_.neon_u16); - uint32x4_t ab3210 = vmull_u16(a3210, b3210); /* 3333222211110000 */ - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint32x4_t ab7654 = vmull_high_u16(a_.neon_u16, b_.neon_u16); - r_.neon_u16 = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); - #else - uint16x4_t a7654 = vget_high_u16(a_.neon_u16); - uint16x4_t b7654 = vget_high_u16(b_.neon_u16); - uint32x4_t ab7654 = vmull_u16(a7654, b7654); /* 7777666655554444 */ - uint16x8x2_t neon_r = vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); - r_.neon_u16 = neon_r.val[1]; - #endif - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - const v128_t lo = wasm_u32x4_extmul_low_u16x8(a_.wasm_v128, b_.wasm_v128); - const v128_t hi = wasm_u32x4_extmul_high_u16x8(a_.wasm_v128, b_.wasm_v128); - r_.wasm_v128 = wasm_i16x8_shuffle(lo, hi, 1, 3, 5, 7, 9, 11, 13, 15); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i]) >> 16); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_mulhi_epu16(a, b) simde_mm_mulhi_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mullo_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_mullo_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vmulq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - (void) a_; - (void) b_; - r_.altivec_i16 = vec_mul(a_.altivec_i16, b_.altivec_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_mul(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, HEDLEY_STATIC_CAST(uint32_t, a_.u16[i]) * HEDLEY_STATIC_CAST(uint32_t, b_.u16[i])); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_mullo_epi16(a, b) simde_mm_mullo_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_or_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_or_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - 
r_.i32f = a_.i32f | b_.i32f; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vorrq_s64(a_.neon_i64, b_.neon_i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] | b_.i32f[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_or_pd(a, b) simde_mm_or_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_or_si128 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_or_si128(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vorrq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_or(a_.altivec_i32, b_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_or(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f | b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] | b_.i32f[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_or_si128(a, b) simde_mm_or_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_packs_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_packs_epi16(a, b); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i8 = vqmovn_high_s16(vqmovn_s16(a_.neon_i16), b_.neon_i16); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vcombine_s8(vqmovn_s16(a_.neon_i16), vqmovn_s16(b_.neon_i16)); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i8 = vec_packs(a_.altivec_i16, b_.altivec_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_narrow_i16x8(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) - int16_t SIMDE_VECTOR(32) v = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - const int16_t SIMDE_VECTOR(32) min = { INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN, INT8_MIN }; - const int16_t SIMDE_VECTOR(32) max = { INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX, INT8_MAX }; - - int16_t m SIMDE_VECTOR(32); - m = HEDLEY_REINTERPRET_CAST(__typeof__(m), v < min); - v = (v & ~m) | (min & m); - - m = v > max; - v = (v & ~m) | (max & m); - - SIMDE_CONVERT_VECTOR_(r_.i8, v); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - int16_t v = (i < (sizeof(a_.i16) / sizeof(a_.i16[0]))) ? a_.i16[i] : b_.i16[i & 7]; - r_.i8[i] = (v < INT8_MIN) ? INT8_MIN : ((v > INT8_MAX) ? 
INT8_MAX : HEDLEY_STATIC_CAST(int8_t, v)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_packs_epi16(a, b) simde_mm_packs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_packs_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_packs_epi32(a, b); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i16 = vqmovn_high_s32(vqmovn_s32(a_.neon_i32), b_.neon_i32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vcombine_s16(vqmovn_s32(a_.neon_i32), vqmovn_s32(b_.neon_i32)); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i16 = vec_packs(a_.altivec_i32, b_.altivec_i32); - #elif defined(SIMDE_X86_SSE2_NATIVE) - r_.sse_m128i = _mm_packs_epi32(a_.sse_m128i, b_.sse_m128i); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_narrow_i32x4(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) - int32_t SIMDE_VECTOR(32) v = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 0, 1, 2, 3, 4, 5, 6, 7); - const int32_t SIMDE_VECTOR(32) min = { INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN, INT16_MIN }; - const int32_t SIMDE_VECTOR(32) max = { INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX, INT16_MAX }; - - int32_t m SIMDE_VECTOR(32); - m = HEDLEY_REINTERPRET_CAST(__typeof__(m), v < min); - v = (v & ~m) | (min & m); - - m = HEDLEY_REINTERPRET_CAST(__typeof__(m), v > max); - v = (v & ~m) | (max & m); - - SIMDE_CONVERT_VECTOR_(r_.i16, v); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - int32_t v = (i < (sizeof(a_.i32) / sizeof(a_.i32[0]))) ? a_.i32[i] : b_.i32[i & 3]; - r_.i16[i] = (v < INT16_MIN) ? INT16_MIN : ((v > INT16_MAX) ? 
INT16_MAX : HEDLEY_STATIC_CAST(int16_t, v)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_packs_epi32(a, b) simde_mm_packs_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_packus_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_packus_epi16(a, b); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - #if defined(SIMDE_BUG_CLANG_46840) - r_.neon_u8 = vqmovun_high_s16(vreinterpret_s8_u8(vqmovun_s16(a_.neon_i16)), b_.neon_i16); - #else - r_.neon_u8 = vqmovun_high_s16(vqmovun_s16(a_.neon_i16), b_.neon_i16); - #endif - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = - vcombine_u8( - vqmovun_s16(a_.neon_i16), - vqmovun_s16(b_.neon_i16) - ); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_u8 = vec_packsu(a_.altivec_i16, b_.altivec_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u8x16_narrow_i16x8(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - int16_t v SIMDE_VECTOR(32) = SIMDE_SHUFFLE_VECTOR_(16, 32, a_.i16, b_.i16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); - - v &= ~(v >> 15); - v |= HEDLEY_REINTERPRET_CAST(__typeof__(v), v > UINT8_MAX); - - SIMDE_CONVERT_VECTOR_(r_.i8, v); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - int16_t v = (i < (sizeof(a_.i16) / sizeof(a_.i16[0]))) ? a_.i16[i] : b_.i16[i & 7]; - r_.u8[i] = (v < 0) ? UINT8_C(0) : ((v > UINT8_MAX) ? UINT8_MAX : HEDLEY_STATIC_CAST(uint8_t, v)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_packus_epi16(a, b) simde_mm_packus_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_pause (void) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_pause(); - #elif defined(SIMDE_ARCH_X86) - __asm__ __volatile__("pause"); - #elif defined(SIMDE_ARCH_ARM_NEON) - #if defined(_MSC_VER) - __isb(_ARM64_BARRIER_SY); - #else - __asm__ __volatile__("isb\n"); - #endif - #elif defined(SIMDE_ARCH_POWER) - __asm__ __volatile__ ("or 27,27,27" ::: "memory"); - #elif defined(SIMDE_ARCH_WASM) - __asm__ __volatile__ ("nop"); - #elif defined(HEDLEY_GCC_VERSION) - #if defined(SIMDE_ARCH_RISCV) - __builtin_riscv_pause(); - #else - __asm__ __volatile__ ("nop" ::: "memory"); - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_pause() (simde_mm_pause()) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sad_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sad_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const uint16x8_t t = vpaddlq_u8(vabdq_u8(a_.neon_u8, b_.neon_u8)); - r_.neon_u64 = vcombine_u64( - vpaddl_u32(vpaddl_u16(vget_low_u16(t))), - vpaddl_u32(vpaddl_u16(vget_high_u16(t)))); - #else - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - uint16_t tmp = 0; - SIMDE_VECTORIZE_REDUCTION(+:tmp) - for (size_t j = 0 ; j < ((sizeof(r_.u8) / sizeof(r_.u8[0])) / 2) ; j++) { - const size_t e = j + (i * 8); - tmp += (a_.u8[e] > b_.u8[e]) ? 
(a_.u8[e] - b_.u8[e]) : (b_.u8[e] - a_.u8[e]); - } - r_.i64[i] = tmp; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sad_epu8(a, b) simde_mm_sad_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12, - int8_t e11, int8_t e10, int8_t e9, int8_t e8, - int8_t e7, int8_t e6, int8_t e5, int8_t e4, - int8_t e3, int8_t e2, int8_t e1, int8_t e0) { - - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set_epi8( - e15, e14, e13, e12, e11, e10, e9, e8, - e7, e6, e5, e4, e3, e2, e1, e0); - #else - simde__m128i_private r_; - - #if defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_make( - e0, e1, e2, e3, e4, e5, e6, e7, - e8, e9, e10, e11, e12, e13, e14, e15); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_LIKE_16(int8x16_t) int8_t data[16] = { - e0, e1, e2, e3, - e4, e5, e6, e7, - e8, e9, e10, e11, - e12, e13, e14, e15}; - r_.neon_i8 = vld1q_s8(data); - #else - r_.i8[ 0] = e0; - r_.i8[ 1] = e1; - r_.i8[ 2] = e2; - r_.i8[ 3] = e3; - r_.i8[ 4] = e4; - r_.i8[ 5] = e5; - r_.i8[ 6] = e6; - r_.i8[ 7] = e7; - r_.i8[ 8] = e8; - r_.i8[ 9] = e9; - r_.i8[10] = e10; - r_.i8[11] = e11; - r_.i8[12] = e12; - r_.i8[13] = e13; - r_.i8[14] = e14; - r_.i8[15] = e15; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4, - int16_t e3, int16_t e2, int16_t e1, int16_t e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_LIKE_16(int16x8_t) int16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 }; - r_.neon_i16 = vld1q_s16(data); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_make(e0, e1, e2, e3, e4, e5, e6, e7); - #else - r_.i16[0] = e0; - r_.i16[1] = e1; - r_.i16[2] = e2; - r_.i16[3] = e3; - r_.i16[4] = e4; - r_.i16[5] = e5; - r_.i16[6] = e6; - r_.i16[7] = e7; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_loadu_si16 (void const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) && ( \ - SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(20,21,1) || \ - HEDLEY_GCC_VERSION_CHECK(12,1,0)) - return _mm_loadu_si16(mem_addr); - #else - int16_t val; - simde_memcpy(&val, mem_addr, sizeof(val)); - return simde_x_mm_cvtsi16_si128(val); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_loadu_si16(mem_addr) simde_mm_loadu_si16(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set_epi32(e3, e2, e1, e0); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_LIKE_16(int32x4_t) int32_t data[4] = { e0, e1, e2, e3 }; - r_.neon_i32 = vld1q_s32(data); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - 
r_.wasm_v128 = wasm_i32x4_make(e0, e1, e2, e3); - #else - r_.i32[0] = e0; - r_.i32[1] = e1; - r_.i32[2] = e2; - r_.i32[3] = e3; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set_epi32(e3, e2, e1, e0) simde_mm_set_epi32(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_loadu_si32 (void const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) && ( \ - SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(20,21,1) || \ - HEDLEY_GCC_VERSION_CHECK(12,1,0)) - return _mm_loadu_si32(mem_addr); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128i_from_wasm_v128(wasm_v128_load32_zero(mem_addr)); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - simde__m128i_private r_; - r_.neon_i32 = vsetq_lane_s32(* HEDLEY_REINTERPRET_CAST(const int32_t *, mem_addr), vdupq_n_s32(0), 0); - return simde__m128i_from_private(r_); - #else - int32_t val; - simde_memcpy(&val, mem_addr, sizeof(val)); - return simde_mm_cvtsi32_si128(val); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_loadu_si32(mem_addr) simde_mm_loadu_si32(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set_epi64 (simde__m64 e1, simde__m64 e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_set_epi64(e1, e0); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vcombine_s64(simde__m64_to_neon_i64(e0), simde__m64_to_neon_i64(e1)); - #else - r_.m64[0] = e0; - r_.m64[1] = e1; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set_epi64(e1, e0) (simde_mm_set_epi64((e1), (e0))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set_epi64x (int64_t e1, int64_t e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0)) - return _mm_set_epi64x(e1, e0); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_LIKE_16(int64x2_t) int64_t data[2] = {e0, e1}; - r_.neon_i64 = vld1q_s64(data); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_make(e0, e1); - #else - r_.i64[0] = e0; - r_.i64[1] = e1; - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set_epi64x(e1, e0) simde_mm_set_epi64x(e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_loadu_si64 (void const* mem_addr) { - #if defined(SIMDE_X86_SSE2_NATIVE) && ( \ - SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \ - HEDLEY_GCC_VERSION_CHECK(11,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(20,21,1)) - return _mm_loadu_si64(mem_addr); - #else - int64_t val; - simde_memcpy(&val, mem_addr, sizeof(val)); - return simde_mm_cvtsi64_si128(val); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_loadu_si64(mem_addr) simde_mm_loadu_si64(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_set_epu8 (uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12, - uint8_t e11, uint8_t e10, uint8_t e9, uint8_t e8, - uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4, - uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set_epi8( - HEDLEY_STATIC_CAST(char, e15), HEDLEY_STATIC_CAST(char, e14), HEDLEY_STATIC_CAST(char, e13), HEDLEY_STATIC_CAST(char, e12), - HEDLEY_STATIC_CAST(char, e11), 
HEDLEY_STATIC_CAST(char, e10), HEDLEY_STATIC_CAST(char, e9), HEDLEY_STATIC_CAST(char, e8), - HEDLEY_STATIC_CAST(char, e7), HEDLEY_STATIC_CAST(char, e6), HEDLEY_STATIC_CAST(char, e5), HEDLEY_STATIC_CAST(char, e4), - HEDLEY_STATIC_CAST(char, e3), HEDLEY_STATIC_CAST(char, e2), HEDLEY_STATIC_CAST(char, e1), HEDLEY_STATIC_CAST(char, e0)); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_LIKE_16(uint8x16_t) uint8_t data[16] = { - e0, e1, e2, e3, - e4, e5, e6, e7, - e8, e9, e10, e11, - e12, e13, e14, e15}; - r_.neon_u8 = vld1q_u8(data); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u8x16_make(e0, e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, e11, e12, e13, e14, e15); - #else - r_.u8[ 0] = e0; r_.u8[ 1] = e1; r_.u8[ 2] = e2; r_.u8[ 3] = e3; - r_.u8[ 4] = e4; r_.u8[ 5] = e5; r_.u8[ 6] = e6; r_.u8[ 7] = e7; - r_.u8[ 8] = e8; r_.u8[ 9] = e9; r_.u8[10] = e10; r_.u8[11] = e11; - r_.u8[12] = e12; r_.u8[13] = e13; r_.u8[14] = e14; r_.u8[15] = e15; - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_set_epu16 (uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4, - uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set_epi16( - HEDLEY_STATIC_CAST(short, e7), HEDLEY_STATIC_CAST(short, e6), HEDLEY_STATIC_CAST(short, e5), HEDLEY_STATIC_CAST(short, e4), - HEDLEY_STATIC_CAST(short, e3), HEDLEY_STATIC_CAST(short, e2), HEDLEY_STATIC_CAST(short, e1), HEDLEY_STATIC_CAST(short, e0)); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_LIKE_16(uint16x8_t) uint16_t data[8] = { e0, e1, e2, e3, e4, e5, e6, e7 }; - r_.neon_u16 = vld1q_u16(data); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_make(e0, e1, e2, e3, e4, e5, e6, e7); - #else - r_.u16[0] = e0; r_.u16[1] = e1; r_.u16[2] = e2; r_.u16[3] = e3; - r_.u16[4] = e4; r_.u16[5] = e5; r_.u16[6] = e6; r_.u16[7] = e7; - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_set_epu32 (uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set_epi32( - HEDLEY_STATIC_CAST(int, e3), HEDLEY_STATIC_CAST(int, e2), HEDLEY_STATIC_CAST(int, e1), HEDLEY_STATIC_CAST(int, e0)); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_LIKE_16(uint32x4_t) uint32_t data[4] = { e0, e1, e2, e3 }; - r_.neon_u32 = vld1q_u32(data); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u32x4_make(e0, e1, e2, e3); - #else - r_.u32[0] = e0; - r_.u32[1] = e1; - r_.u32[2] = e2; - r_.u32[3] = e3; - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_set_epu64x (uint64_t e1, uint64_t e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0)) - return _mm_set_epi64x(HEDLEY_STATIC_CAST(int64_t, e1), HEDLEY_STATIC_CAST(int64_t, e0)); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - SIMDE_ALIGN_LIKE_16(uint64x2_t) uint64_t data[2] = {e0, e1}; - r_.neon_u64 = vld1q_u64(data); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u64x2_make(e0, e1); - #else - r_.u64[0] = e0; - r_.u64[1] = e1; - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_set_sd (simde_float64 a) { - #if 
defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set_sd(a); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return vsetq_lane_f64(a, vdupq_n_f64(SIMDE_FLOAT64_C(0.0)), 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128d_from_wasm_v128(wasm_f64x2_make(a, 0)); - #else - return simde_mm_set_pd(SIMDE_FLOAT64_C(0.0), a); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set_sd(a) simde_mm_set_sd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set1_epi8 (int8_t a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set1_epi8(a); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vdupq_n_s8(a); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_splat(a); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = vec_splats(HEDLEY_STATIC_CAST(signed char, a)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set1_epi8(a) simde_mm_set1_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set1_epi16 (int16_t a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set1_epi16(a); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vdupq_n_s16(a); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_splat(a); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i16 = vec_splats(HEDLEY_STATIC_CAST(signed short, a)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set1_epi16(a) simde_mm_set1_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set1_epi32 (int32_t a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_set1_epi32(a); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vdupq_n_s32(a); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_splat(a); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = vec_splats(HEDLEY_STATIC_CAST(signed int, a)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set1_epi32(a) simde_mm_set1_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set1_epi64x (int64_t a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && (!defined(HEDLEY_MSVC_VERSION) || HEDLEY_MSVC_VERSION_CHECK(19,0,0)) - return _mm_set1_epi64x(a); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vdupq_n_s64(a); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_splat(a); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i64 = vec_splats(HEDLEY_STATIC_CAST(signed long long, a)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if 
defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set1_epi64x(a) simde_mm_set1_epi64x(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_set1_epi64 (simde__m64 a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_set1_epi64(a); - #else - simde__m64_private a_ = simde__m64_to_private(a); - return simde_mm_set1_epi64x(a_.i64[0]); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_set1_epi64(a) simde_mm_set1_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_set1_epu8 (uint8_t value) { - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - return simde__m128i_from_altivec_u8(vec_splats(HEDLEY_STATIC_CAST(unsigned char, value))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128i_from_wasm_v128(wasm_u8x16_splat(value)); - #else - return simde_mm_set1_epi8(HEDLEY_STATIC_CAST(int8_t, value)); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_set1_epu16 (uint16_t value) { - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - return simde__m128i_from_altivec_u16(vec_splats(HEDLEY_STATIC_CAST(unsigned short, value))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128i_from_wasm_v128(wasm_u16x8_splat(value)); - #else - return simde_mm_set1_epi16(HEDLEY_STATIC_CAST(int16_t, value)); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_set1_epu32 (uint32_t value) { - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - return simde__m128i_from_altivec_u32(vec_splats(HEDLEY_STATIC_CAST(unsigned int, value))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128i_from_wasm_v128(wasm_u32x4_splat(value)); - #else - return simde_mm_set1_epi32(HEDLEY_STATIC_CAST(int32_t, value)); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_set1_epu64 (uint64_t value) { - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - return simde__m128i_from_altivec_u64(vec_splats(HEDLEY_STATIC_CAST(unsigned long long, value))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128i_from_wasm_v128(wasm_u64x2_splat(value)); - #else - return simde_mm_set1_epi64x(HEDLEY_STATIC_CAST(int64_t, value)); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_setr_epi8 (int8_t e15, int8_t e14, int8_t e13, int8_t e12, - int8_t e11, int8_t e10, int8_t e9, int8_t e8, - int8_t e7, int8_t e6, int8_t e5, int8_t e4, - int8_t e3, int8_t e2, int8_t e1, int8_t e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_setr_epi8( - e15, e14, e13, e12, e11, e10, e9, e8, - e7, e6, e5, e4, e3, e2, e1, e0); - #else - return simde_mm_set_epi8( - e0, e1, e2, e3, e4, e5, e6, e7, - e8, e9, e10, e11, e12, e13, e14, e15); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_setr_epi16 (int16_t e7, int16_t e6, int16_t e5, int16_t e4, - int16_t e3, int16_t e2, int16_t e1, int16_t e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0); - #else - return simde_mm_set_epi16(e0, e1, e2, e3, e4, e5, e6, e7); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) simde_mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_setr_epi32 (int32_t e3, int32_t e2, int32_t e1, int32_t e0) { - 
#if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_setr_epi32(e3, e2, e1, e0); - #else - return simde_mm_set_epi32(e0, e1, e2, e3); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_setr_epi32(e3, e2, e1, e0) simde_mm_setr_epi32(e3, e2, e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_setr_epi64 (simde__m64 e1, simde__m64 e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_setr_epi64(e1, e0); - #else - return simde_mm_set_epi64(e0, e1); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_setr_epi64(e1, e0) (simde_mm_setr_epi64((e1), (e0))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_setr_pd (simde_float64 e1, simde_float64 e0) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_setr_pd(e1, e0); - #else - return simde_mm_set_pd(e0, e1); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_setr_pd(e1, e0) simde_mm_setr_pd(e1, e0) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_setzero_pd (void) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_setzero_pd(); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128d_from_wasm_v128(wasm_f64x2_const(0.0, 0.0)); - #else - return simde_mm_castsi128_pd(simde_mm_setzero_si128()); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_setzero_pd() simde_mm_setzero_pd() -#endif - -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_ -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_undefined_pd (void) { - simde__m128d_private r_; - - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128) - r_.n = _mm_undefined_pd(); - #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - r_ = simde__m128d_to_private(simde_mm_setzero_pd()); - #endif - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_undefined_pd() simde_mm_undefined_pd() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_undefined_si128 (void) { - simde__m128i_private r_; - - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128) - r_.n = _mm_undefined_si128(); - #elif !defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) - r_ = simde__m128i_to_private(simde_mm_setzero_si128()); - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_undefined_si128() (simde_mm_undefined_si128()) -#endif - -#if defined(SIMDE_DIAGNOSTIC_DISABLE_UNINITIALIZED_) -HEDLEY_DIAGNOSTIC_POP -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_setone_pd (void) { - return simde_mm_castps_pd(simde_x_mm_setone_ps()); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_setone_si128 (void) { - return simde_mm_castps_si128(simde_x_mm_setone_ps()); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_shuffle_epi32 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[(imm8 >> (i * 2)) & 3]; - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_shuffle_epi32(a, imm8) _mm_shuffle_epi32((a), (imm8)) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde_tmp_a_ = 
simde__m128i_to_private(a); \ - simde__m128i_from_wasm_v128( \ - wasm_i32x4_shuffle( \ - (simde_tmp_a_).wasm_v128, \ - (simde_tmp_a_).wasm_v128, \ - ((imm8) ) & 3, \ - ((imm8) >> 2) & 3, \ - ((imm8) >> 4) & 3, \ - ((imm8) >> 6) & 3)); })) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm_shuffle_epi32(a, imm8) \ - (__extension__ ({ \ - const int32x4_t simde_mm_shuffle_epi32_a_ = simde__m128i_to_neon_i32(a); \ - int32x4_t simde_mm_shuffle_epi32_r_; \ - simde_mm_shuffle_epi32_r_ = vmovq_n_s32(vgetq_lane_s32(simde_mm_shuffle_epi32_a_, (imm8) & (0x3))); \ - simde_mm_shuffle_epi32_r_ = vsetq_lane_s32(vgetq_lane_s32(simde_mm_shuffle_epi32_a_, ((imm8) >> 2) & 0x3), simde_mm_shuffle_epi32_r_, 1); \ - simde_mm_shuffle_epi32_r_ = vsetq_lane_s32(vgetq_lane_s32(simde_mm_shuffle_epi32_a_, ((imm8) >> 4) & 0x3), simde_mm_shuffle_epi32_r_, 2); \ - simde_mm_shuffle_epi32_r_ = vsetq_lane_s32(vgetq_lane_s32(simde_mm_shuffle_epi32_a_, ((imm8) >> 6) & 0x3), simde_mm_shuffle_epi32_r_, 3); \ - vreinterpretq_s64_s32(simde_mm_shuffle_epi32_r_); \ - })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_shuffle_epi32(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ - simde__m128i_from_private((simde__m128i_private) { .i32 = \ - SIMDE_SHUFFLE_VECTOR_(32, 16, \ - (simde_tmp_a_).i32, \ - (simde_tmp_a_).i32, \ - ((imm8) ) & 3, \ - ((imm8) >> 2) & 3, \ - ((imm8) >> 4) & 3, \ - ((imm8) >> 6) & 3) }); })) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_shuffle_epi32(a, imm8) simde_mm_shuffle_epi32(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_shuffle_pd (simde__m128d a, simde__m128d b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - r_.f64[0] = ((imm8 & 1) == 0) ? a_.f64[0] : a_.f64[1]; - r_.f64[1] = ((imm8 & 2) == 0) ? 
b_.f64[0] : b_.f64[1]; - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) && !defined(__PGI) - #define simde_mm_shuffle_pd(a, b, imm8) _mm_shuffle_pd((a), (b), (imm8)) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_shuffle_pd(a, b, imm8) (__extension__ ({ \ - simde__m128d_from_private((simde__m128d_private) { .f64 = \ - SIMDE_SHUFFLE_VECTOR_(64, 16, \ - simde__m128d_to_private(a).f64, \ - simde__m128d_to_private(b).f64, \ - (((imm8) ) & 1), \ - (((imm8) >> 1) & 1) + 2) }); })) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_shuffle_pd(a, b, imm8) simde_mm_shuffle_pd(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_shufflehi_epi16 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i++) { - r_.i16[i] = a_.i16[i]; - } - for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4]; - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_shufflehi_epi16(a, imm8) _mm_shufflehi_epi16((a), (imm8)) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm_shufflehi_epi16(a, imm8) \ - (__extension__ ({ \ - int16x8_t simde_mm_shufflehi_epi16_a_ = simde__m128i_to_neon_i16(a); \ - int16x8_t simde_mm_shufflehi_epi16_r_ = simde_mm_shufflehi_epi16_a_; \ - simde_mm_shufflehi_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflehi_epi16_a_, (((imm8) ) & 0x3) + 4), simde_mm_shufflehi_epi16_r_, 4); \ - simde_mm_shufflehi_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflehi_epi16_a_, (((imm8) >> 2) & 0x3) + 4), simde_mm_shufflehi_epi16_r_, 5); \ - simde_mm_shufflehi_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflehi_epi16_a_, (((imm8) >> 4) & 0x3) + 4), simde_mm_shufflehi_epi16_r_, 6); \ - simde_mm_shufflehi_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflehi_epi16_a_, (((imm8) >> 6) & 0x3) + 4), simde_mm_shufflehi_epi16_r_, 7); \ - simde__m128i_from_neon_i16(simde_mm_shufflehi_epi16_r_); \ - })) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ - simde__m128i_from_private((simde__m128i_private) { .wasm_v128 = \ - wasm_i16x8_shuffle( \ - (simde_tmp_a_).wasm_v128, \ - (simde_tmp_a_).wasm_v128, \ - 0, 1, 2, 3, \ - (((imm8) ) & 3) + 4, \ - (((imm8) >> 2) & 3) + 4, \ - (((imm8) >> 4) & 3) + 4, \ - (((imm8) >> 6) & 3) + 4) }); })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_shufflehi_epi16(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ - simde__m128i_from_private((simde__m128i_private) { .i16 = \ - SIMDE_SHUFFLE_VECTOR_(16, 16, \ - (simde_tmp_a_).i16, \ - (simde_tmp_a_).i16, \ - 0, 1, 2, 3, \ - (((imm8) ) & 3) + 4, \ - (((imm8) >> 2) & 3) + 4, \ - (((imm8) >> 4) & 3) + 4, \ - (((imm8) >> 6) & 3) + 4) }); })) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_shufflehi_epi16(a, imm8) simde_mm_shufflehi_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_shufflelo_epi16 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i_private - r_, - a_ = 
simde__m128i_to_private(a); - - for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) { - r_.i16[i] = a_.i16[((imm8 >> (i * 2)) & 3)]; - } - SIMDE_VECTORIZE - for (size_t i = ((sizeof(a_.i16) / sizeof(a_.i16[0])) / 2) ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i]; - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_shufflelo_epi16(a, imm8) _mm_shufflelo_epi16((a), (imm8)) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_shufflelo_epi16(a, imm8) \ - simde__m128i_from_wasm_v128( \ - wasm_i16x8_shuffle( \ - simde__m128i_to_wasm_v128((a)), \ - wasm_i16x8_splat(0), \ - (((imm8) & 0x03) ), \ - (((imm8) & 0x0c) >> 2), \ - (((imm8) & 0x30) >> 4), \ - (((imm8) & 0xc0) >> 6), \ - 4, 5, 6, 7)) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) && defined(SIMDE_STATEMENT_EXPR_) - #define simde_mm_shufflelo_epi16(a, imm8) \ - (__extension__({ \ - int16x8_t simde_mm_shufflelo_epi16_a_ = simde__m128i_to_neon_i16(a); \ - int16x8_t simde_mm_shufflelo_epi16_r_ = simde_mm_shufflelo_epi16_a_; \ - simde_mm_shufflelo_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflelo_epi16_a_, (((imm8) ) & 0x3)), simde_mm_shufflelo_epi16_r_, 0); \ - simde_mm_shufflelo_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflelo_epi16_a_, (((imm8) >> 2) & 0x3)), simde_mm_shufflelo_epi16_r_, 1); \ - simde_mm_shufflelo_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflelo_epi16_a_, (((imm8) >> 4) & 0x3)), simde_mm_shufflelo_epi16_r_, 2); \ - simde_mm_shufflelo_epi16_r_ = vsetq_lane_s16(vgetq_lane_s16(simde_mm_shufflelo_epi16_a_, (((imm8) >> 6) & 0x3)), simde_mm_shufflelo_epi16_r_, 3); \ - simde__m128i_from_neon_i16(simde_mm_shufflelo_epi16_r_); \ - })) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_shufflelo_epi16(a, imm8) (__extension__ ({ \ - const simde__m128i_private simde_tmp_a_ = simde__m128i_to_private(a); \ - simde__m128i_from_private((simde__m128i_private) { .i16 = \ - SIMDE_SHUFFLE_VECTOR_(16, 16, \ - (simde_tmp_a_).i16, \ - (simde_tmp_a_).i16, \ - (((imm8) ) & 3), \ - (((imm8) >> 2) & 3), \ - (((imm8) >> 4) & 3), \ - (((imm8) >> 6) & 3), \ - 4, 5, 6, 7) }); })) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_shufflelo_epi16(a, imm8) simde_mm_shufflelo_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sll_epi16 (simde__m128i a, simde__m128i count) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sll_epi16(a, count); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - count_ = simde__m128i_to_private(count); - - if (count_.u64[0] > 15) - return simde_mm_setzero_si128(); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = (a_.u16 << count_.u64[0]); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, count_.u64[0]))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 16) ? 
wasm_i16x8_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i16x8_const(0,0,0,0,0,0,0,0)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.u16[i] << count_.u64[0])); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sll_epi16(a, count) simde_mm_sll_epi16((a), (count)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sll_epi32 (simde__m128i a, simde__m128i count) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sll_epi32(a, count); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - count_ = simde__m128i_to_private(count); - - if (count_.u64[0] > 31) - return simde_mm_setzero_si128(); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = (a_.u32 << count_.u64[0]); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, count_.u64[0]))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = ((wasm_i64x2_extract_lane(count_.wasm_v128, 0) < 32) ? wasm_i32x4_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(int32_t, wasm_i64x2_extract_lane(count_.wasm_v128, 0))) : wasm_i32x4_const(0,0,0,0)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a_.u32[i] << count_.u64[0])); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sll_epi32(a, count) (simde_mm_sll_epi32(a, (count))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sll_epi64 (simde__m128i a, simde__m128i count) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sll_epi64(a, count); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - count_ = simde__m128i_to_private(count); - - if (count_.u64[0] > 63) - return simde_mm_setzero_si128(); - - const int_fast16_t s = HEDLEY_STATIC_CAST(int_fast16_t, count_.u64[0]); - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, s))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = (s < 64) ? 
wasm_i64x2_shl(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, s)) : wasm_i64x2_const(0,0); - #else - #if !defined(SIMDE_BUG_GCC_94488) - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] << s; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sll_epi64(a, count) (simde_mm_sll_epi64(a, (count))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_sqrt_pd (simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sqrt_pd(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vsqrtq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_sqrt(a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = vec_sqrt(a_.altivec_f64); - #elif defined(simde_math_sqrt) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_sqrt(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sqrt_pd(a) simde_mm_sqrt_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_sqrt_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sqrt_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_sqrt_pd(b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_sqrt_pd(simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(simde_math_sqrt) - r_.f64[0] = simde_math_sqrt(b_.f64[0]); - r_.f64[1] = a_.f64[1]; - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sqrt_sd(a, b) simde_mm_sqrt_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srl_epi16 (simde__m128i a, simde__m128i count) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_srl_epi16(a, count); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - count_ = simde__m128i_to_private(count); - - const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 16 ? 16 : count_.i64[0])); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vshlq_u16(a_.neon_u16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = a_.u16[i] >> cnt; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_srl_epi16(a, count) (simde_mm_srl_epi16(a, (count))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srl_epi32 (simde__m128i a, simde__m128i count) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_srl_epi32(a, count); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - count_ = simde__m128i_to_private(count); - - const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 32 ? 
32 : count_.i64[0])); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vshlq_u32(a_.neon_u32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u32x4_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] >> cnt; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_srl_epi32(a, count) (simde_mm_srl_epi32(a, (count))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srl_epi64 (simde__m128i a, simde__m128i count) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_srl_epi64(a, count); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - count_ = simde__m128i_to_private(count); - - const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 64 ? 64 : count_.i64[0])); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(HEDLEY_STATIC_CAST(int64_t, -cnt))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u64x2_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); - #else - #if !defined(SIMDE_BUG_GCC_94488) - SIMDE_VECTORIZE - #endif - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] >> cnt; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_srl_epi64(a, count) (simde_mm_srl_epi64(a, (count))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srai_epi16 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - /* MSVC requires a range of (0, 255). */ - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - const int cnt = (imm8 & ~15) ? 15 : imm8; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] >> cnt; - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_srai_epi16(a, imm8) _mm_srai_epi16((a), (imm8)) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_srai_epi16(a, imm8) simde_mm_srai_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srai_epi32 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_RANGE(imm8, 0, 255) { - /* MSVC requires a range of (0, 255). */ - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - const int cnt = (imm8 & ~31) ? 
31 : imm8; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(-cnt)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] >> cnt; - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_srai_epi32(a, imm8) _mm_srai_epi32((a), (imm8)) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_srai_epi32(a, imm8) simde_mm_srai_epi32(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sra_epi16 (simde__m128i a, simde__m128i count) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sra_epi16(a, count); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - count_ = simde__m128i_to_private(count); - - const int cnt = HEDLEY_STATIC_CAST(int, (count_.i64[0] > 15 ? 15 : count_.i64[0])); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vshlq_s16(a_.neon_i16, vdupq_n_s16(HEDLEY_STATIC_CAST(int16_t, -cnt))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] >> cnt; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sra_epi16(a, count) (simde_mm_sra_epi16(a, count)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sra_epi32 (simde__m128i a, simde__m128i count) { - #if defined(SIMDE_X86_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32) - return _mm_sra_epi32(a, count); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - count_ = simde__m128i_to_private(count); - - const int cnt = count_.u64[0] > 31 ? 31 : HEDLEY_STATIC_CAST(int, count_.u64[0]); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vshlq_s32(a_.neon_i32, vdupq_n_s32(HEDLEY_STATIC_CAST(int32_t, -cnt))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shr(a_.wasm_v128, HEDLEY_STATIC_CAST(uint32_t, cnt)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] >> cnt; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sra_epi32(a, count) (simde_mm_sra_epi32(a, (count))) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_slli_epi16 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - if (HEDLEY_UNLIKELY((imm8 > 15))) { - return simde_mm_setzero_si128(); - } - - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = a_.i16 << SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff); - #else - const int s = (imm8 > HEDLEY_STATIC_CAST(int, sizeof(r_.i16[0]) * CHAR_BIT) - 1) ? 0 : imm8; - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << s); - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_slli_epi16(a, imm8) _mm_slli_epi16(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_slli_epi16(a, imm8) \ - (((imm8) <= 0) ? 
\ - (a) : \ - simde__m128i_from_neon_i16( \ - ((imm8) > 15) ? \ - vandq_s16(simde__m128i_to_neon_i16(a), vdupq_n_s16(0)) : \ - vshlq_n_s16(simde__m128i_to_neon_i16(a), ((imm8) & 15)))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_slli_epi16(a, imm8) \ - ((imm8 < 16) ? wasm_i16x8_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i16x8_const(0,0,0,0,0,0,0,0)) -#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #define simde_mm_slli_epi16(a, imm8) \ - ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sl(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8))))) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_slli_epi16(a, imm8) simde_mm_slli_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_slli_epi32 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - if (HEDLEY_UNLIKELY((imm8 > 31))) { - return simde_mm_setzero_si128(); - } - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = a_.i32 << imm8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] << (imm8 & 0xff); - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_slli_epi32(a, imm8) _mm_slli_epi32(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_slli_epi32(a, imm8) \ - (((imm8) <= 0) ? \ - (a) : \ - simde__m128i_from_neon_i32( \ - ((imm8) > 31) ? \ - vandq_s32(simde__m128i_to_neon_i32(a), vdupq_n_s32(0)) : \ - vshlq_n_s32(simde__m128i_to_neon_i32(a), ((imm8) & 31)))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_slli_epi32(a, imm8) \ - ((imm8 < 32) ? wasm_i32x4_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i32x4_const(0,0,0,0)) -#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #define simde_mm_slli_epi32(a, imm8) \ - (__extension__ ({ \ - simde__m128i ret; \ - if ((imm8) <= 0) { \ - ret = a; \ - } else if ((imm8) > 31) { \ - ret = simde_mm_setzero_si128(); \ - } else { \ - ret = simde__m128i_from_altivec_i32( \ - vec_sl(simde__m128i_to_altivec_i32(a), \ - vec_splats(HEDLEY_STATIC_CAST(unsigned int, (imm8) & 31)))); \ - } \ - ret; \ - })) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_slli_epi32(a, imm8) simde_mm_slli_epi32(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_slli_epi64 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - if (HEDLEY_UNLIKELY((imm8 > 63))) { - return simde_mm_setzero_si128(); - } - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i64 = a_.i64 << imm8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] << (imm8 & 0xff); - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_slli_epi64(a, imm8) _mm_slli_epi64(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_slli_epi64(a, imm8) \ - (((imm8) <= 0) ? \ - (a) : \ - simde__m128i_from_neon_i64( \ - ((imm8) > 63) ? \ - vandq_s64(simde__m128i_to_neon_i64(a), vdupq_n_s64(0)) : \ - vshlq_n_s64(simde__m128i_to_neon_i64(a), ((imm8) & 63)))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_slli_epi64(a, imm8) \ - ((imm8 < 64) ? 
wasm_i64x2_shl(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0)) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_slli_epi64(a, imm8) simde_mm_slli_epi64(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srli_epi16 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - if (HEDLEY_UNLIKELY((imm8 > 15))) { - return simde_mm_setzero_si128(); - } - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = a_.u16 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.u16[i] = a_.u16[i] >> (imm8 & 0xff); - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_srli_epi16(a, imm8) _mm_srli_epi16(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_srli_epi16(a, imm8) \ - (((imm8) <= 0) ? \ - (a) : \ - simde__m128i_from_neon_u16( \ - ((imm8) > 15) ? \ - vandq_u16(simde__m128i_to_neon_u16(a), vdupq_n_u16(0)) : \ - vshrq_n_u16(simde__m128i_to_neon_u16(a), ((imm8) & 15) | (((imm8) & 15) == 0)))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_srli_epi16(a, imm8) \ - ((imm8 < 16) ? wasm_u16x8_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i16x8_const(0,0,0,0,0,0,0,0)) -#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #define simde_mm_srli_epi16(a, imm8) \ - ((imm8 & ~15) ? simde_mm_setzero_si128() : simde__m128i_from_altivec_i16(vec_sr(simde__m128i_to_altivec_i16(a), vec_splat_u16(HEDLEY_STATIC_CAST(unsigned short, imm8))))) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_srli_epi16(a, imm8) simde_mm_srli_epi16(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srli_epi32 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - if (HEDLEY_UNLIKELY((imm8 > 31))) { - return simde_mm_setzero_si128(); - } - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = a_.u32 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8 & 0xff); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.u32[i] = a_.u32[i] >> (imm8 & 0xff); - } - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_srli_epi32(a, imm8) _mm_srli_epi32(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_srli_epi32(a, imm8) \ - (((imm8) <= 0) ? \ - (a) : \ - simde__m128i_from_neon_u32( \ - ((imm8) > 31) ? \ - vandq_u32(simde__m128i_to_neon_u32(a), vdupq_n_u32(0)) : \ - vshrq_n_u32(simde__m128i_to_neon_u32(a), ((imm8) & 31) | (((imm8) & 31) == 0)))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_srli_epi32(a, imm8) \ - ((imm8 < 32) ? 
wasm_u32x4_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i32x4_const(0,0,0,0)) -#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #define simde_mm_srli_epi32(a, imm8) \ - (__extension__ ({ \ - simde__m128i ret; \ - if ((imm8) <= 0) { \ - ret = a; \ - } else if ((imm8) > 31) { \ - ret = simde_mm_setzero_si128(); \ - } else { \ - ret = simde__m128i_from_altivec_i32( \ - vec_sr(simde__m128i_to_altivec_i32(a), \ - vec_splats(HEDLEY_STATIC_CAST(unsigned int, (imm8) & 31)))); \ - } \ - ret; \ - })) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_srli_epi32(a, imm8) simde_mm_srli_epi32(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_srli_epi64 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - if (HEDLEY_UNLIKELY((imm8 & 63) != imm8)) - return simde_mm_setzero_si128(); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u64 = vshlq_u64(a_.neon_u64, vdupq_n_s64(-imm8)); - #else - #if defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_94488) - r_.u64 = a_.u64 >> SIMDE_CAST_VECTOR_SHIFT_COUNT(8, imm8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.u64[i] = a_.u64[i] >> imm8; - } - #endif - #endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE2_NATIVE) - #define simde_mm_srli_epi64(a, imm8) _mm_srli_epi64(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_srli_epi64(a, imm8) \ - (((imm8) <= 0) ? \ - (a) : \ - simde__m128i_from_neon_u64( \ - ((imm8) > 63) ? \ - vandq_u64(simde__m128i_to_neon_u64(a), vdupq_n_u64(0)) : \ - vshrq_n_u64(simde__m128i_to_neon_u64(a), ((imm8) & 63) | (((imm8) & 63) == 0)))) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_srli_epi64(a, imm8) \ - ((imm8 < 64) ? 
wasm_u64x2_shr(simde__m128i_to_private(a).wasm_v128, imm8) : wasm_i64x2_const(0,0)) -#endif -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_srli_epi64(a, imm8) simde_mm_srli_epi64(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_store_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_store_pd(mem_addr, a); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), simde__m128d_to_private(a).neon_i64); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128d), &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_store_pd(mem_addr, a) simde_mm_store_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_store1_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_store1_pd(mem_addr, a); - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - vst1q_f64(mem_addr, vdupq_laneq_f64(a_.neon_f64, 0)); - #else - mem_addr[0] = a_.f64[0]; - mem_addr[1] = a_.f64[0]; - #endif - #endif -} -#define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_store1_pd(mem_addr, a) simde_mm_store1_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) - #define _mm_store_pd1(mem_addr, a) simde_mm_store_pd1(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_store_sd (simde_float64* mem_addr, simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_store_sd(mem_addr, a); - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - const simde_float64 v = vgetq_lane_f64(a_.neon_f64, 0); - simde_memcpy(mem_addr, &v, sizeof(v)); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - const int64_t v = vgetq_lane_s64(a_.neon_i64, 0); - simde_memcpy(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), &v, sizeof(v)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 0); - #else - simde_float64 v = a_.f64[0]; - simde_memcpy(mem_addr, &v, sizeof(simde_float64)); - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_store_sd(mem_addr, a) simde_mm_store_sd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_store_si128 (simde__m128i* mem_addr, simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_store_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a); - #else - simde__m128i_private a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1q_s32(HEDLEY_REINTERPRET_CAST(int32_t*, mem_addr), a_.neon_i32); - #else - simde_memcpy(SIMDE_ALIGN_ASSUME_LIKE(mem_addr, simde__m128i), &a_, sizeof(a_)); - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_store_si128(mem_addr, a) simde_mm_store_si128(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void - simde_mm_storeh_pd (simde_float64* mem_addr, simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_storeh_pd(mem_addr, a); - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if 
defined(SIMDE_ARM_NEON_A64V8_NATIVE) - *mem_addr = vgetq_lane_f64(a_.neon_f64, 1); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), a_.wasm_v128, 1); - #else - *mem_addr = a_.f64[1]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_storeh_pd(mem_addr, a) simde_mm_storeh_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storel_epi64 (simde__m128i* mem_addr, simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_storel_epi64(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a); - #else - simde__m128i_private a_ = simde__m128i_to_private(a); - int64_t tmp; - - /* memcpy to prevent aliasing, tmp because we can't take the - * address of a vector element. */ - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - tmp = vgetq_lane_s64(a_.neon_i64, 0); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - #if defined(SIMDE_BUG_GCC_95227) - (void) a_; - #endif - tmp = vec_extract(a_.altivec_i64, 0); - #else - tmp = a_.i64[0]; - #endif - - simde_memcpy(mem_addr, &tmp, sizeof(tmp)); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_storel_epi64(mem_addr, a) simde_mm_storel_epi64(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storel_pd (simde_float64* mem_addr, simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_storel_pd(mem_addr, a); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - wasm_v128_store64_lane(HEDLEY_REINTERPRET_CAST(void*, mem_addr), simde__m128d_to_wasm_v128(a), 0); - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - - simde_float64 tmp; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - tmp = vgetq_lane_f64(a_.neon_f64, 0); - #else - tmp = a_.f64[0]; - #endif - simde_memcpy(mem_addr, &tmp, sizeof(tmp)); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_storel_pd(mem_addr, a) simde_mm_storel_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storer_pd (simde_float64 mem_addr[2], simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_storer_pd(mem_addr, a); - #else - simde__m128d_private a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1q_s64(HEDLEY_REINTERPRET_CAST(int64_t*, mem_addr), vextq_s64(a_.neon_i64, a_.neon_i64, 1)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - a_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, a_.wasm_v128, 1, 0); - simde_mm_store_pd(mem_addr, simde__m128d_from_private(a_)); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - a_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 1, 0); - simde_mm_store_pd(mem_addr, simde__m128d_from_private(a_)); - #else - mem_addr[0] = a_.f64[1]; - mem_addr[1] = a_.f64[0]; - #endif - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_storer_pd(mem_addr, a) simde_mm_storer_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storeu_pd (simde_float64* mem_addr, simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_storeu_pd(mem_addr, a); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - vst1q_f64(mem_addr, simde__m128d_to_private(a).neon_f64); - #else - simde_memcpy(mem_addr, &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_storeu_pd(mem_addr, a) simde_mm_storeu_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storeu_si128 (void* mem_addr, simde__m128i a) { - #if 
defined(SIMDE_X86_SSE2_NATIVE) - _mm_storeu_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a); - #else - simde_memcpy(mem_addr, &a, sizeof(a)); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_storeu_si128(mem_addr, a) simde_mm_storeu_si128(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storeu_si16 (void* mem_addr, simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && ( \ - SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \ - HEDLEY_GCC_VERSION_CHECK(11,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(20,21,1)) - _mm_storeu_si16(mem_addr, a); - #else - int16_t val = simde_x_mm_cvtsi128_si16(a); - simde_memcpy(mem_addr, &val, sizeof(val)); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_storeu_si16(mem_addr, a) simde_mm_storeu_si16(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storeu_si32 (void* mem_addr, simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && ( \ - SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \ - HEDLEY_GCC_VERSION_CHECK(11,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(20,21,1)) - _mm_storeu_si32(mem_addr, a); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - wasm_v128_store32_lane(mem_addr, simde__m128i_to_wasm_v128(a), 0); - #else - int32_t val = simde_mm_cvtsi128_si32(a); - simde_memcpy(mem_addr, &val, sizeof(val)); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_storeu_si32(mem_addr, a) simde_mm_storeu_si32(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_storeu_si64 (void* mem_addr, simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && ( \ - SIMDE_DETECT_CLANG_VERSION_CHECK(8,0,0) || \ - HEDLEY_GCC_VERSION_CHECK(11,0,0) || \ - HEDLEY_INTEL_VERSION_CHECK(20,21,1)) - _mm_storeu_si64(mem_addr, a); - #else - int64_t val = simde_mm_cvtsi128_si64(a); - simde_memcpy(mem_addr, &val, sizeof(val)); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_storeu_si64(mem_addr, a) simde_mm_storeu_si64(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_stream_pd (simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_stream_pd(mem_addr, a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ - defined(SIMDE_VECTOR_SUBSCRIPT) || defined(SIMDE_ARM_NEON_A64V8_NATIVE) || \ - defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || \ - defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) - __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); - #else - simde_mm_store_pd(mem_addr, a); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_stream_pd(mem_addr, a) simde_mm_stream_pd(HEDLEY_REINTERPRET_CAST(double*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_stream_si128 (simde__m128i* mem_addr, simde__m128i a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) - _mm_stream_si128(HEDLEY_STATIC_CAST(__m128i*, mem_addr), a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) && ( \ - defined(SIMDE_VECTOR_SUBSCRIPT) || defined(SIMDE_ARM_NEON_A32V7_NATIVE) || \ - defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ - defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) - __builtin_nontemporal_store(a, SIMDE_ALIGN_CAST(__typeof__(a)*, mem_addr)); - #else - simde_mm_store_si128(mem_addr, a); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_stream_si128(mem_addr, a) simde_mm_stream_si128(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES 
-void -simde_mm_stream_si32 (int32_t* mem_addr, int32_t a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_stream_si32(mem_addr, a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) - __builtin_nontemporal_store(a, mem_addr); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1q_lane_s32(mem_addr, vdupq_n_s32(a), 0); - #else - *mem_addr = a; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_stream_si32(mem_addr, a) simde_mm_stream_si32(mem_addr, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_stream_si64 (int64_t* mem_addr, int64_t a) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) && !defined(HEDLEY_MSVC_VERSION) - _mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(long long int*, int64_t*, mem_addr), a); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_store) - __builtin_nontemporal_store(a, mem_addr); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - vst1_s64(mem_addr, vdup_n_s64(a)); - #else - *mem_addr = a; - #endif -} -#define simde_mm_stream_si64x(mem_addr, a) simde_mm_stream_si64(mem_addr, a) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #define _mm_stream_si64(mem_addr, a) simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(int64_t*, __int64*, mem_addr), a) - #define _mm_stream_si64x(mem_addr, a) simde_mm_stream_si64(SIMDE_CHECKED_REINTERPRET_CAST(int64_t*, __int64*, mem_addr), a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sub_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sub_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vsubq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_sub(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = a_.i8 - b_.i8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] - b_.i8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sub_epi8(a, b) simde_mm_sub_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sub_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sub_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vsubq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_sub(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 - b_.i16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] - b_.i16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sub_epi16(a, b) simde_mm_sub_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sub_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sub_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vsubq_s32(a_.neon_i32, b_.neon_i32); - #elif 
defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_sub(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 - b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] - b_.i32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sub_epi32(a, b) simde_mm_sub_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sub_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sub_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vsubq_s64(a_.neon_i64, b_.neon_i64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_sub(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 - b_.i64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] - b_.i64[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sub_epi64(a, b) simde_mm_sub_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_sub_epu32 (simde__m128i a, simde__m128i b) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = a_.u32 - b_.u32; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vsubq_u32(a_.neon_u32, b_.neon_u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] - b_.u32[i]; - } - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_sub_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sub_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.f64 = a_.f64 - b_.f64; - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vsubq_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_sub(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = a_.f64[i] - b_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sub_pd(a, b) simde_mm_sub_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_sub_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_sub_sd(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_sd(a, simde_mm_sub_pd(a, b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_sd(a, simde_mm_sub_pd(simde_x_mm_broadcastlow_pd(a), simde_x_mm_broadcastlow_pd(b))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - r_.f64[0] = a_.f64[0] - b_.f64[0]; - r_.f64[1] = a_.f64[1]; - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sub_sd(a, b) simde_mm_sub_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES 
-simde__m64 -simde_mm_sub_si64 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSE2_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_sub_si64(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 - b_.i64; - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vsub_s64(a_.neon_i64, b_.neon_i64); - #else - r_.i64[0] = a_.i64[0] - b_.i64[0]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_sub_si64(a, b) simde_mm_sub_si64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_subs_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_subs_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vqsubq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_sub_sat(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = simde_math_subs_i8(a_.i8[i], b_.i8[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_subs_epi8(a, b) simde_mm_subs_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_subs_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_subs_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vqsubq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_sub_sat(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = simde_math_subs_i16(a_.i16[i], b_.i16[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_subs_epi16(a, b) simde_mm_subs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_subs_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_subs_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vqsubq_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u8x16_sub_sat(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_u8 = vec_subs(a_.altivec_u8, b_.altivec_u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = simde_math_subs_u8(a_.u8[i], b_.u8[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_subs_epu8(a, b) simde_mm_subs_epu8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_subs_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_subs_epu16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vqsubq_u16(a_.neon_u16, b_.neon_u16); - #elif 
defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_sub_sat(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_u16 = vec_subs(a_.altivec_u16, b_.altivec_u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = simde_math_subs_u16(a_.u16[i], b_.u16[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_subs_epu16(a, b) simde_mm_subs_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomieq_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_ucomieq_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64); - uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64); - uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan)))); - uint64x2_t a_eq_b = vceqq_f64(a_.neon_f64, b_.neon_f64); - r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_eq_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) == wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f64[0] == b_.f64[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f64[0] == b_.f64[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_ucomieq_sd(a, b) simde_mm_ucomieq_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomige_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_ucomige_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64); - uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64); - uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan); - uint64x2_t a_ge_b = vcgeq_f64(a_.neon_f64, b_.neon_f64); - r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_ge_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) >= wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f64[0] >= b_.f64[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f64[0] >= b_.f64[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_ucomige_sd(a, b) simde_mm_ucomige_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomigt_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_ucomigt_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64); - uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64); - uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan); - uint64x2_t a_gt_b = vcgtq_f64(a_.neon_f64, b_.neon_f64); - r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_gt_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) > 
wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f64[0] > b_.f64[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f64[0] > b_.f64[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_ucomigt_sd(a, b) simde_mm_ucomigt_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomile_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_ucomile_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64); - uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64); - uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan)))); - uint64x2_t a_le_b = vcleq_f64(a_.neon_f64, b_.neon_f64); - r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_le_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) <= wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f64[0] <= b_.f64[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f64[0] <= b_.f64[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_ucomile_sd(a, b) simde_mm_ucomile_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomilt_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_ucomilt_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64); - uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64); - uint64x2_t a_or_b_nan = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vandq_u64(a_not_nan, b_not_nan)))); - uint64x2_t a_lt_b = vcltq_f64(a_.neon_f64, b_.neon_f64); - r = !!(vgetq_lane_u64(vorrq_u64(a_or_b_nan, a_lt_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) < wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #elif defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f64[0] < b_.f64[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f64[0] < b_.f64[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_ucomilt_sd(a, b) simde_mm_ucomilt_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_ucomineq_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_ucomineq_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - int r; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - uint64x2_t a_not_nan = vceqq_f64(a_.neon_f64, a_.neon_f64); - uint64x2_t b_not_nan = vceqq_f64(b_.neon_f64, b_.neon_f64); - uint64x2_t a_and_b_not_nan = vandq_u64(a_not_nan, b_not_nan); - uint64x2_t a_neq_b = vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(vceqq_f64(a_.neon_f64, b_.neon_f64)))); - r = !!(vgetq_lane_u64(vandq_u64(a_and_b_not_nan, a_neq_b), 0) != 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - return wasm_f64x2_extract_lane(a_.wasm_v128, 0) != wasm_f64x2_extract_lane(b_.wasm_v128, 0); - #elif 
defined(SIMDE_HAVE_FENV_H) - fenv_t envp; - int x = feholdexcept(&envp); - r = a_.f64[0] != b_.f64[0]; - if (HEDLEY_LIKELY(x == 0)) - fesetenv(&envp); - #else - r = a_.f64[0] != b_.f64[0]; - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_ucomineq_sd(a, b) simde_mm_ucomineq_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_lfence (void) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_lfence(); - #else - simde_mm_sfence(); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_lfence() simde_mm_lfence() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -void -simde_mm_mfence (void) { - #if defined(SIMDE_X86_SSE2_NATIVE) - _mm_mfence(); - #else - simde_mm_sfence(); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_mfence() simde_mm_mfence() -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_unpackhi_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpackhi_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i8 = vzip2q_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a_.neon_i16)); - int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b_.neon_i16)); - int8x8x2_t result = vzip_s8(a1, b1); - r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_shuffle(a_.wasm_v128, b_.wasm_v128, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) { - r_.i8[(i * 2)] = a_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)]; - r_.i8[(i * 2) + 1] = b_.i8[i + ((sizeof(r_) / sizeof(r_.i8[0])) / 2)]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpackhi_epi8(a, b) simde_mm_unpackhi_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_unpackhi_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpackhi_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i16 = vzip2q_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x4_t a1 = vget_high_s16(a_.neon_i16); - int16x4_t b1 = vget_high_s16(b_.neon_i16); - int16x4x2_t result = vzip_s16(a1, b1); - r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 4, 12, 5, 13, 6, 14, 7, 15); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 4, 12, 5, 13, 6, 14, 7, 15); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) { - r_.i16[(i * 2)] = a_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)]; - r_.i16[(i * 2) + 1] = b_.i16[i + ((sizeof(r_) / sizeof(r_.i16[0])) / 2)]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpackhi_epi16(a, b) 
simde_mm_unpackhi_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_unpackhi_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpackhi_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i32 = vzip2q_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x2_t a1 = vget_high_s32(a_.neon_i32); - int32x2_t b1 = vget_high_s32(b_.neon_i32); - int32x2x2_t result = vzip_s32(a1, b1); - r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 2, 6, 3, 7); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 2, 6, 3, 7); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) { - r_.i32[(i * 2)] = a_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)]; - r_.i32[(i * 2) + 1] = b_.i32[i + ((sizeof(r_) / sizeof(r_.i32[0])) / 2)]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpackhi_epi32(a, b) simde_mm_unpackhi_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_unpackhi_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpackhi_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int64x1_t a_h = vget_high_s64(a_.neon_i64); - int64x1_t b_h = vget_high_s64(b_.neon_i64); - r_.neon_i64 = vcombine_s64(a_h, b_h); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 1, 3); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) { - r_.i64[(i * 2)] = a_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)]; - r_.i64[(i * 2) + 1] = b_.i64[i + ((sizeof(r_) / sizeof(r_.i64[0])) / 2)]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpackhi_epi64(a, b) simde_mm_unpackhi_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_unpackhi_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpackhi_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vzip2q_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) { - r_.f64[(i * 2)] = a_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)]; - r_.f64[(i * 2) + 1] = b_.f64[i + ((sizeof(r_) / sizeof(r_.f64[0])) / 2)]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpackhi_pd(a, b) simde_mm_unpackhi_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_unpacklo_epi8 (simde__m128i a, 
simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpacklo_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i8 = vzip1q_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a_.neon_i16)); - int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b_.neon_i16)); - int8x8x2_t result = vzip_s8(a1, b1); - r_.neon_i8 = vcombine_s8(result.val[0], result.val[1]); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i8 = SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, b_.i8, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i8[0])) / 2) ; i++) { - r_.i8[(i * 2)] = a_.i8[i]; - r_.i8[(i * 2) + 1] = b_.i8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpacklo_epi8(a, b) simde_mm_unpacklo_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_unpacklo_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpacklo_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i16 = vzip1q_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x4_t a1 = vget_low_s16(a_.neon_i16); - int16x4_t b1 = vget_low_s16(b_.neon_i16); - int16x4x2_t result = vzip_s16(a1, b1); - r_.neon_i16 = vcombine_s16(result.val[0], result.val[1]); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 8, 1, 9, 2, 10, 3, 11); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 8, 1, 9, 2, 10, 3, 11); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i16[0])) / 2) ; i++) { - r_.i16[(i * 2)] = a_.i16[i]; - r_.i16[(i * 2) + 1] = b_.i16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpacklo_epi16(a, b) simde_mm_unpacklo_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_unpacklo_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpacklo_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i32 = vzip1q_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x2_t a1 = vget_low_s32(a_.neon_i32); - int32x2_t b1 = vget_low_s32(b_.neon_i32); - int32x2x2_t result = vzip_s32(a1, b1); - r_.neon_i32 = vcombine_s32(result.val[0], result.val[1]); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 4, 1, 5); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 4, 1, 5); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i32[0])) / 2) ; i++) { - r_.i32[(i * 2)] = a_.i32[i]; - r_.i32[(i * 2) + 1] = b_.i32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if 
defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpacklo_epi32(a, b) simde_mm_unpacklo_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_unpacklo_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpacklo_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int64x1_t a_l = vget_low_s64(a_.neon_i64); - int64x1_t b_l = vget_low_s64(b_.neon_i64); - r_.neon_i64 = vcombine_s64(a_l, b_l); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.i64, b_.i64, 0, 2); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.i64[0])) / 2) ; i++) { - r_.i64[(i * 2)] = a_.i64[i]; - r_.i64[(i * 2) + 1] = b_.i64[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpacklo_epi64(a, b) simde_mm_unpacklo_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_unpacklo_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpacklo_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vzip1q_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < ((sizeof(r_) / sizeof(r_.f64[0])) / 2) ; i++) { - r_.f64[(i * 2)] = a_.f64[i]; - r_.f64[(i * 2) + 1] = b_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_unpacklo_pd(a, b) simde_mm_unpacklo_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_negate_pd(simde__m128d a) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return simde_mm_xor_pd(a, _mm_set1_pd(SIMDE_FLOAT64_C(-0.0))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && \ - (!defined(HEDLEY_GCC_VERSION) || HEDLEY_GCC_VERSION_CHECK(8,1,0)) - r_.altivec_f64 = vec_neg(a_.altivec_f64); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vnegq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_neg(a_.wasm_v128); - #elif defined(SIMDE_VECTOR_NEGATE) - r_.f64 = -a_.f64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = -a_.f64[i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_xor_si128 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE2_NATIVE) - return _mm_xor_si128(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = veorq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_xor(a_.altivec_i32, b_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_xor(b_.wasm_v128, a_.wasm_v128); - #elif 
defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = a_.i32f ^ b_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = a_.i32f[i] ^ b_.i32f[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _mm_xor_si128(a, b) simde_mm_xor_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_not_si128 (simde__m128i a) { - #if defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_ternarylogic_epi32(a, a, a, 0x55); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vmvnq_s32(a_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i32 = vec_nor(a_.altivec_i32, a_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_not(a_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = ~a_.i32f; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = ~(a_.i32f[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -#define SIMDE_MM_SHUFFLE2(x, y) (((x) << 1) | (y)) -#if defined(SIMDE_X86_SSE2_ENABLE_NATIVE_ALIASES) - #define _MM_SHUFFLE2(x, y) SIMDE_MM_SHUFFLE2(x, y) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_SSE2_H) */ diff --git a/ffi-deps/simde/simde/x86/sse3.h b/ffi-deps/simde/simde/x86/sse3.h deleted file mode 100644 index db2683c..0000000 --- a/ffi-deps/simde/simde/x86/sse3.h +++ /dev/null @@ -1,515 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2017-2020 Evan Nemerson - */ - -#if !defined(SIMDE_X86_SSE3_H) -#define SIMDE_X86_SSE3_H - -#include "sse2.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_deinterleaveeven_epi16 (simde__m128i a, simde__m128i b) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i16 = vuzp1q_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x8x2_t t = vuzpq_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i16 = t.val[0]; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2, 4, 6, 8, 10, 12, 14); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 0, 2, 4, 6, 8, 10, 12, 14); - #else - const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2; - for(size_t i = 0 ; i < halfway_point ; i++) { - r_.i16[i] = a_.i16[2 * i]; - r_.i16[i + halfway_point] = b_.i16[2 * i]; - } - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_deinterleaveodd_epi16 (simde__m128i a, simde__m128i b) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i16 = vuzp2q_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x8x2_t t = vuzpq_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i16 = t.val[1]; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3, 5, 7, 9, 11, 13, 15); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, b_.i16, 1, 3, 5, 7, 9, 11, 13, 15); - #else - const size_t halfway_point = (sizeof(r_.i16) / sizeof(r_.i16[0])) / 2; - for(size_t i = 0 ; i < halfway_point ; i++) { - r_.i16[i] = a_.i16[2 * i + 1]; - r_.i16[i + halfway_point] = b_.i16[2 * i + 1]; - } - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_deinterleaveeven_epi32 (simde__m128i a, simde__m128i b) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i32 = vuzp1q_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x4x2_t t = vuzpq_s32(a_.neon_i32, b_.neon_i32); - r_.neon_i32 = t.val[0]; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2, 4, 6); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 0, 2, 4, 6); - #else - const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2; - for(size_t i = 0 ; i < halfway_point ; i++) { - r_.i32[i] = a_.i32[2 * i]; - r_.i32[i + halfway_point] = b_.i32[2 * i]; - } - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_deinterleaveodd_epi32 (simde__m128i a, simde__m128i b) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i32 = vuzp2q_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x4x2_t t = vuzpq_s32(a_.neon_i32, b_.neon_i32); - r_.neon_i32 = t.val[1]; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - 
r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3, 5, 7); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, b_.i32, 1, 3, 5, 7); - #else - const size_t halfway_point = (sizeof(r_.i32) / sizeof(r_.i32[0])) / 2; - for(size_t i = 0 ; i < halfway_point ; i++) { - r_.i32[i] = a_.i32[2 * i + 1]; - r_.i32[i + halfway_point] = b_.i32[2 * i + 1]; - } - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_deinterleaveeven_ps (simde__m128 a, simde__m128 b) { - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vuzp1q_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x4x2_t t = vuzpq_f32(a_.neon_f32, b_.neon_f32); - r_.neon_f32 = t.val[0]; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2, 4, 6); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 0, 2, 4, 6); - #else - const size_t halfway_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 2; - for(size_t i = 0 ; i < halfway_point ; i++) { - r_.f32[i] = a_.f32[2 * i]; - r_.f32[i + halfway_point] = b_.f32[2 * i]; - } - #endif - - return simde__m128_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_deinterleaveodd_ps (simde__m128 a, simde__m128 b) { - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vuzp2q_f32(a_.neon_f32, b_.neon_f32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x4x2_t t = vuzpq_f32(a_.neon_f32, b_.neon_f32); - r_.neon_f32 = t.val[1]; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3, 5, 7); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, b_.f32, 1, 3, 5, 7); - #else - const size_t halfway_point = (sizeof(r_.f32) / sizeof(r_.f32[0])) / 2; - for(size_t i = 0 ; i < halfway_point ; i++) { - r_.f32[i] = a_.f32[2 * i + 1]; - r_.f32[i + halfway_point] = b_.f32[2 * i + 1]; - } - #endif - - return simde__m128_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_deinterleaveeven_pd (simde__m128d a, simde__m128d b) { - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vuzp1q_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 0, 2); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 0, 2); - #else - const size_t halfway_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 2; - for(size_t i = 0 ; i < halfway_point ; i++) { - r_.f64[i] = a_.f64[2 * i]; - r_.f64[i + halfway_point] = b_.f64[2 * i]; - } - #endif - - return simde__m128d_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_deinterleaveodd_pd (simde__m128d a, simde__m128d b) { - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vuzp2q_f64(a_.neon_f64, b_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, b_.wasm_v128, 1, 3); - #elif defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = 
SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, b_.f64, 1, 3); - #else - const size_t halfway_point = (sizeof(r_.f64) / sizeof(r_.f64[0])) / 2; - for(size_t i = 0 ; i < halfway_point ; i++) { - r_.f64[i] = a_.f64[2 * i + 1]; - r_.f64[i + halfway_point] = b_.f64[2 * i + 1]; - } - #endif - - return simde__m128d_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_addsub_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_addsub_pd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - float64x2_t rs = vsubq_f64(a_.neon_f64, b_.neon_f64); - float64x2_t ra = vaddq_f64(a_.neon_f64, b_.neon_f64); - return vcombine_f64(vget_low_f64(rs), vget_high_f64(ra)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64 - b_.f64, a_.f64 + b_.f64, 0, 3); - #else - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i += 2) { - r_.f64[ i ] = a_.f64[ i ] - b_.f64[ i ]; - r_.f64[1 + i] = a_.f64[1 + i] + b_.f64[1 + i]; - } - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_addsub_pd(a, b) simde_mm_addsub_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_addsub_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_addsub_ps(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - float32x4_t rs = vsubq_f32(a_.neon_f32, b_.neon_f32); - float32x4_t ra = vaddq_f32(a_.neon_f32, b_.neon_f32); - return vtrn2q_f32(vreinterpretq_f32_s32(vrev64q_s32(vreinterpretq_s32_f32(rs))), ra); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32 - b_.f32, a_.f32 + b_.f32, 0, 5, 2, 7); - #else - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) { - r_.f32[ i ] = a_.f32[ i ] - b_.f32[ i ]; - r_.f32[1 + i] = a_.f32[1 + i] + b_.f32[1 + i]; - } - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_addsub_ps(a, b) simde_mm_addsub_ps((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_hadd_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_hadd_pd(a, b); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return simde__m128d_from_neon_f64(vpaddq_f64(simde__m128d_to_neon_f64(a), simde__m128d_to_neon_f64(b))); - #else - return simde_mm_add_pd(simde_x_mm_deinterleaveeven_pd(a, b), simde_x_mm_deinterleaveodd_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hadd_pd(a, b) simde_mm_hadd_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_hadd_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_hadd_ps(a, b); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return simde__m128_from_neon_f32(vpaddq_f32(simde__m128_to_neon_f32(a), simde__m128_to_neon_f32(b))); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x4x2_t t = vuzpq_f32(simde__m128_to_neon_f32(a), simde__m128_to_neon_f32(b)); - return simde__m128_from_neon_f32(vaddq_f32(t.val[0], t.val[1])); - #else - return simde_mm_add_ps(simde_x_mm_deinterleaveeven_ps(a, b), simde_x_mm_deinterleaveodd_ps(a, b)); - #endif -} -#if 
defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hadd_ps(a, b) simde_mm_hadd_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_hsub_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_hsub_pd(a, b); - #else - return simde_mm_sub_pd(simde_x_mm_deinterleaveeven_pd(a, b), simde_x_mm_deinterleaveodd_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hsub_pd(a, b) simde_mm_hsub_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_hsub_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_hsub_ps(a, b); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - float32x4x2_t t = vuzpq_f32(simde__m128_to_neon_f32(a), simde__m128_to_neon_f32(b)); - return simde__m128_from_neon_f32(vaddq_f32(t.val[0], vnegq_f32(t.val[1]))); - #else - return simde_mm_sub_ps(simde_x_mm_deinterleaveeven_ps(a, b), simde_x_mm_deinterleaveodd_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hsub_ps(a, b) simde_mm_hsub_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_lddqu_si128 (simde__m128i const* mem_addr) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_lddqu_si128(mem_addr); - #else - simde__m128i_private r_; - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vld1q_s32(HEDLEY_REINTERPRET_CAST(int32_t const*, mem_addr)); - #else - simde_memcpy(&r_, mem_addr, sizeof(r_)); - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_lddqu_si128(mem_addr) simde_mm_lddqu_si128(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_loaddup_pd (simde_float64 const* mem_addr) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_loaddup_pd(mem_addr); - #else - simde__m128d_private r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vdupq_n_f64(*mem_addr); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vdupq_n_s64(*HEDLEY_REINTERPRET_CAST(int64_t const*, mem_addr)); - #else - r_.f64[0] = *mem_addr; - r_.f64[1] = *mem_addr; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_loaddup_pd(mem_addr) simde_mm_loaddup_pd(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_movedup_pd (simde__m128d a) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_movedup_pd(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vdupq_laneq_f64(a_.neon_f64, 0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && defined(SIMDE_SHUFFLE_VECTOR_) - r_.f64 = SIMDE_SHUFFLE_VECTOR_(64, 16, a_.f64, a_.f64, 0, 0); - #else - r_.f64[0] = a_.f64[0]; - r_.f64[1] = a_.f64[0]; - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_movedup_pd(a) simde_mm_movedup_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_movehdup_ps (simde__m128 a) { - #if defined(SIMDE_X86_SSE3_NATIVE) - return _mm_movehdup_ps(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vtrn2q_f32(a_.neon_f32, a_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 1, 
1, 3, 3); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 1, 1, 3, 3); - #else - r_.f32[0] = a_.f32[1]; - r_.f32[1] = a_.f32[1]; - r_.f32[2] = a_.f32[3]; - r_.f32[3] = a_.f32[3]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_movehdup_ps(a) simde_mm_movehdup_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_moveldup_ps (simde__m128 a) { - #if defined(SIMDE__SSE3_NATIVE) - return _mm_moveldup_ps(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vtrn1q_f32(a_.neon_f32, a_.neon_f32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_shuffle(a_.wasm_v128, a_.wasm_v128, 0, 0, 2, 2); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) - r_.f32 = SIMDE_SHUFFLE_VECTOR_(32, 16, a_.f32, a_.f32, 0, 0, 2, 2); - #else - r_.f32[0] = a_.f32[0]; - r_.f32[1] = a_.f32[0]; - r_.f32[2] = a_.f32[2]; - r_.f32[3] = a_.f32[2]; - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE3_ENABLE_NATIVE_ALIASES) -# define _mm_moveldup_ps(a) simde_mm_moveldup_ps(a) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_SSE3_H) */ diff --git a/ffi-deps/simde/simde/x86/sse4.1.h b/ffi-deps/simde/simde/x86/sse4.1.h deleted file mode 100644 index 15a197b..0000000 --- a/ffi-deps/simde/simde/x86/sse4.1.h +++ /dev/null @@ -1,2367 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2017-2020 Evan Nemerson - */ - -#include "sse.h" -#if !defined(SIMDE_X86_SSE4_1_H) -#define SIMDE_X86_SSE4_1_H - -#include "ssse3.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -#if !defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) -# define SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_blend_epi16 (simde__m128i a, simde__m128i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = ((imm8 >> i) & 1) ? 
b_.u16[i] : a_.u16[i]; - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - #define simde_mm_blend_epi16(a, b, imm8) _mm_blend_epi16(a, b, imm8) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_blend_epi16(a, b, imm8) \ - (__extension__ ({ \ - simde__m128i_private \ - simde_mm_blend_epi16_a_ = simde__m128i_to_private(a), \ - simde_mm_blend_epi16_b_ = simde__m128i_to_private(b), \ - simde_mm_blend_epi16_r_; \ - \ - simde_mm_blend_epi16_r_.i16 = \ - SIMDE_SHUFFLE_VECTOR_( \ - 16, 16, \ - simde_mm_blend_epi16_a_.i16, \ - simde_mm_blend_epi16_b_.i16, \ - ((imm8) & (1 << 0)) ? 8 : 0, \ - ((imm8) & (1 << 1)) ? 9 : 1, \ - ((imm8) & (1 << 2)) ? 10 : 2, \ - ((imm8) & (1 << 3)) ? 11 : 3, \ - ((imm8) & (1 << 4)) ? 12 : 4, \ - ((imm8) & (1 << 5)) ? 13 : 5, \ - ((imm8) & (1 << 6)) ? 14 : 6, \ - ((imm8) & (1 << 7)) ? 15 : 7 \ - ); \ - \ - simde__m128i_from_private(simde_mm_blend_epi16_r_); \ - })) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_blend_epi16 - #define _mm_blend_epi16(a, b, imm8) simde_mm_blend_epi16(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_blend_pd (simde__m128d a, simde__m128d b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = ((imm8 >> i) & 1) ? b_.f64[i] : a_.f64[i]; - } - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - #define simde_mm_blend_pd(a, b, imm8) _mm_blend_pd(a, b, imm8) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_blend_pd(a, b, imm8) \ - (__extension__ ({ \ - simde__m128d_private \ - simde_mm_blend_pd_a_ = simde__m128d_to_private(a), \ - simde_mm_blend_pd_b_ = simde__m128d_to_private(b), \ - simde_mm_blend_pd_r_; \ - \ - simde_mm_blend_pd_r_.f64 = \ - SIMDE_SHUFFLE_VECTOR_( \ - 64, 16, \ - simde_mm_blend_pd_a_.f64, \ - simde_mm_blend_pd_b_.f64, \ - ((imm8) & (1 << 0)) ? 2 : 0, \ - ((imm8) & (1 << 1)) ? 3 : 1 \ - ); \ - \ - simde__m128d_from_private(simde_mm_blend_pd_r_); \ - })) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_blend_pd - #define _mm_blend_pd(a, b, imm8) simde_mm_blend_pd(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_blend_ps (simde__m128 a, simde__m128 b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = ((imm8 >> i) & 1) ? b_.f32[i] : a_.f32[i]; - } - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) -# define simde_mm_blend_ps(a, b, imm8) _mm_blend_ps(a, b, imm8) -#elif defined(SIMDE_SHUFFLE_VECTOR_) - #define simde_mm_blend_ps(a, b, imm8) \ - (__extension__ ({ \ - simde__m128_private \ - simde_mm_blend_ps_a_ = simde__m128_to_private(a), \ - simde_mm_blend_ps_b_ = simde__m128_to_private(b), \ - simde_mm_blend_ps_r_; \ - \ - simde_mm_blend_ps_r_.f32 = \ - SIMDE_SHUFFLE_VECTOR_( \ - 32, 16, \ - simde_mm_blend_ps_a_.f32, \ - simde_mm_blend_ps_b_.f32, \ - ((imm8) & (1 << 0)) ? 4 : 0, \ - ((imm8) & (1 << 1)) ? 5 : 1, \ - ((imm8) & (1 << 2)) ? 6 : 2, \ - ((imm8) & (1 << 3)) ? 
7 : 3 \ - ); \ - \ - simde__m128_from_private(simde_mm_blend_ps_r_); \ - })) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_blend_ps - #define _mm_blend_ps(a, b, imm8) simde_mm_blend_ps(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_blendv_epi8 (simde__m128i a, simde__m128i b, simde__m128i mask) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_blendv_epi8(a, b, mask); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128i m = _mm_cmpgt_epi8(_mm_setzero_si128(), mask); - return _mm_xor_si128(_mm_subs_epu8(_mm_xor_si128(a, b), m), b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - mask_ = simde__m128i_to_private(mask); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - /* Use a signed shift right to create a mask with the sign bit */ - mask_.neon_i8 = vshrq_n_s8(mask_.neon_i8, 7); - r_.neon_i8 = vbslq_s8(mask_.neon_u8, b_.neon_i8, a_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_i8x16_shr(mask_.wasm_v128, 7); - r_.wasm_v128 = wasm_v128_bitselect(b_.wasm_v128, a_.wasm_v128, m); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = vec_sel(a_.altivec_i8, b_.altivec_i8, vec_cmplt(mask_.altivec_i8, vec_splat_s8(0))); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - /* https://software.intel.com/en-us/forums/intel-c-compiler/topic/850087 */ - #if defined(HEDLEY_INTEL_VERSION_CHECK) - __typeof__(mask_.i8) z = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; - mask_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(mask_.i8), mask_.i8 < z); - #else - mask_.i8 >>= (CHAR_BIT * sizeof(mask_.i8[0])) - 1; - #endif - - r_.i8 = (mask_.i8 & b_.i8) | (~mask_.i8 & a_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - int8_t m = mask_.i8[i] >> 7; - r_.i8[i] = (m & b_.i8[i]) | (~m & a_.i8[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_blendv_epi8 - #define _mm_blendv_epi8(a, b, mask) simde_mm_blendv_epi8(a, b, mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_blendv_epi16 (simde__m128i a, simde__m128i b, simde__m128i mask) { - #if defined(SIMDE_X86_SSE2_NATIVE) - mask = simde_mm_srai_epi16(mask, 15); - return simde_mm_or_si128(simde_mm_and_si128(mask, b), simde_mm_andnot_si128(mask, a)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - mask_ = simde__m128i_to_private(mask); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - mask_ = simde__m128i_to_private(simde_mm_cmplt_epi16(mask, simde_mm_setzero_si128())); - r_.neon_i16 = vbslq_s16(mask_.neon_u16, b_.neon_i16, a_.neon_i16); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i16 = vec_sel(a_.altivec_i16, b_.altivec_i16, vec_cmplt(mask_.altivec_i16, vec_splat_s16(0))); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - #if defined(HEDLEY_INTEL_VERSION_CHECK) - __typeof__(mask_.i16) z = { 0, 0, 0, 0, 0, 0, 0, 0 }; - mask_.i16 = mask_.i16 < z; - #else - mask_.i16 >>= (CHAR_BIT * sizeof(mask_.i16[0])) - 1; - #endif - - r_.i16 = (mask_.i16 & b_.i16) | (~mask_.i16 & a_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - int16_t m = mask_.i16[i] >> 15; - r_.i16[i] = (m & b_.i16[i]) | (~m & a_.i16[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - 
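The portable blendv fallbacks above and below (epi8 through epi64) all reduce to the same lane-select idiom: an arithmetic right shift broadcasts the mask's sign bit across the lane, and a bitwise select then takes b where the mask was negative and a elsewhere. A minimal scalar sketch of that idiom, using a hypothetical helper name that is not part of simde:

#include <stdint.h>
#include <stdio.h>

/* blendv_lane_i8 is a hypothetical scalar model of the per-lane select used
 * by the portable paths here (compare `int8_t m = mask_.i8[i] >> 7;` in the
 * scalar loop above); it is not part of simde. */
static int8_t blendv_lane_i8(int8_t a, int8_t b, int8_t mask) {
  int8_t m = (int8_t)(mask >> 7);        /* sign-bit broadcast: 0x00 or 0xFF */
  return (int8_t)((m & b) | (~m & a));   /* pick b where mask < 0, else a   */
}

int main(void) {
  /* prints "20 10": a negative mask selects b, a non-negative mask selects a */
  printf("%d %d\n", blendv_lane_i8(10, 20, -1), blendv_lane_i8(10, 20, 1));
  return 0;
}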
-SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_blendv_epi32 (simde__m128i a, simde__m128i b, simde__m128i mask) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_castps_si128(_mm_blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask))); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - mask_ = simde__m128i_to_private(mask); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - mask_ = simde__m128i_to_private(simde_mm_cmplt_epi32(mask, simde_mm_setzero_si128())); - r_.neon_i32 = vbslq_s32(mask_.neon_u32, b_.neon_i32, a_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_i32x4_shr(mask_.wasm_v128, 31); - r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m)); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, vec_cmplt(mask_.altivec_i32, vec_splat_s32(0))); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - #if defined(HEDLEY_INTEL_VERSION_CHECK) - __typeof__(mask_.i32) z = { 0, 0, 0, 0 }; - mask_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(mask_.i32), mask_.i32 < z); - #else - mask_.i32 >>= (CHAR_BIT * sizeof(mask_.i32[0])) - 1; - #endif - - r_.i32 = (mask_.i32 & b_.i32) | (~mask_.i32 & a_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - int32_t m = mask_.i32[i] >> 31; - r_.i32[i] = (m & b_.i32[i]) | (~m & a_.i32[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_blendv_epi64 (simde__m128i a, simde__m128i b, simde__m128i mask) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_castpd_si128(_mm_blendv_pd(_mm_castsi128_pd(a), _mm_castsi128_pd(b), _mm_castsi128_pd(mask))); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - mask_ = simde__m128i_to_private(mask); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - mask_.neon_u64 = vcltq_s64(mask_.neon_i64, vdupq_n_s64(UINT64_C(0))); - r_.neon_i64 = vbslq_s64(mask_.neon_u64, b_.neon_i64, a_.neon_i64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_i64x2_shr(mask_.wasm_v128, 63); - r_.wasm_v128 = wasm_v128_or(wasm_v128_and(b_.wasm_v128, m), wasm_v128_andnot(a_.wasm_v128, m)); - #elif (defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) && !defined(SIMDE_BUG_CLANG_46770)) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i64 = vec_sel(a_.altivec_i64, b_.altivec_i64, vec_cmplt(mask_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(signed long long, 0)))); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(signed long long) selector = vec_sra(mask_.altivec_i64, vec_splats(HEDLEY_STATIC_CAST(unsigned long long, 63))); - r_.altivec_i32 = vec_sel(a_.altivec_i32, b_.altivec_i32, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned int), selector)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - #if defined(HEDLEY_INTEL_VERSION_CHECK) - __typeof__(mask_.i64) z = { 0, 0 }; - mask_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(mask_.i64), mask_.i64 < z); - #else - mask_.i64 >>= (CHAR_BIT * sizeof(mask_.i64[0])) - 1; - #endif - - r_.i64 = (mask_.i64 & b_.i64) | (~mask_.i64 & a_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - int64_t m = mask_.i64[i] >> 63; - r_.i64[i] = (m & b_.i64[i]) | (~m & a_.i64[i]); - } - #endif - - return simde__m128i_from_private(r_); - 
#endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_blendv_pd (simde__m128d a, simde__m128d b, simde__m128d mask) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_blendv_pd(a, b, mask); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m_ = wasm_i64x2_shr(HEDLEY_REINTERPRET_CAST(v128_t, mask), 63); - return simde__m128d_from_wasm_v128(wasm_v128_bitselect(simde__m128d_to_wasm_v128(b), simde__m128d_to_wasm_v128(a), m_)); - #else - return simde_mm_castsi128_pd(simde_x_mm_blendv_epi64(simde_mm_castpd_si128(a), simde_mm_castpd_si128(b), simde_mm_castpd_si128(mask))); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_blendv_pd - #define _mm_blendv_pd(a, b, mask) simde_mm_blendv_pd(a, b, mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_blendv_ps (simde__m128 a, simde__m128 b, simde__m128 mask) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_blendv_ps(a, b, mask); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m_ = wasm_i32x4_shr(HEDLEY_REINTERPRET_CAST(v128_t, mask), 31); - return simde__m128d_from_wasm_v128(wasm_v128_bitselect(simde__m128d_to_wasm_v128(b), simde__m128d_to_wasm_v128(a), m_)); - #else - return simde_mm_castsi128_ps(simde_x_mm_blendv_epi32(simde_mm_castps_si128(a), simde_mm_castps_si128(b), simde_mm_castps_si128(mask))); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_blendv_ps - #define _mm_blendv_ps(a, b, mask) simde_mm_blendv_ps(a, b, mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_round_pd (simde__m128d a, int rounding) - SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) { - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - /* For architectures which lack a current direction SIMD instruction. */ - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - if ((rounding & 7) == SIMDE_MM_FROUND_CUR_DIRECTION) - rounding = HEDLEY_STATIC_CAST(int, SIMDE_MM_GET_ROUNDING_MODE()) << 13; - #endif - - switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { - case SIMDE_MM_FROUND_CUR_DIRECTION: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndiq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_nearest(a_.wasm_v128); - #elif defined(simde_math_nearbyint) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_nearbyint(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_NEAREST_INT: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_round(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndaq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_nearest(a_.wasm_v128); - #elif defined(simde_math_roundeven) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_roundeven(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_NEG_INF: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), 
vec_floor(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndmq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_floor(a_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_floor(a_.f64[i]); - } - #endif - break; - - case SIMDE_MM_FROUND_TO_POS_INF: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_ceil(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndpq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_ceil(a_.wasm_v128); - #elif defined(simde_math_ceil) - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_ceil(a_.f64[i]); - } - #else - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - #endif - break; - - case SIMDE_MM_FROUND_TO_ZERO: - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_f64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(double), vec_trunc(a_.altivec_f64)); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vrndq_f64(a_.neon_f64); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_f64x2_trunc(a_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_trunc(a_.f64[i]); - } - #endif - break; - - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - } - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - #define simde_mm_round_pd(a, rounding) _mm_round_pd(a, rounding) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_round_pd - #define _mm_round_pd(a, rounding) simde_mm_round_pd(a, rounding) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_ceil_pd (simde__m128d a) { - #if defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128d_from_wasm_v128(wasm_f64x2_ceil(simde__m128d_to_wasm_v128(a))); - #endif - return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_POS_INF); -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_ceil_pd - #define _mm_ceil_pd(a) simde_mm_ceil_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_ceil_ps (simde__m128 a) { - #if defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128_from_wasm_v128(wasm_f32x4_ceil(simde__m128_to_wasm_v128(a))); - #endif - return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_POS_INF); -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_ceil_ps - #define _mm_ceil_ps(a) simde_mm_ceil_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_ceil_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_ceil_sd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(simde_math_ceilf) - r_ = simde__m128d_to_private(simde_mm_set_pd(a_.f64[1], simde_math_ceil(b_.f64[0]))); - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_ceil_sd - #define _mm_ceil_sd(a, b) simde_mm_ceil_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_ceil_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_ceil_ss(a, 
b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_ceil_ps(b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_ceil_ps(simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(simde_math_ceilf) - r_ = simde__m128_to_private(simde_mm_set_ps(a_.f32[3], a_.f32[2], a_.f32[1], simde_math_ceilf(b_.f32[0]))); - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_ceil_ss - #define _mm_ceil_ss(a, b) simde_mm_ceil_ss(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmpeq_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cmpeq_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vceqq_u64(a_.neon_u64, b_.neon_u64); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - /* (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) */ - uint32x4_t cmp = vceqq_u32(a_.neon_u32, b_.neon_u32); - uint32x4_t swapped = vrev64q_u32(cmp); - r_.neon_u32 = vandq_u32(cmp, swapped); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 == b_.i64); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_i64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed long long), vec_cmpeq(a_.altivec_i64, b_.altivec_i64)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] == b_.u64[i]) ? ~UINT64_C(0) : UINT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpeq_epi64 - #define _mm_cmpeq_epi64(a, b) simde_mm_cmpeq_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepi8_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepi8_epi16(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int8x16_t s8x16 = a_.neon_i8; /* xxxx xxxx xxxx DCBA */ - int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - r_.neon_i16 = s16x8; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_extend_low_i8x16(a_.wasm_v128); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8, - -1, 0, -1, 1, -1, 2, -1, 3, - -1, 4, -1, 5, -1, 6, -1, 7)); - r_.i16 >>= 8; - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i16, a_.m64_private[0].i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepi8_epi16 - #define _mm_cvtepi8_epi16(a) simde_mm_cvtepi8_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepi8_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepi8_epi32(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) 
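    /* Note on the SSE2 path below: sign extension is emulated without SSE4.1
     * by unpacking `a` with itself, which repeats each source byte through
     * the wider lane, then shifting right arithmetically by 24 so the
     * original sign bit fills the upper bits:
     * 0xF0 -> 0xF0F0F0F0 -> (arithmetic >> 24) -> 0xFFFFFFF0 == -16. */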
- __m128i tmp = _mm_unpacklo_epi8(a, a); - tmp = _mm_unpacklo_epi16(tmp, tmp); - return _mm_srai_epi32(tmp, 24); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int8x16_t s8x16 = a_.neon_i8; /* xxxx xxxx xxxx DCBA */ - int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ - r_.neon_i32 = s32x4; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(a_.wasm_v128)); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8, - -1, -1, -1, 0, -1, -1, -1, 1, - -1, -1, -1, 2, -1, -1, -1, 3)); - r_.i32 >>= 24; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepi8_epi32 - #define _mm_cvtepi8_epi32(a) simde_mm_cvtepi8_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepi8_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepi8_epi64(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int8x16_t s8x16 = a_.neon_i8; /* xxxx xxxx xxxx xxBA */ - int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ - int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ - int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ - r_.neon_i64 = s64x2; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t extra = wasm_i32x4_extend_low_i16x8(wasm_i16x8_extend_low_i8x16(a_.wasm_v128)); - v128_t sign = wasm_i32x4_gt(wasm_i64x2_const(0, 0), extra); - r_.wasm_v128 = wasm_i32x4_shuffle(extra, sign, 0, 4, 1, 5); - #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - /* Disabled on x86 due to lack of 64-bit arithmetic shift until - * until AVX-512 (at which point we would be using the native - * _mm_cvtepi_epi64 anyways). 
*/ - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, a_.i8, - -1, -1, -1, -1, -1, -1, -1, 0, - -1, -1, -1, -1, -1, -1, -1, 1)); - r_.i64 >>= 56; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepi8_epi64 - #define _mm_cvtepi8_epi64(a) simde_mm_cvtepi8_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepu8_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepu8_epi16(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpacklo_epi8(a, _mm_setzero_si128()); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint8x16_t u8x16 = a_.neon_u8; /* xxxx xxxx xxxx DCBA */ - uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - r_.neon_u16 = u16x8; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_extend_low_u8x16(a_.wasm_v128); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - __typeof__(r_.i8) z = { 0, }; - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z, - 0, 16, 1, 17, 2, 18, 3, 19, - 4, 20, 5, 21, 6, 22, 7, 23)); - #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_CLANG_45541) && (!defined(SIMDE_ARCH_POWER) || !defined(__clang__)) - SIMDE_CONVERT_VECTOR_(r_.i16, a_.m64_private[0].u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.u8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepu8_epi16 - #define _mm_cvtepu8_epi16(a) simde_mm_cvtepu8_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepu8_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepu8_epi32(a); - #elif defined(SIMDE_X86_SSSE3_NATIVE) - __m128i s = _mm_set_epi8( - HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x03), - HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x02), - HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x01), - HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x00)); - return _mm_shuffle_epi8(a, s); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128i z = _mm_setzero_si128(); - return _mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint8x16_t u8x16 = a_.neon_u8; /* xxxx xxxx xxxx DCBA */ - uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ - uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ - r_.neon_u32 = u32x4; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u32x4_extend_low_u16x8(wasm_u16x8_extend_low_u8x16(a_.wasm_v128)); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - __typeof__(r_.i8) z = { 0, }; - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z, - 0, 17, 
18, 19, 1, 21, 22, 23, - 2, 25, 26, 27, 3, 29, 30, 31)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.u8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepu8_epi32 - #define _mm_cvtepu8_epi32(a) simde_mm_cvtepu8_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepu8_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepu8_epi64(a); - #elif defined(SIMDE_X86_SSSE3_NATIVE) - __m128i s = _mm_set_epi8( - HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), - HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x01), - HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), - HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x80), HEDLEY_STATIC_CAST(char, 0x00)); - return _mm_shuffle_epi8(a, s); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128i z = _mm_setzero_si128(); - return _mm_unpacklo_epi32(_mm_unpacklo_epi16(_mm_unpacklo_epi8(a, z), z), z); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint8x16_t u8x16 = a_.neon_u8; /* xxxx xxxx xxxx xxBA */ - uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ - uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ - uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ - r_.neon_u64 = u64x2; - #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - __typeof__(r_.i8) z = { 0, }; - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(8, 16, a_.i8, z, - 0, 17, 18, 19, 20, 21, 22, 23, - 1, 25, 26, 27, 28, 29, 30, 31)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.u8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepu8_epi64 - #define _mm_cvtepu8_epi64(a) simde_mm_cvtepu8_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepi16_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepi16_epi32(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vmovl_s16(vget_low_s16(a_.neon_i16)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_extend_low_i16x8(a_.wasm_v128); - #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16, 8, 0, 10, 1, 12, 2, 14, 3)); - r_.i32 >>= 16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepi16_epi32 - #define _mm_cvtepi16_epi32(a) simde_mm_cvtepi16_epi32(a) -#endif - 
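The _mm_cvtepi*/_mm_cvtepu* conversions above and below all widen the low lanes of the source vector: the signed forms sign-extend and the unsigned forms zero-extend, which is what the NEON, WASM, and shuffle-based paths emulate. A minimal scalar sketch of the two behaviours, with a hypothetical helper that is not part of simde:

#include <stdint.h>
#include <stdio.h>

/* widen_low8 is a hypothetical scalar model (not part of simde) of
 * _mm_cvtepi8_epi16 vs. _mm_cvtepu8_epi16: both take the low 8 bytes of the
 * 128-bit source, one sign-extends them and the other zero-extends them. */
static void widen_low8(const uint8_t src[16],
                       int16_t as_signed[8], int16_t as_unsigned[8]) {
  for (int i = 0; i < 8; i++) {
    as_signed[i]   = (int16_t)(int8_t)src[i]; /* sign-extend: 0xF0 -> -16 */
    as_unsigned[i] = (int16_t)src[i];         /* zero-extend: 0xF0 -> 240 */
  }
}

int main(void) {
  uint8_t src[16] = { 0xF0, 0x7F };           /* remaining bytes are zero */
  int16_t s[8], u[8];
  widen_low8(src, s, u);
  printf("%d %d / %d %d\n", s[0], s[1], u[0], u[1]); /* -16 127 / 240 127 */
  return 0;
}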
-SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepu16_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepu16_epi32(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpacklo_epi16(a, _mm_setzero_si128()); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vmovl_u16(vget_low_u16(a_.neon_u16)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u32x4_extend_low_u16x8(a_.wasm_v128); - #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - __typeof__(r_.u16) z = { 0, }; - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z, - 0, 9, 1, 11, 2, 13, 3, 15)); - #elif defined(SIMDE_CONVERT_VECTOR_) && !defined(SIMDE_BUG_CLANG_45541) && (!defined(SIMDE_ARCH_POWER) || !defined(__clang__)) - SIMDE_CONVERT_VECTOR_(r_.i32, a_.m64_private[0].u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.u16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepu16_epi32 - #define _mm_cvtepu16_epi32(a) simde_mm_cvtepu16_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepu16_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepu16_epi64(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128i z = _mm_setzero_si128(); - return _mm_unpacklo_epi32(_mm_unpacklo_epi16(a, z), z); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint16x8_t u16x8 = a_.neon_u16; /* xxxx xxxx xxxx 0B0A */ - uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ - uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ - r_.neon_u64 = u64x2; - #elif defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - __typeof__(r_.u16) z = { 0, }; - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.u16, z, - 0, 9, 10, 11, - 1, 13, 14, 15)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.u16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepu16_epi64 - #define _mm_cvtepu16_epi64(a) simde_mm_cvtepu16_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepi16_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepi16_epi64(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x8_t s16x8 = a_.neon_i16; /* xxxx xxxx xxxx 0B0A */ - int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ - int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ - r_.neon_i64 = s64x2; - #elif (!defined(SIMDE_ARCH_X86) && !defined(SIMDE_ARCH_AMD64)) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(16, 16, a_.i16, a_.i16, - 8, 9, 10, 0, - 12, 13, 14, 1)); - r_.i64 >>= 48; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i16[i]; - } - #endif - - return simde__m128i_from_private(r_); - 
#endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepi16_epi64 - #define _mm_cvtepi16_epi64(a) simde_mm_cvtepi16_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepi32_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepi32_epi64(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128i tmp = _mm_shuffle_epi32(a, 0x50); - tmp = _mm_srai_epi32(tmp, 31); - tmp = _mm_shuffle_epi32(tmp, 0xed); - return _mm_unpacklo_epi32(a, tmp); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vmovl_s32(vget_low_s32(a_.neon_i32)); - #elif !defined(SIMDE_ARCH_X86) && defined(SIMDE_SHUFFLE_VECTOR_) && defined(SIMDE_VECTOR_SCALAR) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(32, 16, a_.i32, a_.i32, -1, 0, -1, 1)); - r_.i64 >>= 32; - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepi32_epi64 - #define _mm_cvtepi32_epi64(a) simde_mm_cvtepi32_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cvtepu32_epi64 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_cvtepu32_epi64(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return _mm_unpacklo_epi32(a, _mm_setzero_si128()); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u64 = vmovl_u32(vget_low_u32(a_.neon_u32)); - #elif defined(SIMDE_VECTOR_SCALAR) && defined(SIMDE_SHUFFLE_VECTOR_) && (SIMDE_ENDIAN_ORDER == SIMDE_ENDIAN_LITTLE) - __typeof__(r_.u32) z = { 0, }; - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), SIMDE_SHUFFLE_VECTOR_(32, 16, a_.u32, z, 0, 4, 1, 6)); - #elif defined(SIMDE_CONVERT_VECTOR_) - SIMDE_CONVERT_VECTOR_(r_.i64, a_.m64_private[0].u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.u32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_cvtepu32_epi64 - #define _mm_cvtepu32_epi64(a) simde_mm_cvtepu32_epi64(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_dp_pd (simde__m128d a, simde__m128d b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vmulq_f64(a_.neon_f64, b_.neon_f64); - - switch (imm8) { - case 0xff: - r_.neon_f64 = vaddq_f64(r_.neon_f64, vextq_f64(r_.neon_f64, r_.neon_f64, 1)); - break; - case 0x13: - r_.neon_f64 = vdupq_lane_f64(vget_low_f64(r_.neon_f64), 0); - break; - default: - { /* imm8 is a compile-time constant, so this all becomes just a load */ - uint64_t mask_data[] = { - (imm8 & (1 << 4)) ? ~UINT64_C(0) : UINT64_C(0), - (imm8 & (1 << 5)) ? ~UINT64_C(0) : UINT64_C(0), - }; - r_.neon_f64 = vreinterpretq_f64_u64(vandq_u64(vld1q_u64(mask_data), vreinterpretq_u64_f64(r_.neon_f64))); - } - - r_.neon_f64 = vdupq_n_f64(vaddvq_f64(r_.neon_f64)); - - { - uint64_t mask_data[] = { - (imm8 & 1) ? ~UINT64_C(0) : UINT64_C(0), - (imm8 & 2) ? 
~UINT64_C(0) : UINT64_C(0) - }; - r_.neon_f64 = vreinterpretq_f64_u64(vandq_u64(vld1q_u64(mask_data), vreinterpretq_u64_f64(r_.neon_f64))); - } - break; - } - #else - simde_float64 sum = SIMDE_FLOAT64_C(0.0); - - SIMDE_VECTORIZE_REDUCTION(+:sum) - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - sum += ((imm8 >> (i + 4)) & 1) ? (a_.f64[i] * b_.f64[i]) : 0.0; - } - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = ((imm8 >> i) & 1) ? sum : 0.0; - } - #endif - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) -# define simde_mm_dp_pd(a, b, imm8) _mm_dp_pd(a, b, imm8) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_dp_pd - #define _mm_dp_pd(a, b, imm8) simde_mm_dp_pd(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_dp_ps (simde__m128 a, simde__m128 b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f32 = vmulq_f32(a_.neon_f32, b_.neon_f32); - - switch (imm8) { - case 0xff: - r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32)); - break; - case 0x7f: - r_.neon_f32 = vsetq_lane_f32(0, r_.neon_f32, 3); - r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32)); - break; - default: - { - { - uint32_t mask_data[] = { - (imm8 & (1 << 4)) ? ~UINT32_C(0) : UINT32_C(0), - (imm8 & (1 << 5)) ? ~UINT32_C(0) : UINT32_C(0), - (imm8 & (1 << 6)) ? ~UINT32_C(0) : UINT32_C(0), - (imm8 & (1 << 7)) ? ~UINT32_C(0) : UINT32_C(0) - }; - r_.neon_f32 = vreinterpretq_f32_u32(vandq_u32(vld1q_u32(mask_data), vreinterpretq_u32_f32(r_.neon_f32))); - } - - r_.neon_f32 = vdupq_n_f32(vaddvq_f32(r_.neon_f32)); - - { - uint32_t mask_data[] = { - (imm8 & 1) ? ~UINT32_C(0) : UINT32_C(0), - (imm8 & 2) ? ~UINT32_C(0) : UINT32_C(0), - (imm8 & 4) ? ~UINT32_C(0) : UINT32_C(0), - (imm8 & 8) ? ~UINT32_C(0) : UINT32_C(0) - }; - r_.neon_f32 = vreinterpretq_f32_u32(vandq_u32(vld1q_u32(mask_data), vreinterpretq_u32_f32(r_.neon_f32))); - } - } - break; - } - #else - simde_float32 sum = SIMDE_FLOAT32_C(0.0); - - SIMDE_VECTORIZE_REDUCTION(+:sum) - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - sum += ((imm8 >> (i + 4)) & 1) ? (a_.f32[i] * b_.f32[i]) : SIMDE_FLOAT32_C(0.0); - } - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = ((imm8 >> i) & 1) ? 
sum : SIMDE_FLOAT32_C(0.0); - } - #endif - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - #if defined(HEDLEY_MCST_LCC_VERSION) - #define simde_mm_dp_ps(a, b, imm8) (__extension__ ({ \ - SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS \ - _mm_dp_ps((a), (b), (imm8)); \ - SIMDE_LCC_REVERT_DEPRECATED_WARNINGS \ - })) - #else - #define simde_mm_dp_ps(a, b, imm8) _mm_dp_ps(a, b, imm8) - #endif -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_dp_ps - #define _mm_dp_ps(a, b, imm8) simde_mm_dp_ps(a, b, imm8) -#endif - -#if defined(simde_mm_extract_epi8) -# undef simde_mm_extract_epi8 -#endif -SIMDE_FUNCTION_ATTRIBUTES -int8_t -simde_mm_extract_epi8 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m128i_private - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #if defined(SIMDE_BUG_GCC_95227) - (void) a_; - (void) imm8; - #endif - return vec_extract(a_.altivec_i8, imm8); - #else - return a_.i8[imm8 & 15]; - #endif -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_EXTRACT_EPI8) -# define simde_mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int8_t, _mm_extract_epi8(a, imm8)) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm_extract_epi8(a, imm8) vgetq_lane_s8(simde__m128i_to_neon_i8(a), imm8) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) -# define simde_mm_extract_epi8(a, imm8) wasm_u8x16_extract_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 15) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_extract_epi8 - #define _mm_extract_epi8(a, imm8) HEDLEY_STATIC_CAST(int, simde_mm_extract_epi8(a, imm8)) -#endif - -#if defined(simde_mm_extract_epi32) -# undef simde_mm_extract_epi32 -#endif -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_extract_epi32 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m128i_private - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - #if defined(SIMDE_BUG_GCC_95227) - (void) a_; - (void) imm8; - #endif - return vec_extract(a_.altivec_i32, imm8); - #else - return a_.i32[imm8 & 3]; - #endif -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) -# define simde_mm_extract_epi32(a, imm8) _mm_extract_epi32(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm_extract_epi32(a, imm8) vgetq_lane_s32(simde__m128i_to_neon_i32(a), imm8) -#elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) -# define simde_mm_extract_epi32(a, imm8) HEDLEY_STATIC_CAST(int32_t, vec_extract(simde__m128i_to_altivec_i32(a), imm8)) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) -# define simde_mm_extract_epi32(a, imm8) wasm_i32x4_extract_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 3) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_extract_epi32 - #define _mm_extract_epi32(a, imm8) simde_mm_extract_epi32(a, imm8) -#endif - -#if defined(simde_mm_extract_epi64) -# undef simde_mm_extract_epi64 -#endif -SIMDE_FUNCTION_ATTRIBUTES -int64_t -simde_mm_extract_epi64 (simde__m128i a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - simde__m128i_private - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - #if defined(SIMDE_BUG_GCC_95227) - (void) a_; - (void) imm8; - #endif - return vec_extract(a_.altivec_i64, imm8); - #else - return a_.i64[imm8 & 1]; - #endif -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64) -# define simde_mm_extract_epi64(a, imm8) _mm_extract_epi64(a, imm8) -#elif 
defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm_extract_epi64(a, imm8) vgetq_lane_s64(simde__m128i_to_neon_i64(a), imm8) -#elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) -# define simde_mm_extract_epi64(a, imm8) HEDLEY_STATIC_CAST(int64_t, vec_extract(simde__m128i_to_altivec_i64(a), imm8)) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #undef _mm_extract_epi64 - #define _mm_extract_epi64(a, imm8) simde_mm_extract_epi64(a, imm8) -#endif - -#if defined(simde_mm_extract_ps) -# undef simde_mm_extract_ps -#endif -SIMDE_FUNCTION_ATTRIBUTES -int32_t -simde_mm_extract_ps (simde__m128 a, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m128_private - a_ = simde__m128_to_private(a); - - return a_.i32[imm8 & 3]; -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - #define simde_mm_extract_ps(a, imm8) _mm_extract_ps(a, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_extract_ps(a, imm8) vgetq_lane_s32(simde__m128_to_neon_i32(a), imm8) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) - #define simde_mm_extract_ps(a, imm8) wasm_i32x4_extract_lane(simde__m128_to_wasm_v128((a)), (imm8) & 3) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_extract_ps - #define _mm_extract_ps(a, imm8) simde_mm_extract_ps(a, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_floor_pd (simde__m128d a) { - #if defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128d_from_wasm_v128(wasm_f64x2_floor(simde__m128d_to_wasm_v128(a))); - #endif - return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_NEG_INF); -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_floor_pd - #define _mm_floor_pd(a) simde_mm_floor_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_floor_ps (simde__m128 a) { - #if defined(SIMDE_WASM_SIMD128_NATIVE) - return simde__m128_from_wasm_v128(wasm_f32x4_floor(simde__m128_to_wasm_v128(a))); - #endif - return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEG_INF); -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_floor_ps - #define _mm_floor_ps(a) simde_mm_floor_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_floor_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_floor_sd(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(simde_math_floor) - r_.f64[0] = simde_math_floor(b_.f64[0]); - r_.f64[1] = a_.f64[1]; - #else - HEDLEY_UNREACHABLE(); - #endif - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_floor_sd - #define _mm_floor_sd(a, b) simde_mm_floor_sd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_floor_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_floor_ss(a, b); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_FAST_EXCEPTIONS) - return simde_mm_move_ss(a, simde_mm_floor_ps(b)); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - return simde_mm_move_ss(a, simde_mm_floor_ps(simde_x_mm_broadcastlow_ps(b))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(simde_math_floorf) - r_.f32[0] = simde_math_floorf(b_.f32[0]); - for (size_t i = 1 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = a_.f32[i]; - } - #else - HEDLEY_UNREACHABLE(); - #endif - - return 
simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_floor_ss - #define _mm_floor_ss(a, b) simde_mm_floor_ss(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_insert_epi8 (simde__m128i a, int i, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 15) { - simde__m128i_private - r_ = simde__m128i_to_private(a); - - r_.i8[imm8] = HEDLEY_STATIC_CAST(int8_t, i); - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - /* clang-3.8 returns an incompatible type, so we need the cast. MSVC - * can't handle the cast ("error C2440: 'type cast': cannot convert - * from '__m128i' to '__m128i'"). */ - #if defined(__clang__) - #define simde_mm_insert_epi8(a, i, imm8) HEDLEY_REINTERPRET_CAST(__m128i, _mm_insert_epi8(a, i, imm8)) - #else - #define simde_mm_insert_epi8(a, i, imm8) _mm_insert_epi8(a, i, imm8) - #endif -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm_insert_epi8(a, i, imm8) simde__m128i_from_neon_i8(vsetq_lane_s8(i, simde__m128i_to_neon_i8(a), imm8)) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) -# define simde_mm_insert_epi8(a, i, imm8) simde__m128i_from_wasm_v128(wasm_i8x16_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 15, HEDLEY_STATIC_CAST(int8_t, (i)))) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_insert_epi8 - #define _mm_insert_epi8(a, i, imm8) simde_mm_insert_epi8(a, i, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_insert_epi32 (simde__m128i a, int i, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 3) { - simde__m128i_private - r_ = simde__m128i_to_private(a); - - r_.i32[imm8] = HEDLEY_STATIC_CAST(int32_t, i); - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - #if defined(__clang__) - #define simde_mm_insert_epi32(a, i, imm8) HEDLEY_REINTERPRET_CAST(__m128i, _mm_insert_epi32(a, i, imm8)) - #else - #define simde_mm_insert_epi32(a, i, imm8) _mm_insert_epi32(a, i, imm8) - #endif -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm_insert_epi32(a, i, imm8) simde__m128i_from_neon_i32(vsetq_lane_s32(i, simde__m128i_to_neon_i32(a), imm8)) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) -# define simde_mm_insert_epi32(a, i, imm8) simde__m128i_from_wasm_v128(wasm_i32x4_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 3, (i))) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_insert_epi32 - #define _mm_insert_epi32(a, i, imm8) simde_mm_insert_epi32(a, i, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_insert_epi64 (simde__m128i a, int64_t i, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 1) { - #if defined(SIMDE_BUG_GCC_94482) - simde__m128i_private - a_ = simde__m128i_to_private(a); - - switch(imm8) { - case 0: - return simde_mm_set_epi64x(a_.i64[1], i); - break; - case 1: - return simde_mm_set_epi64x(i, a_.i64[0]); - break; - default: - HEDLEY_UNREACHABLE(); - break; - } - #else - simde__m128i_private - r_ = simde__m128i_to_private(a); - - r_.i64[imm8] = i; - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) && defined(SIMDE_ARCH_AMD64) -# define simde_mm_insert_epi64(a, i, imm8) _mm_insert_epi64(a, i, imm8) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) -# define simde_mm_insert_epi64(a, i, imm8) simde__m128i_from_neon_i64(vsetq_lane_s64(i, simde__m128i_to_neon_i64(a), imm8)) -#elif defined(SIMDE_WASM_SIMD128_NATIVE) -# define simde_mm_insert_epi64(a, i, imm8) 
simde__m128i_from_wasm_v128(wasm_i64x2_replace_lane(simde__m128i_to_wasm_v128((a)), (imm8) & 1, (i))) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #undef _mm_insert_epi64 - #define _mm_insert_epi64(a, i, imm8) simde_mm_insert_epi64(a, i, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_insert_ps (simde__m128 a, simde__m128 b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - float tmp1_ = b_.f32[(imm8 >> 6) & 3]; - a_.f32[(imm8 >> 4) & 3] = tmp1_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = ((imm8 >> i) & 1 ) ? SIMDE_FLOAT32_C(0.0) : a_.f32[i]; - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) -# define simde_mm_insert_ps(a, b, imm8) _mm_insert_ps(a, b, imm8) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_insert_ps - #define _mm_insert_ps(a, b, imm8) simde_mm_insert_ps(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_max_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI) - return _mm_max_epi8(a, b); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128i m = _mm_cmpgt_epi8(a, b); - return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vmaxq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_max(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = vec_max(a_.altivec_i8, b_.altivec_i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] > b_.i8[i] ? a_.i8[i] : b_.i8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_max_epi8 - #define _mm_max_epi8(a, b) simde_mm_max_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_max_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI) - return _mm_max_epi32(a, b); - #elif defined(SIMDE_X86_SSE2_NATIVE) - __m128i m = _mm_cmpgt_epi32(a, b); - return _mm_or_si128(_mm_and_si128(m, a), _mm_andnot_si128(m, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vmaxq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_max(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = vec_max(a_.altivec_i32, b_.altivec_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] > b_.i32[i] ? 
a_.i32[i] : b_.i32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_max_epi32 - #define _mm_max_epi32(a, b) simde_mm_max_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_max_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_max_epu16(a, b); - #elif defined(SIMDE_X86_SSE2_NATIVE) - /* https://github.com/simd-everywhere/simde/issues/855#issuecomment-881656284 */ - return _mm_add_epi16(b, _mm_subs_epu16(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vmaxq_u16(a_.neon_u16, b_.neon_u16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_max(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u16 = vec_max(a_.altivec_u16, b_.altivec_u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = a_.u16[i] > b_.u16[i] ? a_.u16[i] : b_.u16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_max_epu16 - #define _mm_max_epu16(a, b) simde_mm_max_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_max_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_max_epu32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vmaxq_u32(a_.neon_u32, b_.neon_u32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u32x4_max(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u32 = vec_max(a_.altivec_u32, b_.altivec_u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] > b_.u32[i] ? a_.u32[i] : b_.u32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_max_epu32 - #define _mm_max_epu32(a, b) simde_mm_max_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_min_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI) - return _mm_min_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vminq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_min(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = vec_min(a_.altivec_i8, b_.altivec_i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] < b_.i8[i] ? 
a_.i8[i] : b_.i8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_min_epi8 - #define _mm_min_epi8(a, b) simde_mm_min_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_min_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(__PGI) - return _mm_min_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vminq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_min(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = vec_min(a_.altivec_i32, b_.altivec_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] < b_.i32[i] ? a_.i32[i] : b_.i32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_min_epi32 - #define _mm_min_epi32(a, b) simde_mm_min_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_min_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_min_epu16(a, b); - #elif defined(SIMDE_X86_SSE2_NATIVE) - /* https://github.com/simd-everywhere/simde/issues/855#issuecomment-881656284 */ - return _mm_sub_epi16(a, _mm_subs_epu16(a, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vminq_u16(a_.neon_u16, b_.neon_u16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_min(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u16 = vec_min(a_.altivec_u16, b_.altivec_u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = a_.u16[i] < b_.u16[i] ? a_.u16[i] : b_.u16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_min_epu16 - #define _mm_min_epu16(a, b) simde_mm_min_epu16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_min_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_min_epu32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vminq_u32(a_.neon_u32, b_.neon_u32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u32x4_min(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_u32 = vec_min(a_.altivec_u32, b_.altivec_u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] < b_.u32[i] ? 
a_.u32[i] : b_.u32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_min_epu32 - #define _mm_min_epu32(a, b) simde_mm_min_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_minpos_epu16 (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_minpos_epu16(a); - #else - simde__m128i_private - r_ = simde__m128i_to_private(simde_mm_setzero_si128()), - a_ = simde__m128i_to_private(a); - - r_.u16[0] = UINT16_MAX; - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - if (a_.u16[i] < r_.u16[0]) { - r_.u16[0] = a_.u16[i]; - r_.u16[1] = HEDLEY_STATIC_CAST(uint16_t, i); - } - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_minpos_epu16 - #define _mm_minpos_epu16(a) simde_mm_minpos_epu16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mpsadbw_epu8 (simde__m128i a, simde__m128i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - const int a_offset = imm8 & 4; - const int b_offset = (imm8 & 3) << 2; - -#if defined(simde_math_abs) - for (int i = 0 ; i < HEDLEY_STATIC_CAST(int, (sizeof(r_.u16) / sizeof(r_.u16[0]))) ; i++) { - r_.u16[i] = - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 0] - b_.u8[b_offset + 0]))) + - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 1] - b_.u8[b_offset + 1]))) + - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 2] - b_.u8[b_offset + 2]))) + - HEDLEY_STATIC_CAST(uint16_t, simde_math_abs(HEDLEY_STATIC_CAST(int, a_.u8[a_offset + i + 3] - b_.u8[b_offset + 3]))); - } -#else - HEDLEY_UNREACHABLE(); -#endif - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) && !defined(SIMDE_BUG_PGI_30107) -# define simde_mm_mpsadbw_epu8(a, b, imm8) _mm_mpsadbw_epu8(a, b, imm8) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_mpsadbw_epu8 - #define _mm_mpsadbw_epu8(a, b, imm8) simde_mm_mpsadbw_epu8(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mul_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_mul_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - // vmull_s32 upcasts instead of masking, so we downcast. 
- int32x2_t a_lo = vmovn_s64(a_.neon_i64); - int32x2_t b_lo = vmovn_s64(b_.neon_i64); - r_.neon_i64 = vmull_s32(a_lo, b_lo); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_make( - wasm_i32x4_extract_lane(a_.wasm_v128, 0) * HEDLEY_STATIC_CAST(int64_t, wasm_i32x4_extract_lane(b_.wasm_v128, 0)), - wasm_i32x4_extract_lane(a_.wasm_v128, 2) * HEDLEY_STATIC_CAST(int64_t, wasm_i32x4_extract_lane(b_.wasm_v128, 2))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = - HEDLEY_STATIC_CAST(int64_t, a_.i32[i * 2]) * - HEDLEY_STATIC_CAST(int64_t, b_.i32[i * 2]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_mul_epi32 - #define _mm_mul_epi32(a, b) simde_mm_mul_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mullo_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_mullo_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vmulq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - (void) a_; - (void) b_; - r_.altivec_i32 = vec_mul(a_.altivec_i32, b_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_mul(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (HEDLEY_STATIC_CAST(uint64_t, (HEDLEY_STATIC_CAST(int64_t, a_.i32[i]) * HEDLEY_STATIC_CAST(int64_t, b_.i32[i]))) & 0xffffffff)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_mullo_epi32 - #define _mm_mullo_epi32(a, b) simde_mm_mullo_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_x_mm_mullo_epu32 (simde__m128i a, simde__m128i b) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vmulq_u32(a_.neon_u32, b_.neon_u32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = a_.u32 * b_.u32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] * b_.u32[i]; - } - #endif - - return simde__m128i_from_private(r_); -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_packus_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_packus_epi32(a, b); - #elif defined(SIMDE_X86_SSE2_NATIVE) - const __m128i max = _mm_set1_epi32(UINT16_MAX); - const __m128i tmpa = _mm_andnot_si128(_mm_srai_epi32(a, 31), a); - const __m128i tmpb = _mm_andnot_si128(_mm_srai_epi32(b, 31), b); - return - _mm_packs_epi32( - _mm_srai_epi32(_mm_slli_epi32(_mm_or_si128(tmpa, _mm_cmpgt_epi32(tmpa, max)), 16), 16), - _mm_srai_epi32(_mm_slli_epi32(_mm_or_si128(tmpb, _mm_cmpgt_epi32(tmpb, max)), 16), 16) - ); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - r_; - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - #if defined(SIMDE_BUG_CLANG_46840) - r_.neon_u16 = vqmovun_high_s32(vreinterpret_s16_u16(vqmovun_s32(a_.neon_i32)), b_.neon_i32); - #else - r_.neon_u16 = vqmovun_high_s32(vqmovun_s32(a_.neon_i32), b_.neon_i32); - #endif - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = - 
vcombine_u16( - vqmovun_s32(a_.neon_i32), - vqmovun_s32(b_.neon_i32) - ); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r_.altivec_u16 = vec_packsu(a_.altivec_i32, b_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_narrow_i32x4(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_CONVERT_VECTOR_) && HEDLEY_HAS_BUILTIN(__builtin_shufflevector) && defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - int32_t v SIMDE_VECTOR(32) = SIMDE_SHUFFLE_VECTOR_(32, 32, a_.i32, b_.i32, 0, 1, 2, 3, 4, 5, 6, 7); - - v &= ~(v >> 31); - v |= HEDLEY_REINTERPRET_CAST(__typeof__(v), v > UINT16_MAX); - - SIMDE_CONVERT_VECTOR_(r_.i16, v); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - int32_t v = (i < (sizeof(a_.i32) / sizeof(a_.i32[0]))) ? a_.i32[i] : b_.i32[i & 3]; - r_.u16[i] = (v < 0) ? UINT16_C(0) : ((v > UINT16_MAX) ? UINT16_MAX : HEDLEY_STATIC_CAST(uint16_t, v)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_packus_epi32 - #define _mm_packus_epi32(a, b) simde_mm_packus_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_round_sd (simde__m128d a, simde__m128d b, int rounding) - SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) { - simde__m128d_private - r_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { - #if defined(simde_math_nearbyint) - case SIMDE_MM_FROUND_TO_NEAREST_INT: - case SIMDE_MM_FROUND_CUR_DIRECTION: - r_.f64[0] = simde_math_nearbyint(b_.f64[0]); - break; - #endif - - #if defined(simde_math_floor) - case SIMDE_MM_FROUND_TO_NEG_INF: - r_.f64[0] = simde_math_floor(b_.f64[0]); - break; - #endif - - #if defined(simde_math_ceil) - case SIMDE_MM_FROUND_TO_POS_INF: - r_.f64[0] = simde_math_ceil(b_.f64[0]); - break; - #endif - - #if defined(simde_math_trunc) - case SIMDE_MM_FROUND_TO_ZERO: - r_.f64[0] = simde_math_trunc(b_.f64[0]); - break; - #endif - - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - } - - return simde__m128d_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) -# define simde_mm_round_sd(a, b, rounding) _mm_round_sd(a, b, rounding) -#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) && defined(SIMDE_FAST_EXCEPTIONS) -# define simde_mm_round_sd(a, b, rounding) simde_mm_move_sd(a, simde_mm_round_pd(b, rounding)) -#elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) - #define simde_mm_round_sd(a, b, rounding) simde_mm_move_sd(a, simde_mm_round_pd(simde_x_mm_broadcastlow_pd(b), rounding)) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_round_sd - #define _mm_round_sd(a, b, rounding) simde_mm_round_sd(a, b, rounding) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_round_ss (simde__m128 a, simde__m128 b, int rounding) - SIMDE_REQUIRE_CONSTANT_RANGE(rounding, 0, 15) { - simde__m128_private - r_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - switch (rounding & ~SIMDE_MM_FROUND_NO_EXC) { - #if defined(simde_math_nearbyintf) - case SIMDE_MM_FROUND_TO_NEAREST_INT: - case SIMDE_MM_FROUND_CUR_DIRECTION: - r_.f32[0] = simde_math_nearbyintf(b_.f32[0]); - break; - #endif - - #if defined(simde_math_floorf) - case SIMDE_MM_FROUND_TO_NEG_INF: - r_.f32[0] = simde_math_floorf(b_.f32[0]); - break; - #endif - - #if defined(simde_math_ceilf) - case SIMDE_MM_FROUND_TO_POS_INF: - r_.f32[0] = simde_math_ceilf(b_.f32[0]); - break; - #endif - - #if defined(simde_math_truncf) - case SIMDE_MM_FROUND_TO_ZERO: - 
r_.f32[0] = simde_math_truncf(b_.f32[0]); - break; - #endif - - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_undefined_pd()); - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_SSE4_1_NATIVE) - #define simde_mm_round_ss(a, b, rounding) _mm_round_ss(a, b, rounding) -#elif SIMDE_NATURAL_VECTOR_SIZE > 0 && defined(SIMDE_FAST_EXCEPTIONS) - #define simde_mm_round_ss(a, b, rounding) simde_mm_move_ss((a), simde_mm_round_ps((b), (rounding))) -#elif SIMDE_NATURAL_VECTOR_SIZE > 0 - #define simde_mm_round_ss(a, b, rounding) simde_mm_move_ss((a), simde_mm_round_ps(simde_x_mm_broadcastlow_ps(b), (rounding))) -#endif -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_round_ss - #define _mm_round_ss(a, b, rounding) simde_mm_round_ss(a, b, rounding) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_stream_load_si128 (const simde__m128i* mem_addr) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_stream_load_si128(HEDLEY_CONST_CAST(simde__m128i*, mem_addr)); - #elif HEDLEY_HAS_BUILTIN(__builtin_nontemporal_load) && ( \ - defined(SIMDE_ARM_NEON_A32V7_NATIVE) || defined(SIMDE_VECTOR_SUBSCRIPT) || \ - defined(SIMDE_WASM_SIMD128_NATIVE) || defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || \ - defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)) - return __builtin_nontemporal_load(mem_addr); - #else - return simde_mm_load_si128(mem_addr); - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_stream_load_si128 - #define _mm_stream_load_si128(mem_addr) simde_mm_stream_load_si128(mem_addr) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_test_all_ones (simde__m128i a) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_test_all_ones(a); - #else - simde__m128i_private a_ = simde__m128i_to_private(a); - int r; - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r = vec_all_eq(a_.altivec_i32, vec_splats(~0)); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r = ((vgetq_lane_s64(a_.neon_i64, 0) & vgetq_lane_s64(a_.neon_i64, 1)) == ~HEDLEY_STATIC_CAST(int64_t, 0)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = HEDLEY_STATIC_CAST(unsigned long long, wasm_i64x2_extract_lane(a_.wasm_v128, 0) & wasm_i64x2_extract_lane(a_.wasm_v128, 1)) == 0xFFFFFFFFFFFFFFFFull; - #else - int_fast32_t r_ = ~HEDLEY_STATIC_CAST(int_fast32_t, 0); - - SIMDE_VECTORIZE_REDUCTION(&:r_) - for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { - r_ &= a_.i32f[i]; - } - - r = (r_ == ~HEDLEY_STATIC_CAST(int_fast32_t, 0)); - #endif - - return r; - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_test_all_ones - #define _mm_test_all_ones(a) simde_mm_test_all_ones(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_test_all_zeros (simde__m128i a, simde__m128i mask) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_test_all_zeros(a, mask); - #else - simde__m128i_private tmp_ = simde__m128i_to_private(simde_mm_and_si128(a, mask)); - int r; - - #if defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - r = vec_all_eq(tmp_.altivec_i32, vec_splats(0)); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r = !(vgetq_lane_s64(tmp_.neon_i64, 0) | vgetq_lane_s64(tmp_.neon_i64, 1)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r = (wasm_i64x2_extract_lane(tmp_.wasm_v128, 0) | wasm_i64x2_extract_lane(tmp_.wasm_v128, 1)) == 0; - #else - int_fast32_t r_ = HEDLEY_STATIC_CAST(int_fast32_t, 0); - - SIMDE_VECTORIZE_REDUCTION(|:r_) - for (size_t i = 0 ; i < (sizeof(tmp_.i32f) / sizeof(tmp_.i32f[0])) ; i++) { - r_ |= tmp_.i32f[i]; - } - - r = !r_; - #endif - - return r; - #endif -} 
-#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_test_all_zeros - #define _mm_test_all_zeros(a, mask) simde_mm_test_all_zeros(a, mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_test_mix_ones_zeros (simde__m128i a, simde__m128i mask) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_test_mix_ones_zeros(a, mask); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - mask_ = simde__m128i_to_private(mask); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int64x2_t s640 = vandq_s64(a_.neon_i64, mask_.neon_i64); - int64x2_t s641 = vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(a_.neon_i64))), mask_.neon_i64); - return (((vgetq_lane_s64(s640, 0) | vgetq_lane_s64(s640, 1)) & (vgetq_lane_s64(s641, 0) | vgetq_lane_s64(s641, 1)))!=0); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_v128_and(a_.wasm_v128, mask_.wasm_v128); - long long c0 = wasm_i64x2_extract_lane(m, 0); - long long c1 = wasm_i64x2_extract_lane(m, 1); - long long ones = c0 | c1; - long long zeros = ~(c0 & c1); - return ones && zeros; - #else - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) - if (((a_.u64[i] & mask_.u64[i]) != 0) && ((~a_.u64[i] & mask_.u64[i]) != 0)) - return 1; - - return 0; - #endif - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_test_mix_ones_zeros - #define _mm_test_mix_ones_zeros(a, mask) simde_mm_test_mix_ones_zeros(a, mask) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_testc_si128 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_testc_si128(a, b); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int64x2_t s64 = vbicq_s64(b_.neon_i64, a_.neon_i64); - return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); - return (wasm_i64x2_extract_lane(m, 0) | wasm_i64x2_extract_lane(m, 1)) == 0; - #else - int_fast32_t r = 0; - - SIMDE_VECTORIZE_REDUCTION(|:r) - for (size_t i = 0 ; i < (sizeof(a_.i32f) / sizeof(a_.i32f[0])) ; i++) { - r |= ~a_.i32f[i] & b_.i32f[i]; - } - - return HEDLEY_STATIC_CAST(int, !r); - #endif - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_testc_si128 - #define _mm_testc_si128(a, b) simde_mm_testc_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_testnzc_si128 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_testnzc_si128(a, b); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int64x2_t s640 = vandq_s64(b_.neon_i64, a_.neon_i64); - int64x2_t s641 = vbicq_s64(b_.neon_i64, a_.neon_i64); - return !( !(vgetq_lane_s64(s641, 0) || vgetq_lane_s64(s641, 1)) \ - || !(vgetq_lane_s64(s640, 0) || vgetq_lane_s64(s640, 1)) ); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m1 = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); - v128_t m2 = wasm_v128_andnot(b_.wasm_v128, a_.wasm_v128); - return (wasm_i64x2_extract_lane(m1, 0) | wasm_i64x2_extract_lane(m1, 1)) \ - && (wasm_i64x2_extract_lane(m2, 0) | wasm_i64x2_extract_lane(m2, 1)); - #else - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - if (((a_.u64[i] & b_.u64[i]) != 0) && ((~a_.u64[i] & b_.u64[i]) != 0)) - return 1; - } - - return 0; - #endif - #endif -} -#if 
defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_testnzc_si128 - #define _mm_testnzc_si128(a, b) simde_mm_testnzc_si128(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_testz_si128 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_1_NATIVE) - return _mm_testz_si128(a, b); - #else - simde__m128i_private - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int64x2_t s64 = vandq_s64(a_.neon_i64, b_.neon_i64); - return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t m = wasm_v128_and(a_.wasm_v128, b_.wasm_v128); - return (wasm_i64x2_extract_lane(m, 0) | wasm_i64x2_extract_lane(m, 1)) == 0; - #elif defined(SIMDE_HAVE_INT128_) - if ((a_.u128[0] & b_.u128[0]) == 0) { - return 1; - } - return 0; - #else - for (size_t i = 0 ; i < (sizeof(a_.u64) / sizeof(a_.u64[0])) ; i++) { - if ((a_.u64[i] & b_.u64[i]) > 0) - return 0; - } - #endif - - return 1; - #endif -} -#if defined(SIMDE_X86_SSE4_1_ENABLE_NATIVE_ALIASES) - #undef _mm_testz_si128 - #define _mm_testz_si128(a, b) simde_mm_testz_si128(a, b) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_SSE4_1_H) */ diff --git a/ffi-deps/simde/simde/x86/sse4.2.h b/ffi-deps/simde/simde/x86/sse4.2.h deleted file mode 100644 index ae9e756..0000000 --- a/ffi-deps/simde/simde/x86/sse4.2.h +++ /dev/null @@ -1,381 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2017 Evan Nemerson - * 2020 Hidayat Khan - */ - -#if !defined(SIMDE_X86_SSE4_2_H) -#define SIMDE_X86_SSE4_2_H - -#include "sse4.1.h" - -#if defined(__ARM_ACLE) || (defined(__GNUC__) && defined(__ARM_FEATURE_CRC32)) - #include <arm_acle.h> -#endif - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -#if defined(SIMDE_X86_SSE4_2_NATIVE) - #define SIMDE_SIDD_UBYTE_OPS _SIDD_UBYTE_OPS - #define SIMDE_SIDD_UWORD_OPS _SIDD_UWORD_OPS - #define SIMDE_SIDD_SBYTE_OPS _SIDD_SBYTE_OPS - #define SIMDE_SIDD_SWORD_OPS _SIDD_SWORD_OPS - #define SIMDE_SIDD_CMP_EQUAL_ANY _SIDD_CMP_EQUAL_ANY - #define SIMDE_SIDD_CMP_RANGES _SIDD_CMP_RANGES - #define SIMDE_SIDD_CMP_EQUAL_EACH _SIDD_CMP_EQUAL_EACH - #define SIMDE_SIDD_CMP_EQUAL_ORDERED _SIDD_CMP_EQUAL_ORDERED - #define SIMDE_SIDD_POSITIVE_POLARITY _SIDD_POSITIVE_POLARITY - #define SIMDE_SIDD_NEGATIVE_POLARITY _SIDD_NEGATIVE_POLARITY - #define SIMDE_SIDD_MASKED_POSITIVE_POLARITY _SIDD_MASKED_POSITIVE_POLARITY - #define SIMDE_SIDD_MASKED_NEGATIVE_POLARITY _SIDD_MASKED_NEGATIVE_POLARITY - #define SIMDE_SIDD_LEAST_SIGNIFICANT _SIDD_LEAST_SIGNIFICANT - #define SIMDE_SIDD_MOST_SIGNIFICANT _SIDD_MOST_SIGNIFICANT - #define SIMDE_SIDD_BIT_MASK _SIDD_BIT_MASK - #define SIMDE_SIDD_UNIT_MASK _SIDD_UNIT_MASK -#else - #define SIMDE_SIDD_UBYTE_OPS 0x00 - #define SIMDE_SIDD_UWORD_OPS 0x01 - #define SIMDE_SIDD_SBYTE_OPS 0x02 - #define SIMDE_SIDD_SWORD_OPS 0x03 - #define SIMDE_SIDD_CMP_EQUAL_ANY 0x00 - #define SIMDE_SIDD_CMP_RANGES 0x04 - #define SIMDE_SIDD_CMP_EQUAL_EACH 0x08 - #define SIMDE_SIDD_CMP_EQUAL_ORDERED 0x0c - #define SIMDE_SIDD_POSITIVE_POLARITY 0x00 - #define SIMDE_SIDD_NEGATIVE_POLARITY 0x10 - #define SIMDE_SIDD_MASKED_POSITIVE_POLARITY 0x20 - #define SIMDE_SIDD_MASKED_NEGATIVE_POLARITY 0x30 - #define SIMDE_SIDD_LEAST_SIGNIFICANT 0x00 - #define SIMDE_SIDD_MOST_SIGNIFICANT 0x40 - #define SIMDE_SIDD_BIT_MASK 0x00 - #define SIMDE_SIDD_UNIT_MASK 0x40 -#endif - -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) && !defined(_SIDD_UBYTE_OPS) - #define _SIDD_UBYTE_OPS SIMDE_SIDD_UBYTE_OPS - #define _SIDD_UWORD_OPS SIMDE_SIDD_UWORD_OPS - #define _SIDD_SBYTE_OPS SIMDE_SIDD_SBYTE_OPS - #define _SIDD_SWORD_OPS SIMDE_SIDD_SWORD_OPS - #define _SIDD_CMP_EQUAL_ANY SIMDE_SIDD_CMP_EQUAL_ANY - #define _SIDD_CMP_RANGES SIMDE_SIDD_CMP_RANGES - #define _SIDD_CMP_EQUAL_EACH SIMDE_SIDD_CMP_EQUAL_EACH - #define _SIDD_CMP_EQUAL_ORDERED SIMDE_SIDD_CMP_EQUAL_ORDERED - #define _SIDD_POSITIVE_POLARITY SIMDE_SIDD_POSITIVE_POLARITY - #define _SIDD_NEGATIVE_POLARITY SIMDE_SIDD_NEGATIVE_POLARITY - #define _SIDD_MASKED_POSITIVE_POLARITY SIMDE_SIDD_MASKED_POSITIVE_POLARITY - #define _SIDD_MASKED_NEGATIVE_POLARITY SIMDE_SIDD_MASKED_NEGATIVE_POLARITY - #define _SIDD_LEAST_SIGNIFICANT SIMDE_SIDD_LEAST_SIGNIFICANT - #define _SIDD_MOST_SIGNIFICANT SIMDE_SIDD_MOST_SIGNIFICANT - #define _SIDD_BIT_MASK SIMDE_SIDD_BIT_MASK - #define _SIDD_UNIT_MASK SIMDE_SIDD_UNIT_MASK -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int simde_mm_cmpestrs (simde__m128i a, int la, simde__m128i b, int lb, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - #if !defined(HEDLEY_PGI_VERSION) - /* https://www.pgroup.com/userforum/viewtopic.php?f=4&p=27590&sid=cf89f8bf30be801831fe4a2ff0a2fa6c */ - (void) a; - (void) b; - #endif - (void) la; - (void) lb; - return la <= ((128 / ((imm8 & SIMDE_SIDD_UWORD_OPS) ?
16 : 8)) - 1); -} -#if defined(SIMDE_X86_SSE4_2_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm_cmpestrs(a, la, b, lb, imm8) \ - _mm_cmpestrs( \ - HEDLEY_REINTERPRET_CAST(__v16qi, a), la, \ - HEDLEY_REINTERPRET_CAST(__v16qi, b), lb, \ - imm8) - #else - #define simde_mm_cmpestrs(a, la, b, lb, imm8) _mm_cmpestrs(a, la, b, lb, imm8) - #endif -#endif -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpestrs - #define _mm_cmpestrs(a, la, b, lb, imm8) simde_mm_cmpestrs(a, la, b, lb, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int simde_mm_cmpestrz (simde__m128i a, int la, simde__m128i b, int lb, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 255) { - #if !defined(HEDLEY_PGI_VERSION) - /* https://www.pgroup.com/userforum/viewtopic.php?f=4&p=27590&sid=cf89f8bf30be801831fe4a2ff0a2fa6c */ - (void) a; - (void) b; - #endif - (void) la; - (void) lb; - return lb <= ((128 / ((imm8 & SIMDE_SIDD_UWORD_OPS) ? 16 : 8)) - 1); -} -#if defined(SIMDE_X86_SSE4_2_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm_cmpestrz(a, la, b, lb, imm8) \ - _mm_cmpestrz( \ - HEDLEY_REINTERPRET_CAST(__v16qi, a), la, \ - HEDLEY_REINTERPRET_CAST(__v16qi, b), lb, \ - imm8) - #else - #define simde_mm_cmpestrz(a, la, b, lb, imm8) _mm_cmpestrz(a, la, b, lb, imm8) - #endif -#endif -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpestrz - #define _mm_cmpestrz(a, la, b, lb, imm8) simde_mm_cmpestrz(a, la, b, lb, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmpgt_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSE4_2_NATIVE) - return _mm_cmpgt_epi64(a, b); - #elif defined(SIMDE_X86_SSE2_NATIVE) - /* https://stackoverflow.com/a/65175746/501126 */ - __m128i r = _mm_and_si128(_mm_cmpeq_epi32(a, b), _mm_sub_epi64(b, a)); - r = _mm_or_si128(r, _mm_cmpgt_epi32(a, b)); - return _mm_shuffle_epi32(r, _MM_SHUFFLE(3, 3, 1, 1)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcgtq_s64(a_.neon_i64, b_.neon_i64); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - /* https://stackoverflow.com/a/65223269/501126 */ - r_.neon_i64 = vshrq_n_s64(vqsubq_s64(b_.neon_i64, a_.neon_i64), 63); - #elif defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) - r_.altivec_u64 = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned long long), vec_cmpgt(a_.altivec_i64, b_.altivec_i64)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x2_gt(a_.wasm_v128, b_.wasm_v128); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 > b_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] > b_.i64[i]) ? 
~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpgt_epi64 - #define _mm_cmpgt_epi64(a, b) simde_mm_cmpgt_epi64(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_cmpistrs_8_(simde__m128i a) { - simde__m128i_private a_= simde__m128i_to_private(a); - const int upper_bound = (128 / 8) - 1; - int a_invalid = 0; - SIMDE_VECTORIZE - for (int i = 0 ; i <= upper_bound ; i++) { - if(!a_.i8[i]) - a_invalid = 1; - } - return a_invalid; -} - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_cmpistrs_16_(simde__m128i a) { - simde__m128i_private a_= simde__m128i_to_private(a); - const int upper_bound = (128 / 16) - 1; - int a_invalid = 0; - SIMDE_VECTORIZE - for (int i = 0 ; i <= upper_bound ; i++) { - if(!a_.i16[i]) - a_invalid = 1; - } - return a_invalid; -} - -#if defined(SIMDE_X86_SSE4_2_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm_cmpistrs(a, b, imm8) \ - _mm_cmpistrs( \ - HEDLEY_REINTERPRET_CAST(__v16qi, a), \ - HEDLEY_REINTERPRET_CAST(__v16qi, b), \ - imm8) - #else - #define simde_mm_cmpistrs(a, b, imm8) _mm_cmpistrs(a, b, imm8) - #endif -#else - #define simde_mm_cmpistrs(a, b, imm8) \ - (((imm8) & SIMDE_SIDD_UWORD_OPS) \ - ? simde_mm_cmpistrs_16_((a)) \ - : simde_mm_cmpistrs_8_((a))) -#endif -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpistrs - #define _mm_cmpistrs(a, b, imm8) simde_mm_cmpistrs(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_cmpistrz_8_(simde__m128i b) { - simde__m128i_private b_= simde__m128i_to_private(b); - const int upper_bound = (128 / 8) - 1; - int b_invalid = 0; - SIMDE_VECTORIZE - for (int i = 0 ; i <= upper_bound ; i++) { - if(!b_.i8[i]) - b_invalid = 1; - } - return b_invalid; -} - -SIMDE_FUNCTION_ATTRIBUTES -int -simde_mm_cmpistrz_16_(simde__m128i b) { - simde__m128i_private b_= simde__m128i_to_private(b); - const int upper_bound = (128 / 16) - 1; - int b_invalid = 0; - SIMDE_VECTORIZE - for (int i = 0 ; i <= upper_bound ; i++) { - if(!b_.i16[i]) - b_invalid = 1; - } - return b_invalid; -} - -#if defined(SIMDE_X86_SSE4_2_NATIVE) - #if defined(__clang__) && !SIMDE_DETECT_CLANG_VERSION_CHECK(3,8,0) - #define simde_mm_cmpistrz(a, b, imm8) \ - _mm_cmpistrz( \ - HEDLEY_REINTERPRET_CAST(__v16qi, a), \ - HEDLEY_REINTERPRET_CAST(__v16qi, b), \ - imm8) - #else - #define simde_mm_cmpistrz(a, b, imm8) _mm_cmpistrz(a, b, imm8) - #endif -#else - #define simde_mm_cmpistrz(a, b, imm8) \ - (((imm8) & SIMDE_SIDD_UWORD_OPS) \ - ? 
simde_mm_cmpistrz_16_((b)) \ - : simde_mm_cmpistrz_8_((b))) -#endif -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) - #undef _mm_cmpistrz - #define _mm_cmpistrz(a, b, imm8) simde_mm_cmpistrz(a, b, imm8) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -uint32_t -simde_mm_crc32_u8(uint32_t prevcrc, uint8_t v) { - #if defined(SIMDE_X86_SSE4_2_NATIVE) - return _mm_crc32_u8(prevcrc, v); - #else - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_CRC32) - return __crc32cb(prevcrc, v); - #else - uint32_t crc = prevcrc; - crc ^= v; - for(int bit = 0 ; bit < 8 ; bit++) { - if (crc & 1) - crc = (crc >> 1) ^ UINT32_C(0x82f63b78); - else - crc = (crc >> 1); - } - return crc; - #endif - #endif -} -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) - #define _mm_crc32_u8(prevcrc, v) simde_mm_crc32_u8(prevcrc, v) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -uint32_t -simde_mm_crc32_u16(uint32_t prevcrc, uint16_t v) { - #if defined(SIMDE_X86_SSE4_2_NATIVE) - return _mm_crc32_u16(prevcrc, v); - #else - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_CRC32) - return __crc32ch(prevcrc, v); - #else - uint32_t crc = prevcrc; - crc = simde_mm_crc32_u8(crc, v & 0xff); - crc = simde_mm_crc32_u8(crc, (v >> 8) & 0xff); - return crc; - #endif - #endif -} -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) - #define _mm_crc32_u16(prevcrc, v) simde_mm_crc32_u16(prevcrc, v) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -uint32_t -simde_mm_crc32_u32(uint32_t prevcrc, uint32_t v) { - #if defined(SIMDE_X86_SSE4_2_NATIVE) - return _mm_crc32_u32(prevcrc, v); - #else - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_CRC32) - return __crc32cw(prevcrc, v); - #else - uint32_t crc = prevcrc; - crc = simde_mm_crc32_u16(crc, v & 0xffff); - crc = simde_mm_crc32_u16(crc, (v >> 16) & 0xffff); - return crc; - #endif - #endif -} -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) - #define _mm_crc32_u32(prevcrc, v) simde_mm_crc32_u32(prevcrc, v) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -uint64_t -simde_mm_crc32_u64(uint64_t prevcrc, uint64_t v) { - #if defined(SIMDE_X86_SSE4_2_NATIVE) && defined(SIMDE_ARCH_AMD64) - return _mm_crc32_u64(prevcrc, v); - #else - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) && defined(__ARM_FEATURE_CRC32) - return __crc32cd(HEDLEY_STATIC_CAST(uint32_t, prevcrc), v); - #else - uint64_t crc = prevcrc; - crc = simde_mm_crc32_u32(HEDLEY_STATIC_CAST(uint32_t, crc), v & 0xffffffff); - crc = simde_mm_crc32_u32(HEDLEY_STATIC_CAST(uint32_t, crc), (v >> 32) & 0xffffffff); - return crc; - #endif - #endif -} -#if defined(SIMDE_X86_SSE4_2_ENABLE_NATIVE_ALIASES) || (defined(SIMDE_ENABLE_NATIVE_ALIASES) && !defined(SIMDE_ARCH_AMD64)) - #define _mm_crc32_u64(prevcrc, v) simde_mm_crc32_u64(prevcrc, v) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_SSE4_2_H) */ diff --git a/ffi-deps/simde/simde/x86/ssse3.h b/ffi-deps/simde/simde/x86/ssse3.h deleted file mode 100644 index 6c4c12d..0000000 --- a/ffi-deps/simde/simde/x86/ssse3.h +++ /dev/null @@ -1,1057 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The 
above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - * - * Copyright: - * 2017-2020 Evan Nemerson - */ - -#if !defined(SIMDE_X86_SSSE3_H) -#define SIMDE_X86_SSSE3_H - -#include "sse3.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_abs_epi8 (simde__m128i a) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_abs_epi8(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return _mm_min_epu8(a, _mm_sub_epi8(_mm_setzero_si128(), a)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vabsq_s8(a_.neon_i8); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i8 = vec_abs(a_.altivec_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_abs(a_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, (a_.i8[i] < 0) ? (- a_.i8[i]) : a_.i8[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_abs_epi8(a) simde_mm_abs_epi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_abs_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_abs_epi16(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vabsq_s16(a_.neon_i16); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i16 = vec_abs(a_.altivec_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_abs(a_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.i16[i] < 0) ? 
(- a_.i16[i]) : a_.i16[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_abs_epi16(a) simde_mm_abs_epi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_abs_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_abs_epi32(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - const __m128i m = _mm_cmpgt_epi32(_mm_setzero_si128(), a); - return _mm_sub_epi32(_mm_xor_si128(a, m), m); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vabsq_s32(a_.neon_i32); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE) - r_.altivec_i32 = vec_abs(a_.altivec_i32); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_abs(a_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - #if defined(_MSC_VER) - HEDLEY_DIAGNOSTIC_PUSH - #pragma warning(disable:4146) - #endif - r_.u32[i] = (a_.i32[i] < 0) ? (- HEDLEY_STATIC_CAST(uint32_t, a_.i32[i])) : HEDLEY_STATIC_CAST(uint32_t, a_.i32[i]); - #if defined(_MSC_VER) - HEDLEY_DIAGNOSTIC_POP - #endif - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_abs_epi32(a) simde_mm_abs_epi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_abs_pi8 (simde__m64 a) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_abs_pi8(a); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vabs_s8(a_.neon_i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, (a_.i8[i] < 0) ? (- a_.i8[i]) : a_.i8[i]); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_abs_pi8(a) simde_mm_abs_pi8(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_abs_pi16 (simde__m64 a) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_abs_pi16(a); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vabs_s16(a_.neon_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, (a_.i16[i] < 0) ? (- a_.i16[i]) : a_.i16[i]); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_abs_pi16(a) simde_mm_abs_pi16(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_abs_pi32 (simde__m64 a) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_abs_pi32(a); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vabs_s32(a_.neon_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, (a_.i32[i] < 0) ? 
(- a_.i32[i]) : a_.i32[i]); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_abs_pi32(a) simde_mm_abs_pi32(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_alignr_epi8 (simde__m128i a, simde__m128i b, int count) - SIMDE_REQUIRE_CONSTANT_RANGE(count, 0, 255) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - if (HEDLEY_UNLIKELY(count > 31)) - return simde_mm_setzero_si128(); - - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - const int srcpos = count + HEDLEY_STATIC_CAST(int, i); - if (srcpos > 31) { - r_.i8[i] = 0; - } else if (srcpos > 15) { - r_.i8[i] = a_.i8[(srcpos) & 15]; - } else { - r_.i8[i] = b_.i8[srcpos]; - } - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_SSSE3_NATIVE) - #define simde_mm_alignr_epi8(a, b, count) _mm_alignr_epi8(a, b, count) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_alignr_epi8(a, b, count) \ - ( \ - ((count) > 31) \ - ? simde__m128i_from_neon_i8(vdupq_n_s8(0)) \ - : ( \ - ((count) > 15) \ - ? (simde__m128i_from_neon_i8(vextq_s8(simde__m128i_to_neon_i8(a), vdupq_n_s8(0), (count) & 15))) \ - : (simde__m128i_from_neon_i8(vextq_s8(simde__m128i_to_neon_i8(b), simde__m128i_to_neon_i8(a), ((count) & 15)))))) -#endif -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) - #define _mm_alignr_epi8(a, b, count) simde_mm_alignr_epi8(a, b, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_alignr_pi8 (simde__m64 a, simde__m64 b, const int count) - SIMDE_REQUIRE_CONSTANT(count) { - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - if (HEDLEY_UNLIKELY(count > 15)) - return simde_mm_setzero_si64(); - - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - const int srcpos = count + HEDLEY_STATIC_CAST(int, i); - if (srcpos > 15) { - r_.i8[i] = 0; - } else if (srcpos > 7) { - r_.i8[i] = a_.i8[(srcpos) & 7]; - } else { - r_.i8[i] = b_.i8[srcpos]; - } - } - - return simde__m64_from_private(r_); -} -#if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) -# define simde_mm_alignr_pi8(a, b, count) _mm_alignr_pi8(a, b, count) -#elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - #define simde_mm_alignr_pi8(a, b, count) \ - ( \ - ((count) > 15) \ - ? simde__m64_from_neon_i8(vdup_n_s8(0)) \ - : ( \ - ((count) > 7) \ - ? (simde__m64_from_neon_i8(vext_s8(simde__m64_to_neon_i8(a), vdup_n_s8(0), (count) & 7))) \ - : (simde__m64_from_neon_i8(vext_s8(simde__m64_to_neon_i8(b), simde__m64_to_neon_i8(a), ((count) & 7)))))) -#endif -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_alignr_pi8(a, b, count) simde_mm_alignr_pi8(a, b, count) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_shuffle_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_shuffle_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i8 = vqtbl1q_s8(a_.neon_i8, vandq_u8(b_.neon_u8, vdupq_n_u8(0x8F))); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - /* Mask out the bits we're not interested in. vtbl will result in 0 - * for any values outside of [0, 15], so if the high bit is set it - * will return 0, just like in SSSE3. 
*/ - b_.neon_i8 = vandq_s8(b_.neon_i8, vdupq_n_s8(HEDLEY_STATIC_CAST(int8_t, (1 << 7) | 15))); - - /* Convert a from an int8x16_t to an int8x8x2_t */ - int8x8x2_t i; - i.val[0] = vget_low_s8(a_.neon_i8); - i.val[1] = vget_high_s8(a_.neon_i8); - - /* Table lookups */ - int8x8_t l = vtbl2_s8(i, vget_low_s8(b_.neon_i8)); - int8x8_t h = vtbl2_s8(i, vget_high_s8(b_.neon_i8)); - - r_.neon_i8 = vcombine_s8(l, h); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - /* This is a bit ugly because of the casts and the awful type - * macros (SIMDE_POWER_ALTIVEC_VECTOR), but it's really just - * vec_sel(vec_perm(a, a, b), 0, vec_cmplt(b, 0)) */ - SIMDE_POWER_ALTIVEC_VECTOR(signed char) z = { 0, }; - SIMDE_POWER_ALTIVEC_VECTOR(signed char) msb_mask = HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(signed char), vec_cmplt(b_.altivec_i8, z)); - SIMDE_POWER_ALTIVEC_VECTOR(signed char) c = vec_perm(a_.altivec_i8, a_.altivec_i8, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), b_.altivec_i8)); - r_.altivec_i8 = vec_sel(c, z, HEDLEY_REINTERPRET_CAST(SIMDE_POWER_ALTIVEC_VECTOR(unsigned char), msb_mask)); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x16_swizzle( - a_.wasm_v128, wasm_v128_and(b_.wasm_v128, wasm_i8x16_splat(0x8F))); - #else - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[b_.i8[i] & 15] & (~(b_.i8[i]) >> 7); - } - #endif - - return simde__m128i_from_private(r_); -#endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_shuffle_epi8(a, b) simde_mm_shuffle_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_shuffle_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_shuffle_pi8(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - b_.neon_i8 = vand_s8(b_.neon_i8, vdup_n_s8(HEDLEY_STATIC_CAST(int8_t, (1 << 7) | 7))); - r_.neon_i8 = vtbl1_s8(a_.neon_i8, b_.neon_i8); - #else - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.i8[i] = a_.i8[b_.i8[i] & 7] & (~(b_.i8[i]) >> 7); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_shuffle_pi8(a, b) simde_mm_shuffle_pi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_hadd_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_hadd_epi16(a, b); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return simde__m128i_from_neon_i16(vpaddq_s16(simde__m128i_to_neon_i16(a), simde__m128i_to_neon_i16(b))); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x8x2_t t = vuzpq_s16(simde__m128i_to_neon_i16(a), simde__m128i_to_neon_i16(b)); - return simde__m128i_from_neon_i16(vaddq_s16(t.val[0], t.val[1])); - #else - return simde_mm_add_epi16(simde_x_mm_deinterleaveeven_epi16(a, b), simde_x_mm_deinterleaveodd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hadd_epi16(a, b) simde_mm_hadd_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_hadd_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_hadd_epi32(a, b); - #elif defined(SIMDE_ARM_NEON_A64V8_NATIVE) - return simde__m128i_from_neon_i32(vpaddq_s32(simde__m128i_to_neon_i32(a), simde__m128i_to_neon_i32(b))); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x4x2_t t = 
vuzpq_s32(simde__m128i_to_neon_i32(a), simde__m128i_to_neon_i32(b)); - return simde__m128i_from_neon_i32(vaddq_s32(t.val[0], t.val[1])); - #else - return simde_mm_add_epi32(simde_x_mm_deinterleaveeven_epi32(a, b), simde_x_mm_deinterleaveodd_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hadd_epi32(a, b) simde_mm_hadd_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_hadd_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_hadd_pi16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i16 = vpadd_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x4x2_t t = vuzp_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i16 = vadd_s16(t.val[0], t.val[1]); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = - SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 0, 2, 4, 6) + - SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 1, 3, 5, 7); - #else - r_.i16[0] = a_.i16[0] + a_.i16[1]; - r_.i16[1] = a_.i16[2] + a_.i16[3]; - r_.i16[2] = b_.i16[0] + b_.i16[1]; - r_.i16[3] = b_.i16[2] + b_.i16[3]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hadd_pi16(a, b) simde_mm_hadd_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_hadd_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_hadd_pi32(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_i32 = vpadd_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x2x2_t t = vuzp_s32(a_.neon_i32, b_.neon_i32); - r_.neon_i32 = vadd_s32(t.val[0], t.val[1]); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = - SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 0, 2) + - SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 1, 3); - #else - r_.i32[0] = a_.i32[0] + a_.i32[1]; - r_.i32[1] = b_.i32[0] + b_.i32[1]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hadd_pi32(a, b) simde_mm_hadd_pi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_hadds_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_hadds_epi16(a, b); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x8x2_t t = vuzpq_s16(simde__m128i_to_neon_i16(a), simde__m128i_to_neon_i16(b)); - return simde__m128i_from_neon_i16(vqaddq_s16(t.val[0], t.val[1])); - #else - return simde_mm_adds_epi16(simde_x_mm_deinterleaveeven_epi16(a, b), simde_x_mm_deinterleaveodd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hadds_epi16(a, b) simde_mm_hadds_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_hadds_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_hadds_pi16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x4x2_t t = vuzp_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i16 = vqadd_s16(t.val[0], t.val[1]); - #else - for (size_t i = 0 ; i < 
((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) { - int32_t ta = HEDLEY_STATIC_CAST(int32_t, a_.i16[i * 2]) + HEDLEY_STATIC_CAST(int32_t, a_.i16[(i * 2) + 1]); - r_.i16[ i ] = HEDLEY_LIKELY(ta > INT16_MIN) ? (HEDLEY_LIKELY(ta < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ta) : INT16_MAX) : INT16_MIN; - int32_t tb = HEDLEY_STATIC_CAST(int32_t, b_.i16[i * 2]) + HEDLEY_STATIC_CAST(int32_t, b_.i16[(i * 2) + 1]); - r_.i16[i + 2] = HEDLEY_LIKELY(tb > INT16_MIN) ? (HEDLEY_LIKELY(tb < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, tb) : INT16_MAX) : INT16_MIN; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hadds_pi16(a, b) simde_mm_hadds_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_hsub_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_hsub_epi16(a, b); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x8x2_t t = vuzpq_s16(simde__m128i_to_neon_i16(a), simde__m128i_to_neon_i16(b)); - return simde__m128i_from_neon_i16(vsubq_s16(t.val[0], t.val[1])); - #else - return simde_mm_sub_epi16(simde_x_mm_deinterleaveeven_epi16(a, b), simde_x_mm_deinterleaveodd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hsub_epi16(a, b) simde_mm_hsub_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_hsub_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_hsub_epi32(a, b); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x4x2_t t = vuzpq_s32(simde__m128i_to_neon_i32(a), simde__m128i_to_neon_i32(b)); - return simde__m128i_from_neon_i32(vsubq_s32(t.val[0], t.val[1])); - #else - return simde_mm_sub_epi32(simde_x_mm_deinterleaveeven_epi32(a, b), simde_x_mm_deinterleaveodd_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hsub_epi32(a, b) simde_mm_hsub_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_hsub_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_hsub_pi16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x4x2_t t = vuzp_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i16 = vsub_s16(t.val[0], t.val[1]); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) - r_.i16 = - SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 0, 2, 4, 6) - - SIMDE_SHUFFLE_VECTOR_(16, 8, a_.i16, b_.i16, 1, 3, 5, 7); - #else - r_.i16[0] = a_.i16[0] - a_.i16[1]; - r_.i16[1] = a_.i16[2] - a_.i16[3]; - r_.i16[2] = b_.i16[0] - b_.i16[1]; - r_.i16[3] = b_.i16[2] - b_.i16[3]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hsub_pi16(a, b) simde_mm_hsub_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_hsub_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_hsub_pi32(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int32x2x2_t t = vuzp_s32(a_.neon_i32, b_.neon_i32); - r_.neon_i32 = vsub_s32(t.val[0], t.val[1]); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) && defined(SIMDE_SHUFFLE_VECTOR_) - r_.i32 = - SIMDE_SHUFFLE_VECTOR_(32, 8, a_.i32, b_.i32, 0, 2) - - SIMDE_SHUFFLE_VECTOR_(32, 8, 
a_.i32, b_.i32, 1, 3); - #else - r_.i32[0] = a_.i32[0] - a_.i32[1]; - r_.i32[1] = b_.i32[0] - b_.i32[1]; - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hsub_pi32(a, b) simde_mm_hsub_pi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_hsubs_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_hsubs_epi16(a, b); - #elif defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x8x2_t t = vuzpq_s16(simde__m128i_to_neon_i16(a), simde__m128i_to_neon_i16(b)); - return simde__m128i_from_neon_i16(vqsubq_s16(t.val[0], t.val[1])); - #else - return simde_mm_subs_epi16(simde_x_mm_deinterleaveeven_epi16(a, b), simde_x_mm_deinterleaveodd_epi16(a, b)); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hsubs_epi16(a, b) simde_mm_hsubs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_hsubs_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_hsubs_pi16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - int16x4x2_t t = vuzp_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i16 = vqsub_s16(t.val[0], t.val[1]); - #else - for (size_t i = 0 ; i < ((sizeof(r_.i16) / sizeof(r_.i16[0])) / 2) ; i++) { - r_.i16[ i ] = simde_math_subs_i16(a_.i16[i * 2], a_.i16[(i * 2) + 1]); - r_.i16[i + 2] = simde_math_subs_i16(b_.i16[i * 2], b_.i16[(i * 2) + 1]); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_hsubs_pi16(a, b) simde_mm_hsubs_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maddubs_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_maddubs_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - /* Zero extend a */ - int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a_.neon_u16, 8)); - int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a_.neon_u16, vdupq_n_u16(0xff00))); - - /* Sign extend by shifting left then shifting right. */ - int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b_.neon_i16, 8), 8); - int16x8_t b_odd = vshrq_n_s16(b_.neon_i16, 8); - - /* multiply */ - int16x8_t prod1 = vmulq_s16(a_even, b_even); - int16x8_t prod2 = vmulq_s16(a_odd, b_odd); - - /* saturated add */ - r_.neon_i16 = vqaddq_s16(prod1, prod2); - #else - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - const int idx = HEDLEY_STATIC_CAST(int, i) << 1; - int32_t ts = - (HEDLEY_STATIC_CAST(int16_t, a_.u8[ idx ]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[ idx ])) + - (HEDLEY_STATIC_CAST(int16_t, a_.u8[idx + 1]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[idx + 1])); - r_.i16[i] = (ts > INT16_MIN) ? ((ts < INT16_MAX) ? 
HEDLEY_STATIC_CAST(int16_t, ts) : INT16_MAX) : INT16_MIN; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_maddubs_epi16(a, b) simde_mm_maddubs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_maddubs_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_maddubs_pi16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - int16x8_t ai = vreinterpretq_s16_u16(vmovl_u8(a_.neon_u8)); - int16x8_t bi = vmovl_s8(b_.neon_i8); - int16x8_t p = vmulq_s16(ai, bi); - int16x4_t l = vget_low_s16(p); - int16x4_t h = vget_high_s16(p); - r_.neon_i16 = vqadd_s16(vuzp1_s16(l, h), vuzp2_s16(l, h)); - #else - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - const int idx = HEDLEY_STATIC_CAST(int, i) << 1; - int32_t ts = - (HEDLEY_STATIC_CAST(int16_t, a_.u8[ idx ]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[ idx ])) + - (HEDLEY_STATIC_CAST(int16_t, a_.u8[idx + 1]) * HEDLEY_STATIC_CAST(int16_t, b_.i8[idx + 1])); - r_.i16[i] = (ts > INT16_MIN) ? ((ts < INT16_MAX) ? HEDLEY_STATIC_CAST(int16_t, ts) : INT16_MAX) : INT16_MIN; - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_maddubs_pi16(a, b) simde_mm_maddubs_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_mulhrs_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_mulhrs_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - /* Multiply */ - int32x4_t mul_lo = vmull_s16(vget_low_s16(a_.neon_i16), - vget_low_s16(b_.neon_i16)); - int32x4_t mul_hi = vmull_s16(vget_high_s16(a_.neon_i16), - vget_high_s16(b_.neon_i16)); - - /* Rounding narrowing shift right - * narrow = (int16_t)((mul + 16384) >> 15); */ - int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); - int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); - - /* Join together */ - r_.neon_i16 = vcombine_s16(narrow_lo, narrow_hi); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - v128_t __lo = wasm_i32x4_mul(wasm_i32x4_extend_low_i16x8(a_.wasm_v128), wasm_i32x4_extend_low_i16x8(b_.wasm_v128)); - v128_t __hi = wasm_i32x4_mul(wasm_i32x4_extend_high_i16x8(a_.wasm_v128), wasm_i32x4_extend_high_i16x8(b_.wasm_v128)); - const v128_t __inc = wasm_i32x4_splat(0x4000); - __lo = wasm_i32x4_add(__lo, __inc); - __hi = wasm_i32x4_add(__hi, __inc); - __lo = wasm_i32x4_add(__lo, __lo); - __hi = wasm_i32x4_add(__hi, __hi); - r_.wasm_v128 = wasm_i16x8_shuffle(__lo, __hi, 1, 3, 5, 7, 9, 11, 13, 15); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (((HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) + 0x4000) >> 15)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_mulhrs_epi16(a, b) simde_mm_mulhrs_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_mulhrs_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_mulhrs_pi16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - 
- #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - /* Multiply */ - int32x4_t mul = vmull_s16(a_.neon_i16, b_.neon_i16); - - /* Rounding narrowing shift right - * narrow = (int16_t)((mul + 16384) >> 15); */ - int16x4_t narrow = vrshrn_n_s32(mul, 15); - - /* Join together */ - r_.neon_i16 = narrow; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, (((HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i])) + 0x4000) >> 15)); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_mulhrs_pi16(a, b) simde_mm_mulhrs_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sign_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_sign_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint8x16_t aneg_mask = vreinterpretq_u8_s8(vshrq_n_s8(b_.neon_i8, 7)); - uint8x16_t bnz_mask; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - bnz_mask = vceqzq_s8(b_.neon_i8); - #else - bnz_mask = vceqq_s8(b_.neon_i8, vdupq_n_s8(0)); - #endif - bnz_mask = vmvnq_u8(bnz_mask); - - r_.neon_i8 = vbslq_s8(aneg_mask, vnegq_s8(a_.neon_i8), vandq_s8(a_.neon_i8, vreinterpretq_s8_u8(bnz_mask))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - simde__m128i mask = wasm_i8x16_shr(b_.wasm_v128, 7); - simde__m128i zeromask = simde_mm_cmpeq_epi8(b_.wasm_v128, simde_mm_setzero_si128()); - r_.wasm_v128 = simde_mm_andnot_si128(zeromask, simde_mm_xor_si128(simde_mm_add_epi8(a_.wasm_v128, mask), mask)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (b_.i8[i] < 0) ? (- a_.i8[i]) : ((b_.i8[i] != 0) ? (a_.i8[i]) : INT8_C(0)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_sign_epi8(a, b) simde_mm_sign_epi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sign_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_sign_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint16x8_t aneg_mask = vreinterpretq_u16_s16(vshrq_n_s16(b_.neon_i16, 15)); - uint16x8_t bnz_mask; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - bnz_mask = vceqzq_s16(b_.neon_i16); - #else - bnz_mask = vceqq_s16(b_.neon_i16, vdupq_n_s16(0)); - #endif - bnz_mask = vmvnq_u16(bnz_mask); - - r_.neon_i16 = vbslq_s16(aneg_mask, vnegq_s16(a_.neon_i16), vandq_s16(a_.neon_i16, vreinterpretq_s16_u16(bnz_mask))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - simde__m128i mask = simde_mm_srai_epi16(b_.wasm_v128, 15); - simde__m128i zeromask = simde_mm_cmpeq_epi16(b_.wasm_v128, simde_mm_setzero_si128()); - r_.wasm_v128 = simde_mm_andnot_si128(zeromask, simde_mm_xor_si128(simde_mm_add_epi16(a_.wasm_v128, mask), mask)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (b_.i16[i] < 0) ? (- a_.i16[i]) : ((b_.i16[i] != 0) ? 
(a_.i16[i]) : INT16_C(0)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_sign_epi16(a, b) simde_mm_sign_epi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sign_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_sign_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x4_t aneg_mask = vreinterpretq_u32_s32(vshrq_n_s32(b_.neon_i32, 31)); - uint32x4_t bnz_mask; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - bnz_mask = vceqzq_s32(b_.neon_i32); - #else - bnz_mask = vceqq_s32(b_.neon_i32, vdupq_n_s32(0)); - #endif - bnz_mask = vmvnq_u32(bnz_mask); - - r_.neon_i32 = vbslq_s32(aneg_mask, vnegq_s32(a_.neon_i32), vandq_s32(a_.neon_i32, vreinterpretq_s32_u32(bnz_mask))); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - simde__m128i mask = simde_mm_srai_epi32(b_.wasm_v128, 31); - simde__m128i zeromask = simde_mm_cmpeq_epi32(b_.wasm_v128, simde_mm_setzero_si128()); - r_.wasm_v128 = simde_mm_andnot_si128(zeromask, simde_mm_xor_si128(simde_mm_add_epi32(a_.wasm_v128, mask), mask)); - #else - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (b_.i32[i] < 0) ? (- a_.i32[i]) : ((b_.i32[i] != 0) ? (a_.i32[i]) : INT32_C(0)); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_sign_epi32(a, b) simde_mm_sign_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sign_pi8 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_sign_pi8(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint8x8_t aneg_mask = vreinterpret_u8_s8(vshr_n_s8(b_.neon_i8, 7)); - uint8x8_t bnz_mask; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - bnz_mask = vceqz_s8(b_.neon_i8); - #else - bnz_mask = vceq_s8(b_.neon_i8, vdup_n_s8(0)); - #endif - bnz_mask = vmvn_u8(bnz_mask); - - r_.neon_i8 = vbsl_s8(aneg_mask, vneg_s8(a_.neon_i8), vand_s8(a_.neon_i8, vreinterpret_s8_u8(bnz_mask))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (b_.i8[i] < 0) ? (- a_.i8[i]) : ((b_.i8[i] != 0) ? (a_.i8[i]) : INT8_C(0)); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_sign_pi8(a, b) simde_mm_sign_pi8(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sign_pi16 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_sign_pi16(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint16x4_t aneg_mask = vreinterpret_u16_s16(vshr_n_s16(b_.neon_i16, 15)); - uint16x4_t bnz_mask; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - bnz_mask = vceqz_s16(b_.neon_i16); - #else - bnz_mask = vceq_s16(b_.neon_i16, vdup_n_s16(0)); - #endif - bnz_mask = vmvn_u16(bnz_mask); - - r_.neon_i16 = vbsl_s16(aneg_mask, vneg_s16(a_.neon_i16), vand_s16(a_.neon_i16, vreinterpret_s16_u16(bnz_mask))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (b_.i16[i] < 0) ? 
(- a_.i16[i]) : ((b_.i16[i] > 0) ? (a_.i16[i]) : INT16_C(0)); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_sign_pi16(a, b) simde_mm_sign_pi16(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m64 -simde_mm_sign_pi32 (simde__m64 a, simde__m64 b) { - #if defined(SIMDE_X86_SSSE3_NATIVE) && defined(SIMDE_X86_MMX_NATIVE) - return _mm_sign_pi32(a, b); - #else - simde__m64_private - r_, - a_ = simde__m64_to_private(a), - b_ = simde__m64_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - uint32x2_t aneg_mask = vreinterpret_u32_s32(vshr_n_s32(b_.neon_i32, 31)); - uint32x2_t bnz_mask; - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - bnz_mask = vceqz_s32(b_.neon_i32); - #else - bnz_mask = vceq_s32(b_.neon_i32, vdup_n_s32(0)); - #endif - bnz_mask = vmvn_u32(bnz_mask); - - r_.neon_i32 = vbsl_s32(aneg_mask, vneg_s32(a_.neon_i32), vand_s32(a_.neon_i32, vreinterpret_s32_u32(bnz_mask))); - #else - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (b_.i32[i] < 0) ? (- a_.i32[i]) : ((b_.i32[i] > 0) ? (a_.i32[i]) : INT32_C(0)); - } - #endif - - return simde__m64_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SSSE3_ENABLE_NATIVE_ALIASES) -# define _mm_sign_pi32(a, b) simde_mm_sign_pi32(a, b) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_SSE2_H) */ diff --git a/ffi-deps/simde/simde/x86/svml.h b/ffi-deps/simde/simde/x86/svml.h deleted file mode 100644 index 3a588c1..0000000 --- a/ffi-deps/simde/simde/x86/svml.h +++ /dev/null @@ -1,12129 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - * 2020 Himanshi Mathur - */ - -#if !defined(SIMDE_X86_SVML_H) -#define SIMDE_X86_SVML_H - -#include "fma.h" -#include "avx2.h" -#include "avx512/abs.h" -#include "avx512/add.h" -#include "avx512/cmp.h" -#include "avx512/copysign.h" -#include "avx512/xorsign.h" -#include "avx512/div.h" -#include "avx512/fmadd.h" -#include "avx512/mov.h" -#include "avx512/mul.h" -#include "avx512/negate.h" -#include "avx512/or.h" -#include "avx512/set1.h" -#include "avx512/setone.h" -#include "avx512/setzero.h" -#include "avx512/sqrt.h" -#include "avx512/sub.h" - -#include "../simde-complex.h" - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_acos_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_acos_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_acosf4_u10(a); - #else - return Sleef_acosf4_u35(a); - #endif - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_acosf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_acos_ps - #define _mm_acos_ps(a) simde_mm_acos_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_acos_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_acos_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_acosd2_u10(a); - #else - return Sleef_acosd2_u35(a); - #endif - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_acos(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_acos_pd - #define _mm_acos_pd(a) simde_mm_acos_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_acos_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_acos_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_acosf8_u10(a); - #else - return Sleef_acosf8_u35(a); - #endif - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_acos_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_acosf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_acos_ps - #define _mm256_acos_ps(a) simde_mm256_acos_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_acos_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_acos_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_acosd4_u10(a); - #else - return Sleef_acosd4_u35(a); - #endif - #else - simde__m256d_private - r_, 
- a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_acos_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_acos(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_acos_pd - #define _mm256_acos_pd(a) simde_mm256_acos_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_acos_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_acos_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_acosf16_u10(a); - #else - return Sleef_acosf16_u35(a); - #endif - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_acos_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_acosf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_acos_ps - #define _mm512_acos_ps(a) simde_mm512_acos_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_acos_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_acos_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_acosd8_u10(a); - #else - return Sleef_acosd8_u35(a); - #endif - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_acos_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_acos(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_acos_pd - #define _mm512_acos_pd(a) simde_mm512_acos_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_acos_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_acos_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_acos_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_acos_ps - #define _mm512_mask_acos_ps(src, k, a) simde_mm512_mask_acos_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_acos_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_acos_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_acos_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_acos_pd - #define _mm512_mask_acos_pd(src, k, a) simde_mm512_mask_acos_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_acosh_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && 
defined(SIMDE_X86_SSE_NATIVE) - return _mm_acosh_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_acoshf4_u10(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_acoshf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_acosh_ps - #define _mm_acosh_ps(a) simde_mm_acosh_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_acosh_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_acosh_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_acoshd2_u10(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_acosh(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_acosh_pd - #define _mm_acosh_pd(a) simde_mm_acosh_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_acosh_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_acosh_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_acoshf8_u10(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_acosh_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_acoshf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_acosh_ps - #define _mm256_acosh_ps(a) simde_mm256_acosh_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_acosh_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_acosh_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_acoshd4_u10(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_acosh_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_acosh(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_acosh_pd - #define _mm256_acosh_pd(a) simde_mm256_acosh_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_acosh_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_acosh_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_acoshf16_u10(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_acosh_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i 
< (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_acoshf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_acosh_ps - #define _mm512_acosh_ps(a) simde_mm512_acosh_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_acosh_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_acosh_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_acoshd8_u10(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_acosh_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_acosh(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_acosh_pd - #define _mm512_acosh_pd(a) simde_mm512_acosh_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_acosh_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_acosh_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_acosh_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_acosh_ps - #define _mm512_mask_acosh_ps(src, k, a) simde_mm512_mask_acosh_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_acosh_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_acosh_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_acosh_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_acosh_pd - #define _mm512_mask_acosh_pd(src, k, a) simde_mm512_mask_acosh_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_asin_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_asin_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_asinf4_u10(a); - #else - return Sleef_asinf4_u35(a); - #endif - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_asinf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_asin_ps - #define _mm_asin_ps(a) simde_mm_asin_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_asin_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_asin_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_asind2_u10(a); - #else - return Sleef_asind2_u35(a); - #endif - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_asin(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if 
defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_asin_pd - #define _mm_asin_pd(a) simde_mm_asin_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_asin_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_asin_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_asinf8_u10(a); - #else - return Sleef_asinf8_u35(a); - #endif - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_asin_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_asinf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_asin_ps - #define _mm256_asin_ps(a) simde_mm256_asin_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_asin_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_asin_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_asind4_u10(a); - #else - return Sleef_asind4_u35(a); - #endif - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_asin_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_asin(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_asin_pd - #define _mm256_asin_pd(a) simde_mm256_asin_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_asin_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_asin_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_asinf16_u10(a); - #else - return Sleef_asinf16_u35(a); - #endif - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_asin_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_asinf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_asin_ps - #define _mm512_asin_ps(a) simde_mm512_asin_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_asin_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_asin_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_asind8_u10(a); - #else - return Sleef_asind8_u35(a); - #endif - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = 
simde_mm256_asin_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_asin(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_asin_pd - #define _mm512_asin_pd(a) simde_mm512_asin_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_asin_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_asin_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_asin_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_asin_ps - #define _mm512_mask_asin_ps(src, k, a) simde_mm512_mask_asin_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_asin_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_asin_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_asin_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_asin_pd - #define _mm512_mask_asin_pd(src, k, a) simde_mm512_mask_asin_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_asinh_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_asinh_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_asinhf4_u10(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_asinhf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_asinh_ps - #define _mm_asinh_ps(a) simde_mm_asinh_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_asinh_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_asinh_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_asinhd2_u10(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_asinh(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_asinh_pd - #define _mm_asinh_pd(a) simde_mm_asinh_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_asinh_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_asinh_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_asinhf8_u10(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_asinh_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_asinhf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_asinh_ps - #define _mm256_asinh_ps(a) 
simde_mm256_asinh_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_asinh_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_asinh_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_asinhd4_u10(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_asinh_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_asinh(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_asinh_pd - #define _mm256_asinh_pd(a) simde_mm256_asinh_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_asinh_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_asinh_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_asinhf16_u10(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_asinh_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_asinhf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_asinh_ps - #define _mm512_asinh_ps(a) simde_mm512_asinh_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_asinh_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_asinh_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_asinhd8_u10(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_asinh_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_asinh(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_asinh_pd - #define _mm512_asinh_pd(a) simde_mm512_asinh_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_asinh_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_asinh_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_asinh_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_asinh_ps - #define _mm512_mask_asinh_ps(src, k, a) simde_mm512_mask_asinh_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_asinh_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_asinh_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_asinh_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef 
_mm512_mask_asinh_pd - #define _mm512_mask_asinh_pd(src, k, a) simde_mm512_mask_asinh_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_atan_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_atan_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_atanf4_u10(a); - #else - return Sleef_atanf4_u35(a); - #endif - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_atanf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_atan_ps - #define _mm_atan_ps(a) simde_mm_atan_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_atan_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_atan_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_atand2_u10(a); - #else - return Sleef_atand2_u35(a); - #endif - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_atan(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_atan_pd - #define _mm_atan_pd(a) simde_mm_atan_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_atan_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_atan_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_atanf8_u10(a); - #else - return Sleef_atanf8_u35(a); - #endif - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_atan_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_atanf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_atan_ps - #define _mm256_atan_ps(a) simde_mm256_atan_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_atan_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_atan_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_atand4_u10(a); - #else - return Sleef_atand4_u35(a); - #endif - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_atan_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_atan(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_atan_pd - #define _mm256_atan_pd(a) simde_mm256_atan_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 
-simde_mm512_atan_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_atan_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_atanf16_u10(a); - #else - return Sleef_atanf16_u35(a); - #endif - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_atan_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_atanf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_atan_ps - #define _mm512_atan_ps(a) simde_mm512_atan_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_atan_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_atan_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_atand8_u10(a); - #else - return Sleef_atand8_u35(a); - #endif - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_atan_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_atan(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_atan_pd - #define _mm512_atan_pd(a) simde_mm512_atan_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_atan_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_atan_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_atan_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_atan_ps - #define _mm512_mask_atan_ps(src, k, a) simde_mm512_mask_atan_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_atan_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_atan_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_atan_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_atan_pd - #define _mm512_mask_atan_pd(src, k, a) simde_mm512_mask_atan_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_atan2_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_atan2_ps(a, b); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_atan2f4_u10(a, b); - #else - return Sleef_atan2f4_u35(a, b); - #endif - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_atan2f(a_.f32[i], b_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if 
defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_atan2_ps - #define _mm_atan2_ps(a, b) simde_mm_atan2_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_atan2_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_atan2_pd(a, b); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_atan2d2_u10(a, b); - #else - return Sleef_atan2d2_u35(a, b); - #endif - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_atan2(a_.f64[i], b_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_atan2_pd - #define _mm_atan2_pd(a, b) simde_mm_atan2_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_atan2_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_atan2_ps(a, b); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_atan2f8_u10(a, b); - #else - return Sleef_atan2f8_u35(a, b); - #endif - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_atan2_ps(a_.m128[i], b_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_atan2f(a_.f32[i], b_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_atan2_ps - #define _mm256_atan2_ps(a, b) simde_mm256_atan2_ps(a, b) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_atan2_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_atan2_pd(a, b); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_atan2d4_u10(a, b); - #else - return Sleef_atan2d4_u35(a, b); - #endif - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_atan2_pd(a_.m128d[i], b_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_atan2(a_.f64[i], b_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_atan2_pd - #define _mm256_atan2_pd(a, b) simde_mm256_atan2_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_atan2_ps (simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_atan2_ps(a, b); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_atan2f16_u10(a, b); - #else - return Sleef_atan2f16_u35(a, b); - #endif - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = 
simde__m512_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_atan2_ps(a_.m256[i], b_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_atan2f(a_.f32[i], b_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_atan2_ps - #define _mm512_atan2_ps(a, b) simde_mm512_atan2_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_atan2_pd (simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_atan2_pd(a, b); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_atan2d8_u10(a, b); - #else - return Sleef_atan2d8_u35(a, b); - #endif - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_atan2_pd(a_.m256d[i], b_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_atan2(a_.f64[i], b_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_atan2_pd - #define _mm512_atan2_pd(a, b) simde_mm512_atan2_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_atan2_ps(simde__m512 src, simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_atan2_ps(src, k, a, b); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_atan2_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_atan2_ps - #define _mm512_mask_atan2_ps(src, k, a, b) simde_mm512_mask_atan2_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_atan2_pd(simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_atan2_pd(src, k, a, b); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_atan2_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_atan2_pd - #define _mm512_mask_atan2_pd(src, k, a, b) simde_mm512_mask_atan2_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_atanh_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_atanh_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_atanhf4_u10(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_atanhf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_atanh_ps - #define _mm_atanh_ps(a) simde_mm_atanh_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_atanh_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_atanh_pd(a); - #elif 
defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_atanhd2_u10(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_atanh(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_atanh_pd - #define _mm_atanh_pd(a) simde_mm_atanh_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_atanh_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_atanh_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_atanhf8_u10(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_atanh_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_atanhf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_atanh_ps - #define _mm256_atanh_ps(a) simde_mm256_atanh_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_atanh_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_atanh_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_atanhd4_u10(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_atanh_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_atanh(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_atanh_pd - #define _mm256_atanh_pd(a) simde_mm256_atanh_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_atanh_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_atanh_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_atanhf16_u10(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_atanh_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_atanhf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_atanh_ps - #define _mm512_atanh_ps(a) simde_mm512_atanh_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_atanh_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_atanh_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_atanhd8_u10(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < 
(sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_atanh_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_atanh(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_atanh_pd - #define _mm512_atanh_pd(a) simde_mm512_atanh_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_atanh_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_atanh_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_atanh_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_atanh_ps - #define _mm512_mask_atanh_ps(src, k, a) simde_mm512_mask_atanh_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_atanh_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_atanh_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_atanh_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_atanh_pd - #define _mm512_mask_atanh_pd(src, k, a) simde_mm512_mask_atanh_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cbrt_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_cbrt_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_cbrtf4_u10(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_cbrtf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_cbrt_ps - #define _mm_cbrt_ps(a) simde_mm_cbrt_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cbrt_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_cbrt_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_cbrtd2_u10(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_cbrt(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_cbrt_pd - #define _mm_cbrt_pd(a) simde_mm_cbrt_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_cbrt_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cbrt_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_cbrtf8_u10(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_cbrt_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_cbrtf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef 
_mm256_cbrt_ps - #define _mm256_cbrt_ps(a) simde_mm256_cbrt_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_cbrt_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cbrt_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_cbrtd4_u10(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_cbrt_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_cbrt(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_cbrt_pd - #define _mm256_cbrt_pd(a) simde_mm256_cbrt_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_cbrt_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cbrt_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_cbrtf16_u10(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_cbrt_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_cbrtf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_cbrt_ps - #define _mm512_cbrt_ps(a) simde_mm512_cbrt_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_cbrt_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cbrt_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_cbrtd8_u10(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_cbrt_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_cbrt(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_cbrt_pd - #define _mm512_cbrt_pd(a) simde_mm512_cbrt_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_cbrt_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cbrt_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_cbrt_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cbrt_ps - #define _mm512_mask_cbrt_ps(src, k, a) simde_mm512_mask_cbrt_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_cbrt_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cbrt_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_cbrt_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - 
#undef _mm512_mask_cbrt_pd - #define _mm512_mask_cbrt_pd(src, k, a) simde_mm512_mask_cbrt_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cexp_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_cexp_ps(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i+=2) { - simde_cfloat32 val = simde_math_cexpf(SIMDE_MATH_CMPLXF(a_.f32[i], a_.f32[i+1])); - r_.f32[ i ] = simde_math_crealf(val); - r_.f32[i + 1] = simde_math_cimagf(val); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_cexp_ps - #define _mm_cexp_ps(a) simde_mm_cexp_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_cexp_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cexp_ps(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i+=2) { - simde_cfloat32 val = simde_math_cexpf(SIMDE_MATH_CMPLXF(a_.f32[i], a_.f32[i+1])); - r_.f32[ i ] = simde_math_crealf(val); - r_.f32[i + 1] = simde_math_cimagf(val); - } - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_cexp_ps - #define _mm256_cexp_ps(a) simde_mm256_cexp_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cos_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_cos_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_cosf4_u10(a); - #else - return Sleef_cosf4_u35(a); - #endif - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_cosf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_cos_ps - #define _mm_cos_ps(a) simde_mm_cos_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cos_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_cos_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_cosd2_u10(a); - #else - return Sleef_cosd2_u35(a); - #endif - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_cos(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_cos_pd - #define _mm_cos_pd(a) simde_mm_cos_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_cos_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cos_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_cosf8_u10(a); - #else - return Sleef_cosf8_u35(a); - #endif - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = 
simde_mm_cos_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_cosf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_cos_ps - #define _mm256_cos_ps(a) simde_mm256_cos_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_cos_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cos_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_cosd4_u10(a); - #else - return Sleef_cosd4_u35(a); - #endif - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_cos_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_cos(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_cos_pd - #define _mm256_cos_pd(a) simde_mm256_cos_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_cos_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cos_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_cosf16_u10(a); - #else - return Sleef_cosf16_u35(a); - #endif - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_cos_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_cosf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_cos_ps - #define _mm512_cos_ps(a) simde_mm512_cos_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_cos_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cos_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_cosd8_u10(a); - #else - return Sleef_cosd8_u35(a); - #endif - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_cos_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_cos(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_cos_pd - #define _mm512_cos_pd(a) simde_mm512_cos_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_cos_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cos_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_cos_ps(a)); - #endif -} -#if 
defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cos_ps - #define _mm512_mask_cos_ps(src, k, a) simde_mm512_mask_cos_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_cos_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cos_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_cos_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cos_pd - #define _mm512_mask_cos_pd(src, k, a) simde_mm512_mask_cos_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_x_mm_deg2rad_ps(simde__m128 a) { - #if SIMDE_NATURAL_VECTOR_SIZE_GE(128) - return simde_mm_mul_ps(a, simde_mm_set1_ps(SIMDE_MATH_PI_OVER_180F)); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_f32 = vmulq_n_f32(a_.neon_i32, SIMDE_MATH_PI_OVER_180F); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) - r_.f32 = a_.f32 * SIMDE_MATH_PI_OVER_180F; - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - const __typeof__(r_.f32) tmp = { SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F }; - r_.f32 = a_.f32 * tmp; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_deg2radf(a_.f32[i]); - } - - #endif - return simde__m128_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_x_mm_deg2rad_pd(simde__m128d a) { - #if SIMDE_NATURAL_VECTOR_SIZE_GE(128) - return simde_mm_mul_pd(a, simde_mm_set1_pd(SIMDE_MATH_PI_OVER_180)); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_f64 = vmulq_n_f64(a_.neon_i64, SIMDE_MATH_PI_OVER_180); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) - r_.f64 = a_.f64 * SIMDE_MATH_PI_OVER_180; - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - const __typeof__(r_.f64) tmp = { SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180 }; - r_.f64 = a_.f64 * tmp; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_deg2rad(a_.f64[i]); - } - - #endif - return simde__m128d_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_x_mm256_deg2rad_ps(simde__m256 a) { - #if SIMDE_NATURAL_VECTOR_SIZE_GE(256) - return simde_mm256_mul_ps(a, simde_mm256_set1_ps(SIMDE_MATH_PI_OVER_180F)); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_x_mm_deg2rad_ps(a_.m128[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) - r_.f32 = a_.f32 * SIMDE_MATH_PI_OVER_180F; - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - const __typeof__(r_.f32) tmp = { - SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, - SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F - }; - r_.f32 = a_.f32 * tmp; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_deg2radf(a_.f32[i]); - } - - #endif - return simde__m256_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d 
-simde_x_mm256_deg2rad_pd(simde__m256d a) { - #if SIMDE_NATURAL_VECTOR_SIZE_GE(256) - return simde_mm256_mul_pd(a, simde_mm256_set1_pd(SIMDE_MATH_PI_OVER_180)); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_x_mm_deg2rad_pd(a_.m128d[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) - r_.f64 = a_.f64 * SIMDE_MATH_PI_OVER_180; - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - const __typeof__(r_.f64) tmp = { SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180 }; - r_.f64 = a_.f64 * tmp; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_deg2rad(a_.f64[i]); - } - - #endif - return simde__m256d_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_x_mm512_deg2rad_ps(simde__m512 a) { - #if SIMDE_NATURAL_VECTOR_SIZE_GE(512) - return simde_mm512_mul_ps(a, simde_mm512_set1_ps(SIMDE_MATH_PI_OVER_180F)); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_x_mm256_deg2rad_ps(a_.m256[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) - r_.f32 = a_.f32 * SIMDE_MATH_PI_OVER_180F; - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - const __typeof__(r_.f32) tmp = { - SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, - SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, - SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, - SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F, SIMDE_MATH_PI_OVER_180F - }; - r_.f32 = a_.f32 * tmp; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_deg2radf(a_.f32[i]); - } - - #endif - return simde__m512_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_x_mm512_deg2rad_pd(simde__m512d a) { - #if SIMDE_NATURAL_VECTOR_SIZE_GE(512) - return simde_mm512_mul_pd(a, simde_mm512_set1_pd(SIMDE_MATH_PI_OVER_180)); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_x_mm256_deg2rad_pd(a_.m256d[i]); - } - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) && !defined(SIMDE_BUG_GCC_53784) - r_.f64 = a_.f64 * SIMDE_MATH_PI_OVER_180; - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - const __typeof__(r_.f64) tmp = { - SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180, - SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180, SIMDE_MATH_PI_OVER_180 - }; - r_.f64 = a_.f64 * tmp; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_deg2rad(a_.f64[i]); - } - - #endif - return simde__m512d_from_private(r_); - #endif -} - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cosd_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_cosd_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && 
defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_cosf4_u10(simde_x_mm_deg2rad_ps(a)); - #else - return Sleef_cosf4_u35(simde_x_mm_deg2rad_ps(a)); - #endif - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_cosf(simde_math_deg2radf(a_.f32[i])); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_cosd_ps - #define _mm_cosd_ps(a) simde_mm_cosd_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cosd_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_cosd_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_cosd2_u10(simde_x_mm_deg2rad_pd(a)); - #else - return Sleef_cosd2_u35(simde_x_mm_deg2rad_pd(a)); - #endif - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_cos(simde_math_deg2rad(a_.f64[i])); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_cosd_pd - #define _mm_cosd_pd(a) simde_mm_cosd_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_cosd_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cosd_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_cosf8_u10(simde_x_mm256_deg2rad_ps(a)); - #else - return Sleef_cosf8_u35(simde_x_mm256_deg2rad_ps(a)); - #endif - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_cosd_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_cosf(simde_math_deg2radf(a_.f32[i])); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_cosd_ps - #define _mm256_cosd_ps(a) simde_mm256_cosd_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_cosd_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cosd_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_cosd4_u10(simde_x_mm256_deg2rad_pd(a)); - #else - return Sleef_cosd4_u35(simde_x_mm256_deg2rad_pd(a)); - #endif - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_cosd_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_cos(simde_math_deg2rad(a_.f64[i])); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_cosd_pd - #define _mm256_cosd_pd(a) simde_mm256_cosd_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_cosd_ps (simde__m512 a) { - #if 
defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cosd_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_cosf16_u10(simde_x_mm512_deg2rad_ps(a)); - #else - return Sleef_cosf16_u35(simde_x_mm512_deg2rad_ps(a)); - #endif - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_cosd_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_cosf(simde_math_deg2radf(a_.f32[i])); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_cosd_ps - #define _mm512_cosd_ps(a) simde_mm512_cosd_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_cosd_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cosd_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_cosd8_u10(simde_x_mm512_deg2rad_pd(a)); - #else - return Sleef_cosd8_u35(simde_x_mm512_deg2rad_pd(a)); - #endif - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_cosd_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_cos(simde_math_deg2rad(a_.f64[i])); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_cosd_pd - #define _mm512_cosd_pd(a) simde_mm512_cosd_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_cosd_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cosd_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_cosd_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cosd_ps - #define _mm512_mask_cosd_ps(src, k, a) simde_mm512_mask_cosd_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_cosd_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cosd_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_cosd_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cosd_pd - #define _mm512_mask_cosd_pd(src, k, a) simde_mm512_mask_cosd_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cosh_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_cosh_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_coshf4_u10(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_coshf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_cosh_ps - 
#define _mm_cosh_ps(a) simde_mm_cosh_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cosh_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_cosh_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_coshd2_u10(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_cosh(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_cosh_pd - #define _mm_cosh_pd(a) simde_mm_cosh_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_cosh_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cosh_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_coshf8_u10(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_cosh_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_coshf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_cosh_ps - #define _mm256_cosh_ps(a) simde_mm256_cosh_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_cosh_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cosh_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_coshd4_u10(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_cosh_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_cosh(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_cosh_pd - #define _mm256_cosh_pd(a) simde_mm256_cosh_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_cosh_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cosh_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_coshf16_u10(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_cosh_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_coshf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_cosh_ps - #define _mm512_cosh_ps(a) simde_mm512_cosh_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_cosh_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cosh_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && 
defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_coshd8_u10(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_cosh_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_cosh(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_cosh_pd - #define _mm512_cosh_pd(a) simde_mm512_cosh_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_cosh_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cosh_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_cosh_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cosh_ps - #define _mm512_mask_cosh_ps(src, k, a) simde_mm512_mask_cosh_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_cosh_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cosh_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_cosh_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cosh_pd - #define _mm512_mask_cosh_pd(src, k, a) simde_mm512_mask_cosh_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_div_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) - return _mm_div_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = a_.i8 / b_.i8; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i8x4_div(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] / b_.i8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_div_epi8 - #define _mm_div_epi8(a, b) simde_mm_div_epi8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_div_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) - return _mm_div_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 / b_.i16; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x4_div(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] / b_.i16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_div_epi16 - #define _mm_div_epi16(a, b) simde_mm_div_epi16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_div_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) - return _mm_div_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = 
simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 / b_.i32; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_div(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] / b_.i32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#define simde_mm_idiv_epi32(a, b) simde_mm_div_epi32(a, b) -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_div_epi32 - #define _mm_div_epi32(a, b) simde_mm_div_epi32(a, b) - #undef _mm_idiv_epi32 - #define _mm_idiv_epi32(a, b) simde_mm_div_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_div_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) - return _mm_div_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 / b_.i64; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i64x4_div(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] / b_.i64[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_div_epi64 - #define _mm_div_epi64(a, b) simde_mm_div_epi64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_div_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) - return _mm_div_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u8 = a_.u8 / b_.u8; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u8x16_div(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = a_.u8[i] / b_.u8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_div_epu8 - #define _mm_div_epu8(a, b) simde_mm_div_epu8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_div_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) - return _mm_div_epu16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = a_.u16 / b_.u16; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x16_div(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = a_.u16[i] / b_.u16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_div_epu16 - #define _mm_div_epu16(a, b) simde_mm_div_epu16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_div_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) - return _mm_div_epu32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = a_.u32 / b_.u32; - #elif 
defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u32x16_div(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] / b_.u32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#define simde_mm_udiv_epi32(a, b) simde_mm_div_epu32(a, b) -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_div_epu32 - #define _mm_div_epu32(a, b) simde_mm_div_epu32(a, b) - #undef _mm_udiv_epi32 - #define _mm_udiv_epi32(a, b) simde_mm_div_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_div_epu64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) - return _mm_div_epu64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = a_.u64 / b_.u64; - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u64x16_div(a_.wasm_v128, b_.wasm_v128); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] / b_.u64[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_div_epu64 - #define _mm_div_epu64(a, b) simde_mm_div_epu64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_div_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_div_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = a_.i8 / b_.i8; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_div_epi8(a_.m128i[i], b_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] / b_.i8[i]; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_div_epi8 - #define _mm256_div_epi8(a, b) simde_mm256_div_epi8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_div_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_div_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 / b_.i16; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_div_epi16(a_.m128i[i], b_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] / b_.i16[i]; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_div_epi16 - #define _mm256_div_epi16(a, b) simde_mm256_div_epi16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_div_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_div_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = 
simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 / b_.i32; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_div_epi32(a_.m128i[i], b_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] / b_.i32[i]; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#define simde_mm256_idiv_epi32(a, b) simde_mm256_div_epi32(a, b) -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_div_epi32 - #define _mm256_div_epi32(a, b) simde_mm256_div_epi32(a, b) - #undef _mm256_idiv_epi32 - #define _mm256_idiv_epi32(a, b) simde_mm256_div_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_div_epi64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_div_epi64(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 / b_.i64; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_div_epi64(a_.m128i[i], b_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] / b_.i64[i]; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_div_epi64 - #define _mm256_div_epi64(a, b) simde_mm256_div_epi64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_div_epu8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_div_epu8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u8 = a_.u8 / b_.u8; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_div_epu8(a_.m128i[i], b_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = a_.u8[i] / b_.u8[i]; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_div_epu8 - #define _mm256_div_epu8(a, b) simde_mm256_div_epu8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_div_epu16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_div_epu16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = a_.u16 / b_.u16; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_div_epu16(a_.m128i[i], b_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = a_.u16[i] / b_.u16[i]; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_div_epu16 - #define _mm256_div_epu16(a, b) 
simde_mm256_div_epu16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_div_epu32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_div_epu32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = a_.u32 / b_.u32; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_div_epu32(a_.m128i[i], b_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] / b_.u32[i]; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#define simde_mm256_udiv_epi32(a, b) simde_mm256_div_epu32(a, b) -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_div_epu32 - #define _mm256_div_epu32(a, b) simde_mm256_div_epu32(a, b) - #undef _mm256_udiv_epi32 - #define _mm256_udiv_epi32(a, b) simde_mm256_div_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_div_epu64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_div_epu64(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = a_.u64 / b_.u64; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_div_epu64(a_.m128i[i], b_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] / b_.u64[i]; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_div_epu64 - #define _mm256_div_epu64(a, b) simde_mm256_div_epu64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_div_epi8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_div_epi8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = a_.i8 / b_.i8; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_div_epi8(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] / b_.i8[i]; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_div_epi8 - #define _mm512_div_epi8(a, b) simde_mm512_div_epi8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_div_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_div_epi16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = a_.i16 / b_.i16; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_div_epi16(a_.m256i[i], 
b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] / b_.i16[i]; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_div_epi16 - #define _mm512_div_epi16(a, b) simde_mm512_div_epi16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_div_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_div_epi32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = a_.i32 / b_.i32; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_div_epi32(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] / b_.i32[i]; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_div_epi32 - #define _mm512_div_epi32(a, b) simde_mm512_div_epi32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_div_epi32(simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_div_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_div_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_div_epi32 - #define _mm512_mask_div_epi32(src, k, a, b) simde_mm512_mask_div_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_div_epi64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_div_epi64(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = a_.i64 / b_.i64; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_div_epi64(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] / b_.i64[i]; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_div_epi64 - #define _mm512_div_epi64(a, b) simde_mm512_div_epi64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_div_epu8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_div_epu8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u8 = a_.u8 / b_.u8; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_div_epu8(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = a_.u8[i] / b_.u8[i]; - } - #endif - #endif - - return 
simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_div_epu8 - #define _mm512_div_epu8(a, b) simde_mm512_div_epu8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_div_epu16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_div_epu16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = a_.u16 / b_.u16; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_div_epu16(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = a_.u16[i] / b_.u16[i]; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_div_epu16 - #define _mm512_div_epu16(a, b) simde_mm512_div_epu16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_div_epu32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_div_epu32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = a_.u32 / b_.u32; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_div_epu32(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] / b_.u32[i]; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_div_epu32 - #define _mm512_div_epu32(a, b) simde_mm512_div_epu32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_div_epu32(simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_div_epu32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_div_epu32(a, b)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_div_epu32 - #define _mm512_mask_div_epu32(src, k, a, b) simde_mm512_mask_div_epu32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_div_epu64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_div_epu64(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = a_.u64 / b_.u64; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_div_epu64(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] / b_.u64[i]; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_div_epu64 - #define _mm512_div_epu64(a, b) simde_mm512_div_epu64((a), (b)) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_erf_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_erf_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_erff4_u10(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_erff(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_erf_ps - #define _mm_erf_ps(a) simde_mm_erf_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_erf_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_erf_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_erfd2_u10(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_erf(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_erf_pd - #define _mm_erf_pd(a) simde_mm_erf_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_erf_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_erf_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_erff8_u10(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_erf_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_erff(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_erf_ps - #define _mm256_erf_ps(a) simde_mm256_erf_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_erf_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_erf_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_erfd4_u10(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_erf_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_erf(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_erf_pd - #define _mm256_erf_pd(a) simde_mm256_erf_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_erf_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_erf_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_erff16_u10(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_erf_ps(a_.m256[i]); - } 
- #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_erff(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_erf_ps - #define _mm512_erf_ps(a) simde_mm512_erf_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_erf_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_erf_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_erfd8_u10(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_erf_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_erf(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_erf_pd - #define _mm512_erf_pd(a) simde_mm512_erf_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_erf_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_erf_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_erf_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_erf_ps - #define _mm512_mask_erf_ps(src, k, a) simde_mm512_mask_erf_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_erf_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_erf_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_erf_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_erf_pd - #define _mm512_mask_erf_pd(src, k, a) simde_mm512_mask_erf_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_erfc_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_erfc_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_erfcf4_u15(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_erfcf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_erfc_ps - #define _mm_erfc_ps(a) simde_mm_erfc_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_erfc_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_erfc_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_erfcd2_u15(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_erfc(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_erfc_pd - #define _mm_erfc_pd(a) simde_mm_erfc_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 
-simde_mm256_erfc_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_erfc_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_erfcf8_u15(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_erfc_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_erfcf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_erfc_ps - #define _mm256_erfc_ps(a) simde_mm256_erfc_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_erfc_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_erfc_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_erfcd4_u15(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_erfc_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_erfc(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_erfc_pd - #define _mm256_erfc_pd(a) simde_mm256_erfc_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_erfc_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_erfc_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_erfcf16_u15(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_erfc_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_erfcf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_erfc_ps - #define _mm512_erfc_ps(a) simde_mm512_erfc_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_erfc_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_erfc_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_erfcd8_u15(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_erfc_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_erfc(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_erfc_pd - #define _mm512_erfc_pd(a) simde_mm512_erfc_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_erfc_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { 
- #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_erfc_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_erfc_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_erfc_ps - #define _mm512_mask_erfc_ps(src, k, a) simde_mm512_mask_erfc_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_erfc_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_erfc_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_erfc_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_erfc_pd - #define _mm512_mask_erfc_pd(src, k, a) simde_mm512_mask_erfc_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_exp_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_exp_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_expf4_u10(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_expf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_exp_ps - #define _mm_exp_ps(a) simde_mm_exp_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_exp_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_exp_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_expd2_u10(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_exp(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_exp_pd - #define _mm_exp_pd(a) simde_mm_exp_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_exp_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_exp_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_expf8_u10(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_exp_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_expf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_exp_ps - #define _mm256_exp_ps(a) simde_mm256_exp_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_exp_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_exp_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_expd4_u10(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_exp_pd(a_.m128d[i]); - } - #else 
- SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_exp(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_exp_pd - #define _mm256_exp_pd(a) simde_mm256_exp_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_exp_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_exp_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_expf16_u10(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_exp_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_expf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_exp_ps - #define _mm512_exp_ps(a) simde_mm512_exp_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_exp_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_exp_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_expd8_u10(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_exp_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_exp(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_exp_pd - #define _mm512_exp_pd(a) simde_mm512_exp_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_exp_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_exp_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_exp_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_exp_ps - #define _mm512_mask_exp_ps(src, k, a) simde_mm512_mask_exp_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_exp_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_exp_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_exp_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_exp_pd - #define _mm512_mask_exp_pd(src, k, a) simde_mm512_mask_exp_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_expm1_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_expm1_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_expm1f4_u10(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_expm1f(a_.f32[i]); - } - - return simde__m128_from_private(r_); 
- #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_expm1_ps - #define _mm_expm1_ps(a) simde_mm_expm1_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_expm1_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_expm1_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_expm1d2_u10(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_expm1(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_expm1_pd - #define _mm_expm1_pd(a) simde_mm_expm1_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_expm1_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_expm1_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_expm1f8_u10(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_expm1_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_expm1f(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_expm1_ps - #define _mm256_expm1_ps(a) simde_mm256_expm1_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_expm1_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_expm1_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_expm1d4_u10(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_expm1_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_expm1(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_expm1_pd - #define _mm256_expm1_pd(a) simde_mm256_expm1_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_expm1_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_expm1_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_expm1f16_u10(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_expm1_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_expm1f(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_expm1_ps - #define _mm512_expm1_ps(a) simde_mm512_expm1_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_expm1_pd (simde__m512d a) { - #if 
defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_expm1_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_expm1d8_u10(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_expm1_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_expm1(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_expm1_pd - #define _mm512_expm1_pd(a) simde_mm512_expm1_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_expm1_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_expm1_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_expm1_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_expm1_ps - #define _mm512_mask_expm1_ps(src, k, a) simde_mm512_mask_expm1_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_expm1_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_expm1_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_expm1_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_expm1_pd - #define _mm512_mask_expm1_pd(src, k, a) simde_mm512_mask_expm1_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_exp2_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_exp2_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_exp2f4_u10(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_exp2f(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_exp2_ps - #define _mm_exp2_ps(a) simde_mm_exp2_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_exp2_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_exp2_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_exp2d2_u10(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_exp2(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_exp2_pd - #define _mm_exp2_pd(a) simde_mm_exp2_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_exp2_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_exp2_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_exp2f8_u10(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / 
sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_exp2_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_exp2f(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_exp2_ps - #define _mm256_exp2_ps(a) simde_mm256_exp2_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_exp2_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_exp2_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_exp2d4_u10(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_exp2_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_exp2(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_exp2_pd - #define _mm256_exp2_pd(a) simde_mm256_exp2_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_exp2_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_exp2_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_exp2f16_u10(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_exp2_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_exp2f(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_exp2_ps - #define _mm512_exp2_ps(a) simde_mm512_exp2_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_exp2_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_exp2_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_exp2d8_u10(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_exp2_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_exp2(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_exp2_pd - #define _mm512_exp2_pd(a) simde_mm512_exp2_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_exp2_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_exp2_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_exp2_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_exp2_ps - #define _mm512_mask_exp2_ps(src, k, a) simde_mm512_mask_exp2_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES 
-simde__m512d -simde_mm512_mask_exp2_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_exp2_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_exp2_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_exp2_pd - #define _mm512_mask_exp2_pd(src, k, a) simde_mm512_mask_exp2_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_exp10_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_exp10_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_exp10f4_u10(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_exp10f(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_exp10_ps - #define _mm_exp10_ps(a) simde_mm_exp10_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_exp10_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_exp10_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_exp10d2_u10(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_exp10(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_exp10_pd - #define _mm_exp10_pd(a) simde_mm_exp10_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_exp10_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_exp10_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_exp10f8_u10(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_exp10_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_exp10f(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_exp10_ps - #define _mm256_exp10_ps(a) simde_mm256_exp10_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_exp10_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_exp10_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_exp10d4_u10(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_exp10_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_exp10(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_exp10_pd - #define _mm256_exp10_pd(a) simde_mm256_exp10_pd(a) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_exp10_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_exp10_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_exp10f16_u10(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_exp10_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_exp10f(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_exp10_ps - #define _mm512_exp10_ps(a) simde_mm512_exp10_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_exp10_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_exp10_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_exp10d8_u10(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_exp10_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_exp10(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_exp10_pd - #define _mm512_exp10_pd(a) simde_mm512_exp10_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_exp10_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_exp10_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_exp10_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_exp10_ps - #define _mm512_mask_exp10_ps(src, k, a) simde_mm512_mask_exp10_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_exp10_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_exp10_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_exp10_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_exp10_pd - #define _mm512_mask_exp10_pd(src, k, a) simde_mm512_mask_exp10_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cdfnorm_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_cdfnorm_ps(a); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - /* https://www.johndcook.com/blog/cpp_phi/ */ - const simde__m128 a1 = simde_mm_set1_ps(SIMDE_FLOAT32_C( 0.254829592)); - const simde__m128 a2 = simde_mm_set1_ps(SIMDE_FLOAT32_C(-0.284496736)); - const simde__m128 a3 = simde_mm_set1_ps(SIMDE_FLOAT32_C(1.421413741)); - const simde__m128 a4 = simde_mm_set1_ps(SIMDE_FLOAT32_C(-1.453152027)); - const simde__m128 a5 = simde_mm_set1_ps(SIMDE_FLOAT32_C(1.061405429)); - const simde__m128 p = simde_mm_set1_ps(SIMDE_FLOAT32_C(0.3275911)); - const simde__m128 one = simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0)); - - /* simde_math_fabsf(x) / 
sqrtf(2.0) */ - const simde__m128 x = simde_mm_div_ps(simde_x_mm_abs_ps(a), simde_mm_sqrt_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(2.0)))); - - /* 1.0 / (1.0 + p * x) */ - const simde__m128 t = simde_mm_div_ps(one, simde_mm_add_ps(one, simde_mm_mul_ps(p, x))); - - /* 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp(-x * x) */ - simde__m128 y = simde_mm_mul_ps(a5, t); - y = simde_mm_add_ps(y, a4); - y = simde_mm_mul_ps(y, t); - y = simde_mm_add_ps(y, a3); - y = simde_mm_mul_ps(y, t); - y = simde_mm_add_ps(y, a2); - y = simde_mm_mul_ps(y, t); - y = simde_mm_add_ps(y, a1); - y = simde_mm_mul_ps(y, t); - y = simde_mm_mul_ps(y, simde_mm_exp_ps(simde_mm_mul_ps(x, simde_x_mm_negate_ps(x)))); - y = simde_mm_sub_ps(one, y); - - /* 0.5 * (1.0 + ((a < 0.0) ? -y : y)) */ - return simde_mm_mul_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(0.5)), simde_mm_add_ps(one, simde_x_mm_xorsign_ps(y, a))); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_cdfnormf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_cdfnorm_ps - #define _mm_cdfnorm_ps(a) simde_mm_cdfnorm_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cdfnorm_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_cdfnorm_pd(a); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - /* https://www.johndcook.com/blog/cpp_phi/ */ - const simde__m128d a1 = simde_mm_set1_pd(SIMDE_FLOAT64_C( 0.254829592)); - const simde__m128d a2 = simde_mm_set1_pd(SIMDE_FLOAT64_C(-0.284496736)); - const simde__m128d a3 = simde_mm_set1_pd(SIMDE_FLOAT64_C(1.421413741)); - const simde__m128d a4 = simde_mm_set1_pd(SIMDE_FLOAT64_C(-1.453152027)); - const simde__m128d a5 = simde_mm_set1_pd(SIMDE_FLOAT64_C(1.061405429)); - const simde__m128d p = simde_mm_set1_pd(SIMDE_FLOAT64_C(0.6475911)); - const simde__m128d one = simde_mm_set1_pd(SIMDE_FLOAT64_C(1.0)); - - /* simde_math_fabs(x) / sqrt(2.0) */ - const simde__m128d x = simde_mm_div_pd(simde_x_mm_abs_pd(a), simde_mm_sqrt_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C(2.0)))); - - /* 1.0 / (1.0 + p * x) */ - const simde__m128d t = simde_mm_div_pd(one, simde_mm_add_pd(one, simde_mm_mul_pd(p, x))); - - /* 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp(-x * x) */ - simde__m128d y = simde_mm_mul_pd(a5, t); - y = simde_mm_add_pd(y, a4); - y = simde_mm_mul_pd(y, t); - y = simde_mm_add_pd(y, a3); - y = simde_mm_mul_pd(y, t); - y = simde_mm_add_pd(y, a2); - y = simde_mm_mul_pd(y, t); - y = simde_mm_add_pd(y, a1); - y = simde_mm_mul_pd(y, t); - y = simde_mm_mul_pd(y, simde_mm_exp_pd(simde_mm_mul_pd(x, simde_x_mm_negate_pd(x)))); - y = simde_mm_sub_pd(one, y); - - /* 0.5 * (1.0 + ((a < 0.0) ? 
-y : y)) */ - return simde_mm_mul_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C(0.5)), simde_mm_add_pd(one, simde_x_mm_xorsign_pd(y, a))); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_cdfnorm(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_cdfnorm_pd - #define _mm_cdfnorm_pd(a) simde_mm_cdfnorm_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_cdfnorm_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cdfnorm_ps(a); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - /* https://www.johndcook.com/blog/cpp_phi/ */ - const simde__m256 a1 = simde_mm256_set1_ps(SIMDE_FLOAT32_C( 0.254829592)); - const simde__m256 a2 = simde_mm256_set1_ps(SIMDE_FLOAT32_C(-0.284496736)); - const simde__m256 a3 = simde_mm256_set1_ps(SIMDE_FLOAT32_C(1.421413741)); - const simde__m256 a4 = simde_mm256_set1_ps(SIMDE_FLOAT32_C(-1.453152027)); - const simde__m256 a5 = simde_mm256_set1_ps(SIMDE_FLOAT32_C(1.061405429)); - const simde__m256 p = simde_mm256_set1_ps(SIMDE_FLOAT32_C(0.3275911)); - const simde__m256 one = simde_mm256_set1_ps(SIMDE_FLOAT32_C(1.0)); - - /* simde_math_fabsf(x) / sqrtf(2.0) */ - const simde__m256 x = simde_mm256_div_ps(simde_x_mm256_abs_ps(a), simde_mm256_sqrt_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C(2.0)))); - - /* 1.0 / (1.0 + p * x) */ - const simde__m256 t = simde_mm256_div_ps(one, simde_mm256_add_ps(one, simde_mm256_mul_ps(p, x))); - - /* 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp(-x * x) */ - simde__m256 y = simde_mm256_mul_ps(a5, t); - y = simde_mm256_add_ps(y, a4); - y = simde_mm256_mul_ps(y, t); - y = simde_mm256_add_ps(y, a3); - y = simde_mm256_mul_ps(y, t); - y = simde_mm256_add_ps(y, a2); - y = simde_mm256_mul_ps(y, t); - y = simde_mm256_add_ps(y, a1); - y = simde_mm256_mul_ps(y, t); - y = simde_mm256_mul_ps(y, simde_mm256_exp_ps(simde_mm256_mul_ps(x, simde_x_mm256_negate_ps(x)))); - y = simde_mm256_sub_ps(one, y); - - /* 0.5 * (1.0 + ((a < 0.0) ? 
-y : y)) */ - return simde_mm256_mul_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C(0.5)), simde_mm256_add_ps(one, simde_x_mm256_xorsign_ps(y, a))); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_cdfnorm_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_cdfnormf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_cdfnorm_ps - #define _mm256_cdfnorm_ps(a) simde_mm256_cdfnorm_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_cdfnorm_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cdfnorm_pd(a); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - /* https://www.johndcook.com/blog/cpp_phi/ */ - const simde__m256d a1 = simde_mm256_set1_pd(SIMDE_FLOAT64_C( 0.254829592)); - const simde__m256d a2 = simde_mm256_set1_pd(SIMDE_FLOAT64_C(-0.284496736)); - const simde__m256d a3 = simde_mm256_set1_pd(SIMDE_FLOAT64_C(1.421413741)); - const simde__m256d a4 = simde_mm256_set1_pd(SIMDE_FLOAT64_C(-1.453152027)); - const simde__m256d a5 = simde_mm256_set1_pd(SIMDE_FLOAT64_C(1.061405429)); - const simde__m256d p = simde_mm256_set1_pd(SIMDE_FLOAT64_C(0.6475911)); - const simde__m256d one = simde_mm256_set1_pd(SIMDE_FLOAT64_C(1.0)); - - /* simde_math_fabs(x) / sqrt(2.0) */ - const simde__m256d x = simde_mm256_div_pd(simde_x_mm256_abs_pd(a), simde_mm256_sqrt_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(2.0)))); - - /* 1.0 / (1.0 + p * x) */ - const simde__m256d t = simde_mm256_div_pd(one, simde_mm256_add_pd(one, simde_mm256_mul_pd(p, x))); - - /* 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp(-x * x) */ - simde__m256d y = simde_mm256_mul_pd(a5, t); - y = simde_mm256_add_pd(y, a4); - y = simde_mm256_mul_pd(y, t); - y = simde_mm256_add_pd(y, a3); - y = simde_mm256_mul_pd(y, t); - y = simde_mm256_add_pd(y, a2); - y = simde_mm256_mul_pd(y, t); - y = simde_mm256_add_pd(y, a1); - y = simde_mm256_mul_pd(y, t); - y = simde_mm256_mul_pd(y, simde_mm256_exp_pd(simde_mm256_mul_pd(x, simde_x_mm256_negate_pd(x)))); - y = simde_mm256_sub_pd(one, y); - - /* 0.5 * (1.0 + ((a < 0.0) ? 
-y : y)) */ - return simde_mm256_mul_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(0.5)), simde_mm256_add_pd(one, simde_x_mm256_xorsign_pd(y, a))); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_cdfnorm_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_cdfnorm(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_cdfnorm_pd - #define _mm256_cdfnorm_pd(a) simde_mm256_cdfnorm_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_cdfnorm_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cdfnorm_ps(a); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - /* https://www.johndcook.com/blog/cpp_phi/ */ - const simde__m512 a1 = simde_mm512_set1_ps(SIMDE_FLOAT32_C( 0.254829592)); - const simde__m512 a2 = simde_mm512_set1_ps(SIMDE_FLOAT32_C(-0.284496736)); - const simde__m512 a3 = simde_mm512_set1_ps(SIMDE_FLOAT32_C(1.421413741)); - const simde__m512 a4 = simde_mm512_set1_ps(SIMDE_FLOAT32_C(-1.453152027)); - const simde__m512 a5 = simde_mm512_set1_ps(SIMDE_FLOAT32_C(1.061405429)); - const simde__m512 p = simde_mm512_set1_ps(SIMDE_FLOAT32_C(0.3275911)); - const simde__m512 one = simde_mm512_set1_ps(SIMDE_FLOAT32_C(1.0)); - - /* simde_math_fabsf(x) / sqrtf(2.0) */ - const simde__m512 x = simde_mm512_div_ps(simde_mm512_abs_ps(a), simde_mm512_sqrt_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C(2.0)))); - - /* 1.0 / (1.0 + p * x) */ - const simde__m512 t = simde_mm512_div_ps(one, simde_mm512_add_ps(one, simde_mm512_mul_ps(p, x))); - - /* 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp(-x * x) */ - simde__m512 y = simde_mm512_mul_ps(a5, t); - y = simde_mm512_add_ps(y, a4); - y = simde_mm512_mul_ps(y, t); - y = simde_mm512_add_ps(y, a3); - y = simde_mm512_mul_ps(y, t); - y = simde_mm512_add_ps(y, a2); - y = simde_mm512_mul_ps(y, t); - y = simde_mm512_add_ps(y, a1); - y = simde_mm512_mul_ps(y, t); - y = simde_mm512_mul_ps(y, simde_mm512_exp_ps(simde_mm512_mul_ps(x, simde_x_mm512_negate_ps(x)))); - y = simde_mm512_sub_ps(one, y); - - /* 0.5 * (1.0 + ((a < 0.0) ? 
-y : y)) */ - return simde_mm512_mul_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C(0.5)), simde_mm512_add_ps(one, simde_x_mm512_xorsign_ps(y, a))); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_cdfnorm_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_cdfnormf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_cdfnorm_ps - #define _mm512_cdfnorm_ps(a) simde_mm512_cdfnorm_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_cdfnorm_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cdfnorm_pd(a); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - /* https://www.johndcook.com/blog/cpp_phi/ */ - const simde__m512d a1 = simde_mm512_set1_pd(SIMDE_FLOAT64_C( 0.254829592)); - const simde__m512d a2 = simde_mm512_set1_pd(SIMDE_FLOAT64_C(-0.284496736)); - const simde__m512d a3 = simde_mm512_set1_pd(SIMDE_FLOAT64_C(1.421413741)); - const simde__m512d a4 = simde_mm512_set1_pd(SIMDE_FLOAT64_C(-1.453152027)); - const simde__m512d a5 = simde_mm512_set1_pd(SIMDE_FLOAT64_C(1.061405429)); - const simde__m512d p = simde_mm512_set1_pd(SIMDE_FLOAT64_C(0.6475911)); - const simde__m512d one = simde_mm512_set1_pd(SIMDE_FLOAT64_C(1.0)); - - /* simde_math_fabs(x) / sqrt(2.0) */ - const simde__m512d x = simde_mm512_div_pd(simde_mm512_abs_pd(a), simde_mm512_sqrt_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C(2.0)))); - - /* 1.0 / (1.0 + p * x) */ - const simde__m512d t = simde_mm512_div_pd(one, simde_mm512_add_pd(one, simde_mm512_mul_pd(p, x))); - - /* 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * exp(-x * x) */ - simde__m512d y = simde_mm512_mul_pd(a5, t); - y = simde_mm512_add_pd(y, a4); - y = simde_mm512_mul_pd(y, t); - y = simde_mm512_add_pd(y, a3); - y = simde_mm512_mul_pd(y, t); - y = simde_mm512_add_pd(y, a2); - y = simde_mm512_mul_pd(y, t); - y = simde_mm512_add_pd(y, a1); - y = simde_mm512_mul_pd(y, t); - y = simde_mm512_mul_pd(y, simde_mm512_exp_pd(simde_mm512_mul_pd(x, simde_x_mm512_negate_pd(x)))); - y = simde_mm512_sub_pd(one, y); - - /* 0.5 * (1.0 + ((a < 0.0) ? 
-y : y)) */ - return simde_mm512_mul_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C(0.5)), simde_mm512_add_pd(one, simde_x_mm512_xorsign_pd(y, a))); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_cdfnorm_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_cdfnorm(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_cdfnorm_pd - #define _mm512_cdfnorm_pd(a) simde_mm512_cdfnorm_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_cdfnorm_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cdfnorm_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_cdfnorm_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cdfnorm_ps - #define _mm512_mask_cdfnorm_ps(src, k, a) simde_mm512_mask_cdfnorm_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_cdfnorm_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cdfnorm_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_cdfnorm_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cdfnorm_pd - #define _mm512_mask_cdfnorm_pd(src, k, a) simde_mm512_mask_cdfnorm_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_idivrem_epi32 (simde__m128i* mem_addr, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) - return _mm_idivrem_epi32(HEDLEY_REINTERPRET_CAST(__m128i*, mem_addr), a, b); - #else - simde__m128i r; - - r = simde_mm_div_epi32(a, b); - *mem_addr = simde_mm_sub_epi32(a, simde_mm_mullo_epi32(r, b)); - - return r; - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_idivrem_epi32 - #define _mm_idivrem_epi32(mem_addr, a, b) simde_mm_idivrem_epi32((mem_addr),(a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_idivrem_epi32 (simde__m256i* mem_addr, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_idivrem_epi32(HEDLEY_REINTERPRET_CAST(__m256i*, mem_addr), a, b); - #else - simde__m256i r; - - r = simde_mm256_div_epi32(a, b); - *mem_addr = simde_mm256_sub_epi32(a, simde_mm256_mullo_epi32(r, b)); - - return r; - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_idivrem_epi32 - #define _mm256_idivrem_epi32(mem_addr, a, b) simde_mm256_idivrem_epi32((mem_addr),(a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_hypot_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_hypot_ps(a, b); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_hypotf4_u05(a, b); - #else - return Sleef_hypotf4_u35(a, b); - #endif - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; 
i++) { - r_.f32[i] = simde_math_hypotf(a_.f32[i], b_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_hypot_ps - #define _mm_hypot_ps(a, b) simde_mm_hypot_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_hypot_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_hypot_pd(a, b); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_hypotd2_u05(a, b); - #else - return Sleef_hypotd2_u35(a, b); - #endif - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_hypot(a_.f64[i], b_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_hypot_pd - #define _mm_hypot_pd(a, b) simde_mm_hypot_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_hypot_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_hypot_ps(a, b); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_hypotf8_u05(a, b); - #else - return Sleef_hypotf8_u35(a, b); - #endif - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_hypot_ps(a_.m128[i], b_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_hypotf(a_.f32[i], b_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_hypot_ps - #define _mm256_hypot_ps(a, b) simde_mm256_hypot_ps(a, b) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_hypot_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_hypot_pd(a, b); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_hypotd4_u05(a, b); - #else - return Sleef_hypotd4_u35(a, b); - #endif - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_hypot_pd(a_.m128d[i], b_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_hypot(a_.f64[i], b_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_hypot_pd - #define _mm256_hypot_pd(a, b) simde_mm256_hypot_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_hypot_ps (simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_hypot_ps(a, b); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_hypotf16_u05(a, b); - #else - return 
Sleef_hypotf16_u35(a, b); - #endif - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_hypot_ps(a_.m256[i], b_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_hypotf(a_.f32[i], b_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_hypot_ps - #define _mm512_hypot_ps(a, b) simde_mm512_hypot_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_hypot_pd (simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_hypot_pd(a, b); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_hypotd8_u05(a, b); - #else - return Sleef_hypotd8_u35(a, b); - #endif - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_hypot_pd(a_.m256d[i], b_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_hypot(a_.f64[i], b_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_hypot_pd - #define _mm512_hypot_pd(a, b) simde_mm512_hypot_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_hypot_ps(simde__m512 src, simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_hypot_ps(src, k, a, b); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_hypot_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_hypot_ps - #define _mm512_mask_hypot_ps(src, k, a, b) simde_mm512_mask_hypot_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_hypot_pd(simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_hypot_pd(src, k, a, b); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_hypot_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_hypot_pd - #define _mm512_mask_hypot_pd(src, k, a, b) simde_mm512_mask_hypot_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_invcbrt_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_invcbrt_ps(a); - #else - return simde_mm_rcp_ps(simde_mm_cbrt_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_invcbrt_ps - #define _mm_invcbrt_ps(a) simde_mm_invcbrt_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_invcbrt_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_invcbrt_pd(a); - #else - return simde_mm_div_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C(1.0)), simde_mm_cbrt_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_invcbrt_pd - #define _mm_invcbrt_pd(a) 
simde_mm_invcbrt_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_invcbrt_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_invcbrt_ps(a); - #else - return simde_mm256_rcp_ps(simde_mm256_cbrt_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_invcbrt_ps - #define _mm256_invcbrt_ps(a) simde_mm256_invcbrt_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_invcbrt_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_invcbrt_pd(a); - #else - return simde_mm256_div_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(1.0)), simde_mm256_cbrt_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_invcbrt_pd - #define _mm256_invcbrt_pd(a) simde_mm256_invcbrt_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_invsqrt_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_invsqrt_ps(a); - #else - return simde_mm_rcp_ps(simde_mm_sqrt_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_invsqrt_ps - #define _mm_invsqrt_ps(a) simde_mm_invsqrt_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_invsqrt_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_invsqrt_pd(a); - #else - return simde_mm_div_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C(1.0)), simde_mm_sqrt_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_invsqrt_pd - #define _mm_invsqrt_pd(a) simde_mm_invsqrt_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_invsqrt_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_invsqrt_ps(a); - #else - return simde_mm256_rcp_ps(simde_mm256_sqrt_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_invsqrt_ps - #define _mm256_invsqrt_ps(a) simde_mm256_invsqrt_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_invsqrt_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_invsqrt_pd(a); - #else - return simde_mm256_div_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(1.0)), simde_mm256_sqrt_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_invsqrt_pd - #define _mm256_invsqrt_pd(a) simde_mm256_invsqrt_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_invsqrt_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_invsqrt_ps(a); - #else - return simde_mm512_div_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C(1.0)), simde_mm512_sqrt_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_invsqrt_ps - #define _mm512_invsqrt_ps(a) simde_mm512_invsqrt_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_invsqrt_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_invsqrt_pd(a); - #else - return simde_mm512_div_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C(1.0)), simde_mm512_sqrt_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_invsqrt_pd - #define _mm512_invsqrt_pd(a) simde_mm512_invsqrt_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_invsqrt_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if 
defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_invsqrt_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_invsqrt_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_invsqrt_ps - #define _mm512_mask_invsqrt_ps(src, k, a) simde_mm512_mask_invsqrt_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_invsqrt_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_invsqrt_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_invsqrt_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_invsqrt_pd - #define _mm512_mask_invsqrt_pd(src, k, a) simde_mm512_mask_invsqrt_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_log_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_log_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_logf4_u10(a); - #else - return Sleef_logf4_u35(a); - #endif - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_logf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_log_ps - #define _mm_log_ps(a) simde_mm_log_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_log_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_log_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_logd2_u10(a); - #else - return Sleef_logd2_u35(a); - #endif - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_log(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_log_pd - #define _mm_log_pd(a) simde_mm_log_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_log_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_log_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_logf8_u10(a); - #else - return Sleef_logf8_u35(a); - #endif - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_log_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_logf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_log_ps - #define _mm256_log_ps(a) simde_mm256_log_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_log_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_log_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - 
#if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_logd4_u10(a); - #else - return Sleef_logd4_u35(a); - #endif - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_log_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_log(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_log_pd - #define _mm256_log_pd(a) simde_mm256_log_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_log_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_log_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_logf16_u10(a); - #else - return Sleef_logf16_u35(a); - #endif - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_log_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_logf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_log_ps - #define _mm512_log_ps(a) simde_mm512_log_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_log_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_log_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_logd8_u10(a); - #else - return Sleef_logd8_u35(a); - #endif - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_log_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_log(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_log_pd - #define _mm512_log_pd(a) simde_mm512_log_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_log_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_log_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_log_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_log_ps - #define _mm512_mask_log_ps(src, k, a) simde_mm512_mask_log_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_log_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_log_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_log_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_log_pd - #define _mm512_mask_log_pd(src, k, a) simde_mm512_mask_log_pd(src, k, a) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_cdfnorminv_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_cdfnorminv_ps(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) - simde__m128 matched, retval = simde_mm_setzero_ps(); - - { /* if (a < 0 || a > 1) */ - matched = simde_mm_or_ps(simde_mm_cmplt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.0))), simde_mm_cmpgt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0)))); - - /* We don't actually need to do anything here since we initialize - * retval to 0.0. */ - } - - { /* else if (a == 0) */ - simde__m128 mask = simde_mm_cmpeq_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.0))); - mask = simde_mm_andnot_ps(matched, mask); - matched = simde_mm_or_ps(matched, mask); - - simde__m128 res = simde_mm_set1_ps(-SIMDE_MATH_INFINITYF); - - retval = simde_mm_or_ps(retval, simde_mm_and_ps(mask, res)); - } - - { /* else if (a == 1) */ - simde__m128 mask = simde_mm_cmpeq_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0))); - mask = simde_mm_andnot_ps(matched, mask); - matched = simde_mm_or_ps(matched, mask); - - simde__m128 res = simde_mm_set1_ps(SIMDE_MATH_INFINITYF); - - retval = simde_mm_or_ps(retval, simde_mm_and_ps(mask, res)); - } - - { /* Remaining conditions. - * - * Including the else case in this complicates things a lot, but - * we're using cheap operations to get rid of expensive multiply - * and add functions. This should be a small improvement on SSE - * prior to 4.1. On SSE 4.1 we can use _mm_blendv_ps which is - * very fast and this becomes a huge win. NEON, AltiVec, and - * WASM also have blend operations, so this should be a big win - * there, too. */ - - /* else if (a < 0.02425) */ - simde__m128 mask_lo = simde_mm_cmplt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.02425))); - /* else if (a > 0.97575) */ - simde__m128 mask_hi = simde_mm_cmpgt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.97575))); - - simde__m128 mask = simde_mm_or_ps(mask_lo, mask_hi); - matched = simde_mm_or_ps(matched, mask); - - /* else */ - simde__m128 mask_el = simde_x_mm_not_ps(matched); - mask = simde_mm_or_ps(mask, mask_el); - - /* r = a - 0.5f */ - simde__m128 r = simde_mm_sub_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.5))); - - /* lo: q = a - * hi: q = (1.0 - a) */ - simde__m128 q = simde_mm_and_ps(mask_lo, a); - q = simde_mm_or_ps(q, simde_mm_and_ps(mask_hi, simde_mm_sub_ps(simde_mm_set1_ps(1.0f), a))); - - /* q = simde_math_sqrtf(-2.0f * simde_math_logf(q)) */ - q = simde_mm_log_ps(q); - q = simde_mm_mul_ps(q, simde_mm_set1_ps(SIMDE_FLOAT32_C(-2.0))); - q = simde_mm_sqrt_ps(q); - - /* el: q = r * r */ - q = simde_x_mm_select_ps(q, simde_mm_mul_ps(r, r), mask_el); - - /* lo: float numerator = ((((((c_c[0] * q + c_c[1]) * q + c_c[2]) * q + c_c[3]) * q + c_c[4]) * q + c_c[5]) * 1.0f); */ - /* hi: float numerator = ((((((c_c[0] * q + c_c[1]) * q + c_c[2]) * q + c_c[3]) * q + c_c[4]) * q + c_c[5]) * -1.0f); */ - /* el: float numerator = ((((((c_a[0] * q + c_a[1]) * q + c_a[2]) * q + c_a[3]) * q + c_a[4]) * q + c_a[5]) * r); */ - simde__m128 numerator = simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(-7.784894002430293e-03)), simde_mm_set1_ps(SIMDE_FLOAT32_C(-3.969683028665376e+01)), mask_el); - numerator = simde_mm_fmadd_ps(numerator, q, simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(-3.223964580411365e-01)), simde_mm_set1_ps(SIMDE_FLOAT32_C( 2.209460984245205e+02)), mask_el)); - numerator = simde_mm_fmadd_ps(numerator, q, simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(-2.400758277161838e+00)), 
simde_mm_set1_ps(SIMDE_FLOAT32_C(-2.759285104469687e+02)), mask_el)); - numerator = simde_mm_fmadd_ps(numerator, q, simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(-2.549732539343734e+00)), simde_mm_set1_ps(SIMDE_FLOAT32_C( 1.383577518672690e+02)), mask_el)); - numerator = simde_mm_fmadd_ps(numerator, q, simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C( 4.374664141464968e+00)), simde_mm_set1_ps(SIMDE_FLOAT32_C(-3.066479806614716e+01)), mask_el)); - numerator = simde_mm_fmadd_ps(numerator, q, simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C( 2.938163982698783e+00)), simde_mm_set1_ps(SIMDE_FLOAT32_C( 2.506628277459239e+00)), mask_el)); - { - simde__m128 multiplier; - multiplier = simde_mm_and_ps(mask_lo, simde_mm_set1_ps(SIMDE_FLOAT32_C( 1.0))); - multiplier = simde_mm_or_ps(multiplier, simde_mm_and_ps(mask_hi, simde_mm_set1_ps(SIMDE_FLOAT32_C(-1.0)))); - multiplier = simde_mm_or_ps(multiplier, simde_mm_and_ps(mask_el, r)); - numerator = simde_mm_mul_ps(numerator, multiplier); - } - - /* lo/hi: float denominator = (((((c_d[0] * q + c_d[1]) * q + c_d[2]) * q + c_d[3]) * 1 + 0.0f) * q + 1); */ - /* el: float denominator = (((((c_b[0] * q + c_b[1]) * q + c_b[2]) * q + c_b[3]) * q + c_b[4]) * q + 1); */ - simde__m128 denominator = simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C( 7.784695709041462e-03)), simde_mm_set1_ps(SIMDE_FLOAT32_C(-5.447609879822406e+01)), mask_el); - denominator = simde_mm_fmadd_ps(denominator, q, simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C( 3.224671290700398e-01)), simde_mm_set1_ps(SIMDE_FLOAT32_C( 1.615858368580409e+02)), mask_el)); - denominator = simde_mm_fmadd_ps(denominator, q, simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C( 2.445134137142996e+00)), simde_mm_set1_ps(SIMDE_FLOAT32_C(-1.556989798598866e+02)), mask_el)); - denominator = simde_mm_fmadd_ps(denominator, q, simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C( 3.754408661907416e+00)), simde_mm_set1_ps(SIMDE_FLOAT32_C( 6.680131188771972e+01)), mask_el)); - denominator = simde_mm_fmadd_ps(denominator, simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C( 1.0)), q, mask_el), - simde_x_mm_select_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C( 0.0)), simde_mm_set1_ps(SIMDE_FLOAT32_C(-1.328068155288572e+01)), mask_el)); - denominator = simde_mm_fmadd_ps(denominator, q, simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0))); - - /* res = numerator / denominator; */ - simde__m128 res = simde_mm_div_ps(numerator, denominator); - - retval = simde_mm_or_ps(retval, simde_mm_and_ps(mask, res)); - } - - return retval; - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_cdfnorminvf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_cdfnorminv_ps - #define _mm_cdfnorminv_ps(a) simde_mm_cdfnorminv_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_cdfnorminv_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_cdfnorminv_pd(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) - simde__m128d matched, retval = simde_mm_setzero_pd(); - - { /* if (a < 0 || a > 1) */ - matched = simde_mm_or_pd(simde_mm_cmplt_pd(a, simde_mm_set1_pd(SIMDE_FLOAT64_C(0.0))), simde_mm_cmpgt_pd(a, simde_mm_set1_pd(SIMDE_FLOAT64_C(1.0)))); - - /* We don't actually need to do anything here since we initialize - * retval to 0.0. 
*/ - } - - { /* else if (a == 0) */ - simde__m128d mask = simde_mm_cmpeq_pd(a, simde_mm_set1_pd(SIMDE_FLOAT64_C(0.0))); - mask = simde_mm_andnot_pd(matched, mask); - matched = simde_mm_or_pd(matched, mask); - - simde__m128d res = simde_mm_set1_pd(-SIMDE_MATH_INFINITY); - - retval = simde_mm_or_pd(retval, simde_mm_and_pd(mask, res)); - } - - { /* else if (a == 1) */ - simde__m128d mask = simde_mm_cmpeq_pd(a, simde_mm_set1_pd(SIMDE_FLOAT64_C(1.0))); - mask = simde_mm_andnot_pd(matched, mask); - matched = simde_mm_or_pd(matched, mask); - - simde__m128d res = simde_mm_set1_pd(SIMDE_MATH_INFINITY); - - retval = simde_mm_or_pd(retval, simde_mm_and_pd(mask, res)); - } - - { /* Remaining conditions. - * - * Including the else case in this complicates things a lot, but - * we're using cheap operations to get rid of expensive multiply - * and add functions. This should be a small improvement on SSE - * prior to 4.1. On SSE 4.1 we can use _mm_blendv_pd which is - * very fast and this becomes a huge win. NEON, AltiVec, and - * WASM also have blend operations, so this should be a big win - * there, too. */ - - /* else if (a < 0.02425) */ - simde__m128d mask_lo = simde_mm_cmplt_pd(a, simde_mm_set1_pd(SIMDE_FLOAT64_C(0.02425))); - /* else if (a > 0.97575) */ - simde__m128d mask_hi = simde_mm_cmpgt_pd(a, simde_mm_set1_pd(SIMDE_FLOAT64_C(0.97575))); - - simde__m128d mask = simde_mm_or_pd(mask_lo, mask_hi); - matched = simde_mm_or_pd(matched, mask); - - /* else */ - simde__m128d mask_el = simde_x_mm_not_pd(matched); - mask = simde_mm_or_pd(mask, mask_el); - - /* r = a - 0.5 */ - simde__m128d r = simde_mm_sub_pd(a, simde_mm_set1_pd(SIMDE_FLOAT64_C(0.5))); - - /* lo: q = a - * hi: q = (1.0 - a) */ - simde__m128d q = simde_mm_and_pd(mask_lo, a); - q = simde_mm_or_pd(q, simde_mm_and_pd(mask_hi, simde_mm_sub_pd(simde_mm_set1_pd(1.0), a))); - - /* q = simde_math_sqrt(-2.0 * simde_math_log(q)) */ - q = simde_mm_log_pd(q); - q = simde_mm_mul_pd(q, simde_mm_set1_pd(SIMDE_FLOAT64_C(-2.0))); - q = simde_mm_sqrt_pd(q); - - /* el: q = r * r */ - q = simde_x_mm_select_pd(q, simde_mm_mul_pd(r, r), mask_el); - - /* lo: double numerator = ((((((c_c[0] * q + c_c[1]) * q + c_c[2]) * q + c_c[3]) * q + c_c[4]) * q + c_c[5]) * 1.0); */ - /* hi: double numerator = ((((((c_c[0] * q + c_c[1]) * q + c_c[2]) * q + c_c[3]) * q + c_c[4]) * q + c_c[5]) * -1.0); */ - /* el: double numerator = ((((((c_a[0] * q + c_a[1]) * q + c_a[2]) * q + c_a[3]) * q + c_a[4]) * q + c_a[5]) * r); */ - simde__m128d numerator = simde_x_mm_select_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C(-7.784894002430293e-03)), simde_mm_set1_pd(SIMDE_FLOAT64_C(-3.969683028665376e+01)), mask_el); - numerator = simde_mm_fmadd_pd(numerator, q, simde_x_mm_select_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C(-3.223964580411365e-01)), simde_mm_set1_pd(SIMDE_FLOAT64_C( 2.209460984245205e+02)), mask_el)); - numerator = simde_mm_fmadd_pd(numerator, q, simde_x_mm_select_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C(-2.400758277161838e+00)), simde_mm_set1_pd(SIMDE_FLOAT64_C(-2.759285104469687e+02)), mask_el)); - numerator = simde_mm_fmadd_pd(numerator, q, simde_x_mm_select_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C(-2.549732539343734e+00)), simde_mm_set1_pd(SIMDE_FLOAT64_C( 1.383577518672690e+02)), mask_el)); - numerator = simde_mm_fmadd_pd(numerator, q, simde_x_mm_select_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C( 4.374664141464968e+00)), simde_mm_set1_pd(SIMDE_FLOAT64_C(-3.066479806614716e+01)), mask_el)); - numerator = simde_mm_fmadd_pd(numerator, q, simde_x_mm_select_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C( 
2.938163982698783e+00)), simde_mm_set1_pd(SIMDE_FLOAT64_C( 2.506628277459239e+00)), mask_el)); - { - simde__m128d multiplier; - multiplier = simde_mm_and_pd(mask_lo, simde_mm_set1_pd(SIMDE_FLOAT64_C( 1.0))); - multiplier = simde_mm_or_pd(multiplier, simde_mm_and_pd(mask_hi, simde_mm_set1_pd(SIMDE_FLOAT64_C(-1.0)))); - multiplier = simde_mm_or_pd(multiplier, simde_mm_and_pd(mask_el, r)); - numerator = simde_mm_mul_pd(numerator, multiplier); - } - - /* lo/hi: double denominator = (((((c_d[0] * q + c_d[1]) * q + c_d[2]) * q + c_d[3]) * 1 + 0.0f) * q + 1); */ - /* el: double denominator = (((((c_b[0] * q + c_b[1]) * q + c_b[2]) * q + c_b[3]) * q + c_b[4]) * q + 1); */ - simde__m128d denominator = simde_x_mm_select_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C( 7.784695709041462e-03)), simde_mm_set1_pd(SIMDE_FLOAT64_C(-5.447609879822406e+01)), mask_el); - denominator = simde_mm_fmadd_pd(denominator, q, simde_x_mm_select_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C( 3.224671290700398e-01)), simde_mm_set1_pd(SIMDE_FLOAT64_C( 1.615858368580409e+02)), mask_el)); - denominator = simde_mm_fmadd_pd(denominator, q, simde_x_mm_select_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C( 2.445134137142996e+00)), simde_mm_set1_pd(SIMDE_FLOAT64_C(-1.556989798598866e+02)), mask_el)); - denominator = simde_mm_fmadd_pd(denominator, q, simde_x_mm_select_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C( 3.754408661907416e+00)), simde_mm_set1_pd(SIMDE_FLOAT64_C( 6.680131188771972e+01)), mask_el)); - denominator = simde_mm_fmadd_pd(denominator, simde_x_mm_select_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C( 1.0)), q, mask_el), - simde_x_mm_select_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C( 0.0)), simde_mm_set1_pd(SIMDE_FLOAT64_C(-1.328068155288572e+01)), mask_el)); - denominator = simde_mm_fmadd_pd(denominator, q, simde_mm_set1_pd(SIMDE_FLOAT64_C(1.0))); - - /* res = numerator / denominator; */ - simde__m128d res = simde_mm_div_pd(numerator, denominator); - - retval = simde_mm_or_pd(retval, simde_mm_and_pd(mask, res)); - } - - return retval; - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_cdfnorminv(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_cdfnorminv_pd - #define _mm_cdfnorminv_pd(a) simde_mm_cdfnorminv_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_cdfnorminv_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cdfnorminv_ps(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(256) - simde__m256 matched, retval = simde_mm256_setzero_ps(); - - { /* if (a < 0 || a > 1) */ - matched = simde_mm256_or_ps(simde_mm256_cmp_ps(a, simde_mm256_set1_ps(SIMDE_FLOAT32_C(0.0)), SIMDE_CMP_LT_OQ), simde_mm256_cmp_ps(a, simde_mm256_set1_ps(SIMDE_FLOAT32_C(1.0)), SIMDE_CMP_GT_OQ)); - - /* We don't actually need to do anything here since we initialize - * retval to 0.0. 
*/ - } - - { /* else if (a == 0) */ - simde__m256 mask = simde_mm256_cmp_ps(a, simde_mm256_set1_ps(SIMDE_FLOAT32_C(0.0)), SIMDE_CMP_EQ_OQ); - mask = simde_mm256_andnot_ps(matched, mask); - matched = simde_mm256_or_ps(matched, mask); - - simde__m256 res = simde_mm256_set1_ps(-SIMDE_MATH_INFINITYF); - - retval = simde_mm256_or_ps(retval, simde_mm256_and_ps(mask, res)); - } - - { /* else if (a == 1) */ - simde__m256 mask = simde_mm256_cmp_ps(a, simde_mm256_set1_ps(SIMDE_FLOAT32_C(1.0)), SIMDE_CMP_EQ_OQ); - mask = simde_mm256_andnot_ps(matched, mask); - matched = simde_mm256_or_ps(matched, mask); - - simde__m256 res = simde_mm256_set1_ps(SIMDE_MATH_INFINITYF); - - retval = simde_mm256_or_ps(retval, simde_mm256_and_ps(mask, res)); - } - - { /* Remaining conditions. - * - * Including the else case in this complicates things a lot, but - * we're using cheap operations to get rid of expensive multiply - * and add functions. This should be a small improvement on SSE - * prior to 4.1. On SSE 4.1 we can use _mm256_blendv_ps which is - * very fast and this becomes a huge win. NEON, AltiVec, and - * WASM also have blend operations, so this should be a big win - * there, too. */ - - /* else if (a < 0.02425) */ - simde__m256 mask_lo = simde_mm256_cmp_ps(a, simde_mm256_set1_ps(SIMDE_FLOAT32_C(0.02425)), SIMDE_CMP_LT_OQ); - /* else if (a > 0.97575) */ - simde__m256 mask_hi = simde_mm256_cmp_ps(a, simde_mm256_set1_ps(SIMDE_FLOAT32_C(0.97575)), SIMDE_CMP_GT_OQ); - - simde__m256 mask = simde_mm256_or_ps(mask_lo, mask_hi); - matched = simde_mm256_or_ps(matched, mask); - - /* else */ - simde__m256 mask_el = simde_x_mm256_not_ps(matched); - mask = simde_mm256_or_ps(mask, mask_el); - - /* r = a - 0.5f */ - simde__m256 r = simde_mm256_sub_ps(a, simde_mm256_set1_ps(SIMDE_FLOAT32_C(0.5))); - - /* lo: q = a - * hi: q = (1.0 - a) */ - simde__m256 q = simde_mm256_and_ps(mask_lo, a); - q = simde_mm256_or_ps(q, simde_mm256_and_ps(mask_hi, simde_mm256_sub_ps(simde_mm256_set1_ps(1.0f), a))); - - /* q = simde_math_sqrtf(-2.0f * simde_math_logf(q)) */ - q = simde_mm256_log_ps(q); - q = simde_mm256_mul_ps(q, simde_mm256_set1_ps(SIMDE_FLOAT32_C(-2.0))); - q = simde_mm256_sqrt_ps(q); - - /* el: q = r * r */ - q = simde_x_mm256_select_ps(q, simde_mm256_mul_ps(r, r), mask_el); - - /* lo: float numerator = ((((((c_c[0] * q + c_c[1]) * q + c_c[2]) * q + c_c[3]) * q + c_c[4]) * q + c_c[5]) * 1.0f); */ - /* hi: float numerator = ((((((c_c[0] * q + c_c[1]) * q + c_c[2]) * q + c_c[3]) * q + c_c[4]) * q + c_c[5]) * -1.0f); */ - /* el: float numerator = ((((((c_a[0] * q + c_a[1]) * q + c_a[2]) * q + c_a[3]) * q + c_a[4]) * q + c_a[5]) * r); */ - simde__m256 numerator = simde_x_mm256_select_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C(-7.784894002430293e-03)), simde_mm256_set1_ps(SIMDE_FLOAT32_C(-3.969683028665376e+01)), mask_el); - numerator = simde_mm256_fmadd_ps(numerator, q, simde_x_mm256_select_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C(-3.223964580411365e-01)), simde_mm256_set1_ps(SIMDE_FLOAT32_C( 2.209460984245205e+02)), mask_el)); - numerator = simde_mm256_fmadd_ps(numerator, q, simde_x_mm256_select_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C(-2.400758277161838e+00)), simde_mm256_set1_ps(SIMDE_FLOAT32_C(-2.759285104469687e+02)), mask_el)); - numerator = simde_mm256_fmadd_ps(numerator, q, simde_x_mm256_select_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C(-2.549732539343734e+00)), simde_mm256_set1_ps(SIMDE_FLOAT32_C( 1.383577518672690e+02)), mask_el)); - numerator = simde_mm256_fmadd_ps(numerator, q, 
simde_x_mm256_select_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C( 4.374664141464968e+00)), simde_mm256_set1_ps(SIMDE_FLOAT32_C(-3.066479806614716e+01)), mask_el)); - numerator = simde_mm256_fmadd_ps(numerator, q, simde_x_mm256_select_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C( 2.938163982698783e+00)), simde_mm256_set1_ps(SIMDE_FLOAT32_C( 2.506628277459239e+00)), mask_el)); - { - simde__m256 multiplier; - multiplier = simde_mm256_and_ps(mask_lo, simde_mm256_set1_ps(SIMDE_FLOAT32_C( 1.0))); - multiplier = simde_mm256_or_ps(multiplier, simde_mm256_and_ps(mask_hi, simde_mm256_set1_ps(SIMDE_FLOAT32_C(-1.0)))); - multiplier = simde_mm256_or_ps(multiplier, simde_mm256_and_ps(mask_el, r)); - numerator = simde_mm256_mul_ps(numerator, multiplier); - } - - /* lo/hi: float denominator = (((((c_d[0] * q + c_d[1]) * q + c_d[2]) * q + c_d[3]) * 1 + 0.0f) * q + 1); */ - /* el: float denominator = (((((c_b[0] * q + c_b[1]) * q + c_b[2]) * q + c_b[3]) * q + c_b[4]) * q + 1); */ - simde__m256 denominator = simde_x_mm256_select_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C( 7.784695709041462e-03)), simde_mm256_set1_ps(SIMDE_FLOAT32_C(-5.447609879822406e+01)), mask_el); - denominator = simde_mm256_fmadd_ps(denominator, q, simde_x_mm256_select_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C( 3.224671290700398e-01)), simde_mm256_set1_ps(SIMDE_FLOAT32_C( 1.615858368580409e+02)), mask_el)); - denominator = simde_mm256_fmadd_ps(denominator, q, simde_x_mm256_select_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C( 2.445134137142996e+00)), simde_mm256_set1_ps(SIMDE_FLOAT32_C(-1.556989798598866e+02)), mask_el)); - denominator = simde_mm256_fmadd_ps(denominator, q, simde_x_mm256_select_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C( 3.754408661907416e+00)), simde_mm256_set1_ps(SIMDE_FLOAT32_C( 6.680131188771972e+01)), mask_el)); - denominator = simde_mm256_fmadd_ps(denominator, simde_x_mm256_select_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C( 1.0)), q, mask_el), - simde_x_mm256_select_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C( 0.0)), simde_mm256_set1_ps(SIMDE_FLOAT32_C(-1.328068155288572e+01)), mask_el)); - denominator = simde_mm256_fmadd_ps(denominator, q, simde_mm256_set1_ps(SIMDE_FLOAT32_C(1.0))); - - /* res = numerator / denominator; */ - simde__m256 res = simde_mm256_div_ps(numerator, denominator); - - retval = simde_mm256_or_ps(retval, simde_mm256_and_ps(mask, res)); - } - - return retval; - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_cdfnorminv_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_cdfnorminvf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_cdfnorminv_ps - #define _mm256_cdfnorminv_ps(a) simde_mm256_cdfnorminv_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_cdfnorminv_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_cdfnorminv_pd(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(256) - simde__m256d matched, retval = simde_mm256_setzero_pd(); - - { /* if (a < 0 || a > 1) */ - matched = simde_mm256_or_pd(simde_mm256_cmp_pd(a, simde_mm256_set1_pd(SIMDE_FLOAT64_C(0.0)), SIMDE_CMP_LT_OQ), simde_mm256_cmp_pd(a, simde_mm256_set1_pd(SIMDE_FLOAT64_C(1.0)), SIMDE_CMP_GT_OQ)); - - /* We don't actually need to do anything here since we 
initialize - * retval to 0.0. */ - } - - { /* else if (a == 0) */ - simde__m256d mask = simde_mm256_cmp_pd(a, simde_mm256_set1_pd(SIMDE_FLOAT64_C(0.0)), SIMDE_CMP_EQ_OQ); - mask = simde_mm256_andnot_pd(matched, mask); - matched = simde_mm256_or_pd(matched, mask); - - simde__m256d res = simde_mm256_set1_pd(-SIMDE_MATH_INFINITY); - - retval = simde_mm256_or_pd(retval, simde_mm256_and_pd(mask, res)); - } - - { /* else if (a == 1) */ - simde__m256d mask = simde_mm256_cmp_pd(a, simde_mm256_set1_pd(SIMDE_FLOAT64_C(1.0)), SIMDE_CMP_EQ_OQ); - mask = simde_mm256_andnot_pd(matched, mask); - matched = simde_mm256_or_pd(matched, mask); - - simde__m256d res = simde_mm256_set1_pd(SIMDE_MATH_INFINITY); - - retval = simde_mm256_or_pd(retval, simde_mm256_and_pd(mask, res)); - } - - { /* Remaining conditions. - * - * Including the else case in this complicates things a lot, but - * we're using cheap operations to get rid of expensive multiply - * and add functions. This should be a small improvement on SSE - * prior to 4.1. On SSE 4.1 we can use _mm256_blendv_pd which is - * very fast and this becomes a huge win. NEON, AltiVec, and - * WASM also have blend operations, so this should be a big win - * there, too. */ - - /* else if (a < 0.02425) */ - simde__m256d mask_lo = simde_mm256_cmp_pd(a, simde_mm256_set1_pd(SIMDE_FLOAT64_C(0.02425)), SIMDE_CMP_LT_OQ); - /* else if (a > 0.97575) */ - simde__m256d mask_hi = simde_mm256_cmp_pd(a, simde_mm256_set1_pd(SIMDE_FLOAT64_C(0.97575)), SIMDE_CMP_GT_OQ); - - simde__m256d mask = simde_mm256_or_pd(mask_lo, mask_hi); - matched = simde_mm256_or_pd(matched, mask); - - /* else */ - simde__m256d mask_el = simde_x_mm256_not_pd(matched); - mask = simde_mm256_or_pd(mask, mask_el); - - /* r = a - 0.5 */ - simde__m256d r = simde_mm256_sub_pd(a, simde_mm256_set1_pd(SIMDE_FLOAT64_C(0.5))); - - /* lo: q = a - * hi: q = (1.0 - a) */ - simde__m256d q = simde_mm256_and_pd(mask_lo, a); - q = simde_mm256_or_pd(q, simde_mm256_and_pd(mask_hi, simde_mm256_sub_pd(simde_mm256_set1_pd(1.0), a))); - - /* q = simde_math_sqrt(-2.0 * simde_math_log(q)) */ - q = simde_mm256_log_pd(q); - q = simde_mm256_mul_pd(q, simde_mm256_set1_pd(SIMDE_FLOAT64_C(-2.0))); - q = simde_mm256_sqrt_pd(q); - - /* el: q = r * r */ - q = simde_x_mm256_select_pd(q, simde_mm256_mul_pd(r, r), mask_el); - - /* lo: double numerator = ((((((c_c[0] * q + c_c[1]) * q + c_c[2]) * q + c_c[3]) * q + c_c[4]) * q + c_c[5]) * 1.0); */ - /* hi: double numerator = ((((((c_c[0] * q + c_c[1]) * q + c_c[2]) * q + c_c[3]) * q + c_c[4]) * q + c_c[5]) * -1.0); */ - /* el: double numerator = ((((((c_a[0] * q + c_a[1]) * q + c_a[2]) * q + c_a[3]) * q + c_a[4]) * q + c_a[5]) * r); */ - simde__m256d numerator = simde_x_mm256_select_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(-7.784894002430293e-03)), simde_mm256_set1_pd(SIMDE_FLOAT64_C(-3.969683028665376e+01)), mask_el); - numerator = simde_mm256_fmadd_pd(numerator, q, simde_x_mm256_select_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(-3.223964580411365e-01)), simde_mm256_set1_pd(SIMDE_FLOAT64_C( 2.209460984245205e+02)), mask_el)); - numerator = simde_mm256_fmadd_pd(numerator, q, simde_x_mm256_select_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(-2.400758277161838e+00)), simde_mm256_set1_pd(SIMDE_FLOAT64_C(-2.759285104469687e+02)), mask_el)); - numerator = simde_mm256_fmadd_pd(numerator, q, simde_x_mm256_select_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(-2.549732539343734e+00)), simde_mm256_set1_pd(SIMDE_FLOAT64_C( 1.383577518672690e+02)), mask_el)); - numerator = simde_mm256_fmadd_pd(numerator, q, 
simde_x_mm256_select_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C( 4.374664141464968e+00)), simde_mm256_set1_pd(SIMDE_FLOAT64_C(-3.066479806614716e+01)), mask_el)); - numerator = simde_mm256_fmadd_pd(numerator, q, simde_x_mm256_select_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C( 2.938163982698783e+00)), simde_mm256_set1_pd(SIMDE_FLOAT64_C( 2.506628277459239e+00)), mask_el)); - { - simde__m256d multiplier; - multiplier = simde_mm256_and_pd(mask_lo, simde_mm256_set1_pd(SIMDE_FLOAT64_C( 1.0))); - multiplier = simde_mm256_or_pd(multiplier, simde_mm256_and_pd(mask_hi, simde_mm256_set1_pd(SIMDE_FLOAT64_C(-1.0)))); - multiplier = simde_mm256_or_pd(multiplier, simde_mm256_and_pd(mask_el, r)); - numerator = simde_mm256_mul_pd(numerator, multiplier); - } - - /* lo/hi: double denominator = (((((c_d[0] * q + c_d[1]) * q + c_d[2]) * q + c_d[3]) * 1 + 0.0f) * q + 1); */ - /* el: double denominator = (((((c_b[0] * q + c_b[1]) * q + c_b[2]) * q + c_b[3]) * q + c_b[4]) * q + 1); */ - simde__m256d denominator = simde_x_mm256_select_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C( 7.784695709041462e-03)), simde_mm256_set1_pd(SIMDE_FLOAT64_C(-5.447609879822406e+01)), mask_el); - denominator = simde_mm256_fmadd_pd(denominator, q, simde_x_mm256_select_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C( 3.224671290700398e-01)), simde_mm256_set1_pd(SIMDE_FLOAT64_C( 1.615858368580409e+02)), mask_el)); - denominator = simde_mm256_fmadd_pd(denominator, q, simde_x_mm256_select_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C( 2.445134137142996e+00)), simde_mm256_set1_pd(SIMDE_FLOAT64_C(-1.556989798598866e+02)), mask_el)); - denominator = simde_mm256_fmadd_pd(denominator, q, simde_x_mm256_select_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C( 3.754408661907416e+00)), simde_mm256_set1_pd(SIMDE_FLOAT64_C( 6.680131188771972e+01)), mask_el)); - denominator = simde_mm256_fmadd_pd(denominator, simde_x_mm256_select_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C( 1.0)), q, mask_el), - simde_x_mm256_select_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C( 0.0)), simde_mm256_set1_pd(SIMDE_FLOAT64_C(-1.328068155288572e+01)), mask_el)); - denominator = simde_mm256_fmadd_pd(denominator, q, simde_mm256_set1_pd(SIMDE_FLOAT64_C(1.0))); - - /* res = numerator / denominator; */ - simde__m256d res = simde_mm256_div_pd(numerator, denominator); - - retval = simde_mm256_or_pd(retval, simde_mm256_and_pd(mask, res)); - } - - return retval; - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_cdfnorminv_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_cdfnorminv(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_cdfnorminv_pd - #define _mm256_cdfnorminv_pd(a) simde_mm256_cdfnorminv_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_cdfnorminv_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cdfnorminv_ps(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_cdfnorminv_ps(a_.m256[i]); - } - - return simde__m512_from_private(r_); - #else - - simde__m512 retval = simde_mm512_setzero_ps(); - simde__mmask16 matched; - - { /* if (a < 0 
|| a > 1) */ - matched = simde_mm512_cmp_ps_mask(a, simde_mm512_set1_ps(SIMDE_FLOAT32_C(0.0)), SIMDE_CMP_LT_OQ); - matched |= simde_mm512_cmp_ps_mask(a, simde_mm512_set1_ps(SIMDE_FLOAT32_C(1.0)), SIMDE_CMP_GT_OQ); - - /* We don't actually need to do anything here since we initialize - * retval to 0.0. */ - } - - { /* else if (a == 0) */ - simde__mmask16 mask = simde_mm512_cmp_ps_mask(a, simde_mm512_set1_ps(SIMDE_FLOAT32_C(0.0)), SIMDE_CMP_EQ_OQ); - matched |= mask; - - retval = simde_mm512_mask_mov_ps(retval, mask, simde_mm512_set1_ps(-SIMDE_MATH_INFINITYF)); - } - - { /* else if (a == 1) */ - simde__mmask16 mask = simde_mm512_cmp_ps_mask(a, simde_mm512_set1_ps(SIMDE_FLOAT32_C(0.0)), SIMDE_CMP_EQ_OQ); - matched |= mask; - - retval = simde_mm512_mask_mov_ps(retval, mask, simde_mm512_set1_ps(SIMDE_MATH_INFINITYF)); - } - - { /* else if (a < 0.02425) */ - simde__mmask16 mask_lo = simde_mm512_cmp_ps_mask(a, simde_mm512_set1_ps(SIMDE_FLOAT32_C(0.02425)), SIMDE_CMP_LT_OQ); - /* else if (a > 0.97575) */ - simde__mmask16 mask_hi = simde_mm512_cmp_ps_mask(a, simde_mm512_set1_ps(SIMDE_FLOAT32_C(0.97575)), SIMDE_CMP_GT_OQ); - - simde__mmask16 mask = mask_lo | mask_hi; - matched = matched | mask; - - /* else */ - simde__mmask16 mask_el = ~matched; - - /* r = a - 0.5f */ - simde__m512 r = simde_mm512_sub_ps(a, simde_mm512_set1_ps(SIMDE_FLOAT32_C(0.5))); - - /* lo: q = a - * hi: q = (1.0 - a) */ - simde__m512 q = simde_mm512_maskz_mov_ps(mask_lo, a); - q = simde_mm512_mask_sub_ps(q, mask_hi, simde_mm512_set1_ps(SIMDE_FLOAT32_C(1.0)), a); - - /* q = simde_math_sqrtf(-2.0f * simde_math_logf(q)) */ - q = simde_mm512_log_ps(q); - q = simde_mm512_mul_ps(q, simde_mm512_set1_ps(SIMDE_FLOAT32_C(-2.0))); - q = simde_mm512_sqrt_ps(q); - - /* el: q = r * r */ - q = simde_mm512_mask_mul_ps(q, mask_el, r, r); - - /* lo: float numerator = ((((((c_c[0] * q + c_c[1]) * q + c_c[2]) * q + c_c[3]) * q + c_c[4]) * q + c_c[5]) * 1.0f); */ - /* hi: float numerator = ((((((c_c[0] * q + c_c[1]) * q + c_c[2]) * q + c_c[3]) * q + c_c[4]) * q + c_c[5]) * -1.0f); */ - /* el: float numerator = ((((((c_a[0] * q + c_a[1]) * q + c_a[2]) * q + c_a[3]) * q + c_a[4]) * q + c_a[5]) * r); */ - simde__m512 numerator = simde_mm512_mask_mov_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C(-7.784894002430293e-03)), mask_el, simde_mm512_set1_ps(SIMDE_FLOAT32_C(-3.969683028665376e+01))); - numerator = simde_mm512_fmadd_ps(numerator, q, simde_mm512_mask_mov_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C(-3.223964580411365e-01)), mask_el, simde_mm512_set1_ps(SIMDE_FLOAT32_C( 2.209460984245205e+02)))); - numerator = simde_mm512_fmadd_ps(numerator, q, simde_mm512_mask_mov_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C(-2.400758277161838e+00)), mask_el, simde_mm512_set1_ps(SIMDE_FLOAT32_C(-2.759285104469687e+02)))); - numerator = simde_mm512_fmadd_ps(numerator, q, simde_mm512_mask_mov_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C(-2.549732539343734e+00)), mask_el, simde_mm512_set1_ps(SIMDE_FLOAT32_C( 1.383577518672690e+02)))); - numerator = simde_mm512_fmadd_ps(numerator, q, simde_mm512_mask_mov_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C( 4.374664141464968e+00)), mask_el, simde_mm512_set1_ps(SIMDE_FLOAT32_C(-3.066479806614716e+01)))); - numerator = simde_mm512_fmadd_ps(numerator, q, simde_mm512_mask_mov_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C( 2.938163982698783e+00)), mask_el, simde_mm512_set1_ps(SIMDE_FLOAT32_C( 2.506628277459239e+00)))); - { - simde__m512 multiplier; - multiplier = simde_mm512_set1_ps(SIMDE_FLOAT32_C( 1.0)); - multiplier = simde_mm512_mask_mov_ps(multiplier, mask_hi, 
simde_mm512_set1_ps(SIMDE_FLOAT32_C(-1.0))); - multiplier = simde_mm512_mask_mov_ps(multiplier, mask_el, r); - numerator = simde_mm512_mul_ps(numerator, multiplier); - } - - /* lo/hi: float denominator = (((((c_d[0] * q + c_d[1]) * q + c_d[2]) * q + c_d[3]) * 1 + 0.0f) * q + 1); */ - /* el: float denominator = (((((c_b[0] * q + c_b[1]) * q + c_b[2]) * q + c_b[3]) * q + c_b[4]) * q + 1); */ - simde__m512 denominator = simde_mm512_mask_mov_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C( 7.784695709041462e-03)), mask_el, simde_mm512_set1_ps(SIMDE_FLOAT32_C(-5.447609879822406e+01))); - denominator = simde_mm512_fmadd_ps(denominator, q, simde_mm512_mask_mov_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C( 3.224671290700398e-01)), mask_el, simde_mm512_set1_ps(SIMDE_FLOAT32_C( 1.615858368580409e+02)))); - denominator = simde_mm512_fmadd_ps(denominator, q, simde_mm512_mask_mov_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C( 2.445134137142996e+00)), mask_el, simde_mm512_set1_ps(SIMDE_FLOAT32_C(-1.556989798598866e+02)))); - denominator = simde_mm512_fmadd_ps(denominator, q, simde_mm512_mask_mov_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C( 3.754408661907416e+00)), mask_el, simde_mm512_set1_ps(SIMDE_FLOAT32_C( 6.680131188771972e+01)))); - denominator = simde_mm512_fmadd_ps(denominator, simde_mm512_mask_mov_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C( 1.0)), mask_el, q), - simde_mm512_mask_mov_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C( 0.0)), mask_el, simde_mm512_set1_ps(SIMDE_FLOAT32_C(-1.328068155288572e+01)))); - denominator = simde_mm512_fmadd_ps(denominator, q, simde_mm512_set1_ps(SIMDE_FLOAT32_C(1.0))); - - /* res = numerator / denominator; */ - retval = simde_mm512_mask_div_ps(retval, mask_lo | mask_hi | mask_el, numerator, denominator); - } - - return retval; - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_cdfnorminv_ps - #define _mm512_cdfnorminv_ps(a) simde_mm512_cdfnorminv_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_cdfnorminv_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_cdfnorminv_pd(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_cdfnorminv_pd(a_.m256d[i]); - } - - return simde__m512d_from_private(r_); - #else - - simde__m512d retval = simde_mm512_setzero_pd(); - simde__mmask8 matched; - - { /* if (a < 0 || a > 1) */ - matched = simde_mm512_cmp_pd_mask(a, simde_mm512_set1_pd(SIMDE_FLOAT64_C(0.0)), SIMDE_CMP_LT_OQ); - matched |= simde_mm512_cmp_pd_mask(a, simde_mm512_set1_pd(SIMDE_FLOAT64_C(1.0)), SIMDE_CMP_GT_OQ); - - /* We don't actually need to do anything here since we initialize - * retval to 0.0. 
*/ - } - - { /* else if (a == 0) */ - simde__mmask8 mask = simde_mm512_cmp_pd_mask(a, simde_mm512_set1_pd(SIMDE_FLOAT64_C(0.0)), SIMDE_CMP_EQ_OQ); - matched |= mask; - - retval = simde_mm512_mask_mov_pd(retval, mask, simde_mm512_set1_pd(-SIMDE_MATH_INFINITY)); - } - - { /* else if (a == 1) */ - simde__mmask8 mask = simde_mm512_cmp_pd_mask(a, simde_mm512_set1_pd(SIMDE_FLOAT64_C(0.0)), SIMDE_CMP_EQ_OQ); - matched |= mask; - - retval = simde_mm512_mask_mov_pd(retval, mask, simde_mm512_set1_pd(SIMDE_MATH_INFINITY)); - } - - { /* else if (a < 0.02425) */ - simde__mmask8 mask_lo = simde_mm512_cmp_pd_mask(a, simde_mm512_set1_pd(SIMDE_FLOAT64_C(0.02425)), SIMDE_CMP_LT_OQ); - /* else if (a > 0.97575) */ - simde__mmask8 mask_hi = simde_mm512_cmp_pd_mask(a, simde_mm512_set1_pd(SIMDE_FLOAT64_C(0.97575)), SIMDE_CMP_GT_OQ); - - simde__mmask8 mask = mask_lo | mask_hi; - matched = matched | mask; - - /* else */ - simde__mmask8 mask_el = ~matched; - - /* r = a - 0.5f */ - simde__m512d r = simde_mm512_sub_pd(a, simde_mm512_set1_pd(SIMDE_FLOAT64_C(0.5))); - - /* lo: q = a - * hi: q = (1.0 - a) */ - simde__m512d q = a; - q = simde_mm512_mask_sub_pd(q, mask_hi, simde_mm512_set1_pd(SIMDE_FLOAT64_C(1.0)), a); - - /* q = simde_math_sqrtf(-2.0f * simde_math_logf(q)) */ - q = simde_mm512_log_pd(q); - q = simde_mm512_mul_pd(q, simde_mm512_set1_pd(SIMDE_FLOAT64_C(-2.0))); - q = simde_mm512_sqrt_pd(q); - - /* el: q = r * r */ - q = simde_mm512_mask_mul_pd(q, mask_el, r, r); - - /* lo: float numerator = ((((((c_c[0] * q + c_c[1]) * q + c_c[2]) * q + c_c[3]) * q + c_c[4]) * q + c_c[5]) * 1.0f); */ - /* hi: float numerator = ((((((c_c[0] * q + c_c[1]) * q + c_c[2]) * q + c_c[3]) * q + c_c[4]) * q + c_c[5]) * -1.0f); */ - /* el: float numerator = ((((((c_a[0] * q + c_a[1]) * q + c_a[2]) * q + c_a[3]) * q + c_a[4]) * q + c_a[5]) * r); */ - simde__m512d numerator = simde_mm512_mask_mov_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C(-7.784894002430293e-03)), mask_el, simde_mm512_set1_pd(SIMDE_FLOAT64_C(-3.969683028665376e+01))); - numerator = simde_mm512_fmadd_pd(numerator, q, simde_mm512_mask_mov_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C(-3.223964580411365e-01)), mask_el, simde_mm512_set1_pd(SIMDE_FLOAT64_C( 2.209460984245205e+02)))); - numerator = simde_mm512_fmadd_pd(numerator, q, simde_mm512_mask_mov_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C(-2.400758277161838e+00)), mask_el, simde_mm512_set1_pd(SIMDE_FLOAT64_C(-2.759285104469687e+02)))); - numerator = simde_mm512_fmadd_pd(numerator, q, simde_mm512_mask_mov_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C(-2.549732539343734e+00)), mask_el, simde_mm512_set1_pd(SIMDE_FLOAT64_C( 1.383577518672690e+02)))); - numerator = simde_mm512_fmadd_pd(numerator, q, simde_mm512_mask_mov_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C( 4.374664141464968e+00)), mask_el, simde_mm512_set1_pd(SIMDE_FLOAT64_C(-3.066479806614716e+01)))); - numerator = simde_mm512_fmadd_pd(numerator, q, simde_mm512_mask_mov_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C( 2.938163982698783e+00)), mask_el, simde_mm512_set1_pd(SIMDE_FLOAT64_C( 2.506628277459239e+00)))); - { - simde__m512d multiplier; - multiplier = simde_mm512_set1_pd(SIMDE_FLOAT64_C( 1.0)); - multiplier = simde_mm512_mask_mov_pd(multiplier, mask_hi, simde_mm512_set1_pd(SIMDE_FLOAT64_C(-1.0))); - multiplier = simde_mm512_mask_mov_pd(multiplier, mask_el, r); - numerator = simde_mm512_mul_pd(numerator, multiplier); - } - - /* lo/hi: float denominator = (((((c_d[0] * q + c_d[1]) * q + c_d[2]) * q + c_d[3]) * 1 + 0.0f) * q + 1); */ - /* el: float denominator = (((((c_b[0] * q + c_b[1]) * q + 
c_b[2]) * q + c_b[3]) * q + c_b[4]) * q + 1); */ - simde__m512d denominator = simde_mm512_mask_mov_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C( 7.784695709041462e-03)), mask_el, simde_mm512_set1_pd(SIMDE_FLOAT64_C(-5.447609879822406e+01))); - denominator = simde_mm512_fmadd_pd(denominator, q, simde_mm512_mask_mov_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C( 3.224671290700398e-01)), mask_el, simde_mm512_set1_pd(SIMDE_FLOAT64_C( 1.615858368580409e+02)))); - denominator = simde_mm512_fmadd_pd(denominator, q, simde_mm512_mask_mov_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C( 2.445134137142996e+00)), mask_el, simde_mm512_set1_pd(SIMDE_FLOAT64_C(-1.556989798598866e+02)))); - denominator = simde_mm512_fmadd_pd(denominator, q, simde_mm512_mask_mov_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C( 3.754408661907416e+00)), mask_el, simde_mm512_set1_pd(SIMDE_FLOAT64_C( 6.680131188771972e+01)))); - denominator = simde_mm512_fmadd_pd(denominator, simde_mm512_mask_mov_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C( 1.0)), mask_el, q), - simde_mm512_mask_mov_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C( 0.0)), mask_el, simde_mm512_set1_pd(SIMDE_FLOAT64_C(-1.328068155288572e+01)))); - denominator = simde_mm512_fmadd_pd(denominator, q, simde_mm512_set1_pd(SIMDE_FLOAT64_C(1.0))); - - /* res = numerator / denominator; */ - retval = simde_mm512_mask_div_pd(retval, mask_lo | mask_hi | mask_el, numerator, denominator); - } - - return retval; - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_cdfnorminv_pd - #define _mm512_cdfnorminv_pd(a) simde_mm512_cdfnorminv_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_cdfnorminv_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cdfnorminv_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_cdfnorminv_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cdfnorminv_ps - #define _mm512_mask_cdfnorminv_ps(src, k, a) simde_mm512_mask_cdfnorminv_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_cdfnorminv_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_cdfnorminv_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_cdfnorminv_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_cdfnorminv_pd - #define _mm512_mask_cdfnorminv_pd(src, k, a) simde_mm512_mask_cdfnorminv_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_erfinv_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_erfinv_ps(a); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - /* https://stackoverflow.com/questions/27229371/inverse-error-function-in-c */ - simde__m128 one = simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0)); - - simde__m128 lnx = simde_mm_log_ps(simde_mm_mul_ps(simde_mm_sub_ps(one, a), simde_mm_add_ps(one, a))); - - simde__m128 tt1 = simde_mm_mul_ps(simde_mm_set1_ps(HEDLEY_STATIC_CAST(simde_float32, SIMDE_MATH_PI)), simde_mm_set1_ps(SIMDE_FLOAT32_C(0.147))); - tt1 = simde_mm_div_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(2.0)), tt1); - tt1 = simde_mm_add_ps(tt1, simde_mm_mul_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(0.5)), lnx)); - - simde__m128 tt2 = simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0) / SIMDE_FLOAT32_C(0.147)); - tt2 = simde_mm_mul_ps(tt2, lnx); - - simde__m128 r = simde_mm_mul_ps(tt1, 
tt1); - r = simde_mm_sub_ps(r, tt2); - r = simde_mm_sqrt_ps(r); - r = simde_mm_add_ps(simde_x_mm_negate_ps(tt1), r); - r = simde_mm_sqrt_ps(r); - - return simde_x_mm_xorsign_ps(r, a); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_erfinvf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_erfinv_ps - #define _mm_erfinv_ps(a) simde_mm_erfinv_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_erfinv_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_erfinv_pd(a); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - simde__m128d one = simde_mm_set1_pd(SIMDE_FLOAT64_C(1.0)); - - simde__m128d lnx = simde_mm_log_pd(simde_mm_mul_pd(simde_mm_sub_pd(one, a), simde_mm_add_pd(one, a))); - - simde__m128d tt1 = simde_mm_mul_pd(simde_mm_set1_pd(SIMDE_MATH_PI), simde_mm_set1_pd(SIMDE_FLOAT64_C(0.147))); - tt1 = simde_mm_div_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C(2.0)), tt1); - tt1 = simde_mm_add_pd(tt1, simde_mm_mul_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C(0.5)), lnx)); - - simde__m128d tt2 = simde_mm_set1_pd(SIMDE_FLOAT64_C(1.0) / SIMDE_FLOAT64_C(0.147)); - tt2 = simde_mm_mul_pd(tt2, lnx); - - simde__m128d r = simde_mm_mul_pd(tt1, tt1); - r = simde_mm_sub_pd(r, tt2); - r = simde_mm_sqrt_pd(r); - r = simde_mm_add_pd(simde_x_mm_negate_pd(tt1), r); - r = simde_mm_sqrt_pd(r); - - return simde_x_mm_xorsign_pd(r, a); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_erfinv(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_erfinv_pd - #define _mm_erfinv_pd(a) simde_mm_erfinv_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_erfinv_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_erfinv_ps(a); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - simde__m256 one = simde_mm256_set1_ps(SIMDE_FLOAT32_C(1.0)); - simde__m256 sgn = simde_x_mm256_copysign_ps(one, a); - - a = simde_mm256_mul_ps(simde_mm256_sub_ps(one, a), simde_mm256_add_ps(one, a)); - simde__m256 lnx = simde_mm256_log_ps(a); - - simde__m256 tt1 = simde_mm256_mul_ps(simde_mm256_set1_ps(HEDLEY_STATIC_CAST(simde_float32, SIMDE_MATH_PI)), simde_mm256_set1_ps(SIMDE_FLOAT32_C(0.147))); - tt1 = simde_mm256_div_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C(2.0)), tt1); - tt1 = simde_mm256_add_ps(tt1, simde_mm256_mul_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C(0.5)), lnx)); - - simde__m256 tt2 = simde_mm256_set1_ps(SIMDE_FLOAT32_C(1.0) / SIMDE_FLOAT32_C(0.147)); - tt2 = simde_mm256_mul_ps(tt2, lnx); - - simde__m256 r = simde_mm256_mul_ps(tt1, tt1); - r = simde_mm256_sub_ps(r, tt2); - r = simde_mm256_sqrt_ps(r); - r = simde_mm256_add_ps(simde_x_mm256_negate_ps(tt1), r); - r = simde_mm256_sqrt_ps(r); - - return simde_mm256_mul_ps(sgn, r); - #else - simde__m256_private - a_ = simde__m256_to_private(a), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_erfinvf(a_.f32[i]); - } - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_erfinv_ps - #define _mm256_erfinv_ps(a) 
simde_mm256_erfinv_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_erfinv_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_erfinv_pd(a); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - simde__m256d one = simde_mm256_set1_pd(SIMDE_FLOAT64_C(1.0)); - simde__m256d sgn = simde_x_mm256_copysign_pd(one, a); - - a = simde_mm256_mul_pd(simde_mm256_sub_pd(one, a), simde_mm256_add_pd(one, a)); - simde__m256d lnx = simde_mm256_log_pd(a); - - simde__m256d tt1 = simde_mm256_mul_pd(simde_mm256_set1_pd(SIMDE_MATH_PI), simde_mm256_set1_pd(SIMDE_FLOAT64_C(0.147))); - tt1 = simde_mm256_div_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(2.0)), tt1); - tt1 = simde_mm256_add_pd(tt1, simde_mm256_mul_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(0.5)), lnx)); - - simde__m256d tt2 = simde_mm256_set1_pd(SIMDE_FLOAT64_C(1.0) / SIMDE_FLOAT64_C(0.147)); - tt2 = simde_mm256_mul_pd(tt2, lnx); - - simde__m256d r = simde_mm256_mul_pd(tt1, tt1); - r = simde_mm256_sub_pd(r, tt2); - r = simde_mm256_sqrt_pd(r); - r = simde_mm256_add_pd(simde_x_mm256_negate_pd(tt1), r); - r = simde_mm256_sqrt_pd(r); - - return simde_mm256_mul_pd(sgn, r); - #else - simde__m256d_private - a_ = simde__m256d_to_private(a), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_erfinv(a_.f64[i]); - } - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_erfinv_pd - #define _mm256_erfinv_pd(a) simde_mm256_erfinv_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_erfinv_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_erfinv_ps(a); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - simde__m512 one = simde_mm512_set1_ps(SIMDE_FLOAT32_C(1.0)); - simde__m512 sgn = simde_x_mm512_copysign_ps(one, a); - - a = simde_mm512_mul_ps(simde_mm512_sub_ps(one, a), simde_mm512_add_ps(one, a)); - simde__m512 lnx = simde_mm512_log_ps(a); - - simde__m512 tt1 = simde_mm512_mul_ps(simde_mm512_set1_ps(HEDLEY_STATIC_CAST(simde_float32, SIMDE_MATH_PI)), simde_mm512_set1_ps(SIMDE_FLOAT32_C(0.147))); - tt1 = simde_mm512_div_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C(2.0)), tt1); - tt1 = simde_mm512_add_ps(tt1, simde_mm512_mul_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C(0.5)), lnx)); - - simde__m512 tt2 = simde_mm512_set1_ps(SIMDE_FLOAT32_C(1.0) / SIMDE_FLOAT32_C(0.147)); - tt2 = simde_mm512_mul_ps(tt2, lnx); - - simde__m512 r = simde_mm512_mul_ps(tt1, tt1); - r = simde_mm512_sub_ps(r, tt2); - r = simde_mm512_sqrt_ps(r); - r = simde_mm512_add_ps(simde_x_mm512_negate_ps(tt1), r); - r = simde_mm512_sqrt_ps(r); - - return simde_mm512_mul_ps(sgn, r); - #else - simde__m512_private - a_ = simde__m512_to_private(a), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_erfinvf(a_.f32[i]); - } - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_erfinv_ps - #define _mm512_erfinv_ps(a) simde_mm512_erfinv_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_erfinv_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_erfinv_pd(a); - #elif (SIMDE_NATURAL_VECTOR_SIZE > 0) - simde__m512d one = simde_mm512_set1_pd(SIMDE_FLOAT64_C(1.0)); - simde__m512d sgn = simde_x_mm512_copysign_pd(one, a); - - a = 
simde_mm512_mul_pd(simde_mm512_sub_pd(one, a), simde_mm512_add_pd(one, a)); - simde__m512d lnx = simde_mm512_log_pd(a); - - simde__m512d tt1 = simde_mm512_mul_pd(simde_mm512_set1_pd(SIMDE_MATH_PI), simde_mm512_set1_pd(SIMDE_FLOAT64_C(0.147))); - tt1 = simde_mm512_div_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C(2.0)), tt1); - tt1 = simde_mm512_add_pd(tt1, simde_mm512_mul_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C(0.5)), lnx)); - - simde__m512d tt2 = simde_mm512_set1_pd(SIMDE_FLOAT64_C(1.0) / SIMDE_FLOAT64_C(0.147)); - tt2 = simde_mm512_mul_pd(tt2, lnx); - - simde__m512d r = simde_mm512_mul_pd(tt1, tt1); - r = simde_mm512_sub_pd(r, tt2); - r = simde_mm512_sqrt_pd(r); - r = simde_mm512_add_pd(simde_x_mm512_negate_pd(tt1), r); - r = simde_mm512_sqrt_pd(r); - - return simde_mm512_mul_pd(sgn, r); - #else - simde__m512d_private - a_ = simde__m512d_to_private(a), - r_; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_erfinv(a_.f64[i]); - } - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_erfinv_pd - #define _mm512_erfinv_pd(a) simde_mm512_erfinv_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_erfinv_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_erfinv_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_erfinv_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_erfinv_ps - #define _mm512_mask_erfinv_ps(src, k, a) simde_mm512_mask_erfinv_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_erfinv_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_erfinv_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_erfinv_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_erfinv_pd - #define _mm512_mask_erfinv_pd(src, k, a) simde_mm512_mask_erfinv_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_erfcinv_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_erfcinv_ps(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) - simde__m128 matched, retval = simde_mm_setzero_ps(); - - { /* if (a < 2.0f && a > 0.0625f) */ - matched = simde_mm_cmplt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(2.0))); - matched = simde_mm_and_ps(matched, simde_mm_cmpgt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.0625)))); - - if (!simde_mm_test_all_zeros(simde_mm_castps_si128(matched), simde_x_mm_setone_si128())) { - retval = simde_mm_erfinv_ps(simde_mm_sub_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0)), a)); - } - - if (simde_mm_test_all_ones(simde_mm_castps_si128(matched))) { - return retval; - } - } - - { /* else if (a < 0.0625f && a > 0.0f) */ - simde__m128 mask = simde_mm_cmplt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.0625))); - mask = simde_mm_and_ps(mask, simde_mm_cmpgt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.0)))); - mask = simde_mm_andnot_ps(matched, mask); - - if (!simde_mm_test_all_zeros(simde_mm_castps_si128(mask), simde_x_mm_setone_si128())) { - matched = simde_mm_or_ps(matched, mask); - - /* t = 1/(sqrt(-log(a))) */ - simde__m128 t = simde_x_mm_negate_ps(simde_mm_log_ps(a)); - t = simde_mm_sqrt_ps(t); - t = 
simde_mm_div_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0)), t); - - const simde__m128 p[] = { - simde_mm_set1_ps(SIMDE_FLOAT32_C( 0.1550470003116)), - simde_mm_set1_ps(SIMDE_FLOAT32_C( 1.382719649631)), - simde_mm_set1_ps(SIMDE_FLOAT32_C( 0.690969348887)), - simde_mm_set1_ps(SIMDE_FLOAT32_C(-1.128081391617)), - simde_mm_set1_ps(SIMDE_FLOAT32_C( 0.680544246825)), - simde_mm_set1_ps(SIMDE_FLOAT32_C(-0.164441567910)) - }; - - const simde__m128 q[] = { - simde_mm_set1_ps(SIMDE_FLOAT32_C( 0.155024849822)), - simde_mm_set1_ps(SIMDE_FLOAT32_C( 1.385228141995)), - simde_mm_set1_ps(SIMDE_FLOAT32_C( 1.000000000000)) - }; - - /* float numerator = p[0] / t + p[1] + t * (p[2] + t * (p[3] + t * (p[4] + t * p[5])))) */ - simde__m128 numerator = simde_mm_fmadd_ps(p[5], t, p[4]); - numerator = simde_mm_fmadd_ps(numerator, t, p[3]); - numerator = simde_mm_fmadd_ps(numerator, t, p[2]); - numerator = simde_mm_fmadd_ps(numerator, t, p[1]); - numerator = simde_mm_add_ps(numerator, simde_mm_div_ps(p[0], t)); - - /* float denominator = (q[0] + t * (q[1] + t * (q[2]))) */ - simde__m128 denominator = simde_mm_fmadd_ps(q[2], t, q[1]); - denominator = simde_mm_fmadd_ps(denominator, t, q[0]); - - simde__m128 res = simde_mm_div_ps(numerator, denominator); - - retval = simde_mm_or_ps(retval, simde_mm_and_ps(mask, res)); - } - } - - { /* else if (a < 0.0f) */ - simde__m128 mask = simde_mm_cmplt_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.0))); - mask = simde_mm_andnot_ps(matched, mask); - - if (!simde_mm_test_all_zeros(simde_mm_castps_si128(mask), simde_x_mm_setone_si128())) { - matched = simde_mm_or_ps(matched, mask); - - /* t = 1/(sqrt(-log(a))) */ - simde__m128 t = simde_x_mm_negate_ps(simde_mm_log_ps(a)); - t = simde_mm_sqrt_ps(t); - t = simde_mm_div_ps(simde_mm_set1_ps(SIMDE_FLOAT32_C(1.0)), t); - - const simde__m128 p[] = { - simde_mm_set1_ps(SIMDE_FLOAT32_C( 0.00980456202915)), - simde_mm_set1_ps(SIMDE_FLOAT32_C( 0.36366788917100)), - simde_mm_set1_ps(SIMDE_FLOAT32_C( 0.97302949837000)), - simde_mm_set1_ps(SIMDE_FLOAT32_C( -0.5374947401000)) - }; - - const simde__m128 q[] = { - simde_mm_set1_ps(SIMDE_FLOAT32_C( 0.00980451277802)), - simde_mm_set1_ps(SIMDE_FLOAT32_C( 0.36369997154400)), - simde_mm_set1_ps(SIMDE_FLOAT32_C( 1.00000000000000)) - }; - - /* float numerator = (p[0] / t + p[1] + t * (p[2] + t * p[3])) */ - simde__m128 numerator = simde_mm_fmadd_ps(p[3], t, p[2]); - numerator = simde_mm_fmadd_ps(numerator, t, p[1]); - numerator = simde_mm_add_ps(numerator, simde_mm_div_ps(p[0], t)); - - /* float denominator = (q[0] + t * (q[1] + t * (q[2]))) */ - simde__m128 denominator = simde_mm_fmadd_ps(q[2], t, q[1]); - denominator = simde_mm_fmadd_ps(denominator, t, q[0]); - - simde__m128 res = simde_mm_div_ps(numerator, denominator); - - retval = simde_mm_or_ps(retval, simde_mm_and_ps(mask, res)); - - if (simde_mm_test_all_ones(simde_mm_castps_si128(matched))) { - return retval; - } - } - } - - { /* else if (a == 0.0f) */ - simde__m128 mask = simde_mm_cmpeq_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(0.0))); - mask = simde_mm_andnot_ps(matched, mask); - matched = simde_mm_or_ps(matched, mask); - - simde__m128 res = simde_mm_set1_ps(SIMDE_MATH_INFINITYF); - - retval = simde_mm_or_ps(retval, simde_mm_and_ps(mask, res)); - } - - { /* else */ - /* (a >= 2.0f) */ - retval = simde_mm_or_ps(retval, simde_mm_andnot_ps(matched, simde_mm_set1_ps(-SIMDE_MATH_INFINITYF))); - } - - return retval; - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / 
sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_erfcinvf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_erfcinv_ps - #define _mm_erfcinv_ps(a) simde_mm_erfcinv_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_erfcinv_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_erfcinv_pd(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(128) - simde__m128d matched, retval = simde_mm_setzero_pd(); - - { /* if (a < 2.0 && a > 0.0625) */ - matched = simde_mm_cmplt_pd(a, simde_mm_set1_pd(SIMDE_FLOAT64_C(2.0))); - matched = simde_mm_and_pd(matched, simde_mm_cmpgt_pd(a, simde_mm_set1_pd(SIMDE_FLOAT64_C(0.0625)))); - - if (!simde_mm_test_all_zeros(simde_mm_castpd_si128(matched), simde_x_mm_setone_si128())) { - retval = simde_mm_erfinv_pd(simde_mm_sub_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C(1.0)), a)); - } - - if (simde_mm_test_all_ones(simde_mm_castpd_si128(matched))) { - return retval; - } - } - - { /* else if (a < 0.0625 && a > 0.0) */ - simde__m128d mask = simde_mm_cmplt_pd(a, simde_mm_set1_pd(SIMDE_FLOAT64_C(0.0625))); - mask = simde_mm_and_pd(mask, simde_mm_cmpgt_pd(a, simde_mm_set1_pd(SIMDE_FLOAT64_C(0.0)))); - mask = simde_mm_andnot_pd(matched, mask); - - if (!simde_mm_test_all_zeros(simde_mm_castpd_si128(mask), simde_x_mm_setone_si128())) { - matched = simde_mm_or_pd(matched, mask); - - /* t = 1/(sqrt(-log(a))) */ - simde__m128d t = simde_x_mm_negate_pd(simde_mm_log_pd(a)); - t = simde_mm_sqrt_pd(t); - t = simde_mm_div_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C(1.0)), t); - - const simde__m128d p[] = { - simde_mm_set1_pd(SIMDE_FLOAT64_C( 0.1550470003116)), - simde_mm_set1_pd(SIMDE_FLOAT64_C( 1.382719649631)), - simde_mm_set1_pd(SIMDE_FLOAT64_C( 0.690969348887)), - simde_mm_set1_pd(SIMDE_FLOAT64_C(-1.128081391617)), - simde_mm_set1_pd(SIMDE_FLOAT64_C( 0.680544246825)), - simde_mm_set1_pd(SIMDE_FLOAT64_C(-0.164441567910)) - }; - - const simde__m128d q[] = { - simde_mm_set1_pd(SIMDE_FLOAT64_C( 0.155024849822)), - simde_mm_set1_pd(SIMDE_FLOAT64_C( 1.385228141995)), - simde_mm_set1_pd(SIMDE_FLOAT64_C( 1.000000000000)) - }; - - /* float numerator = p[0] / t + p[1] + t * (p[2] + t * (p[3] + t * (p[4] + t * p[5])))) */ - simde__m128d numerator = simde_mm_fmadd_pd(p[5], t, p[4]); - numerator = simde_mm_fmadd_pd(numerator, t, p[3]); - numerator = simde_mm_fmadd_pd(numerator, t, p[2]); - numerator = simde_mm_fmadd_pd(numerator, t, p[1]); - numerator = simde_mm_add_pd(numerator, simde_mm_div_pd(p[0], t)); - - /* float denominator = (q[0] + t * (q[1] + t * (q[2]))) */ - simde__m128d denominator = simde_mm_fmadd_pd(q[2], t, q[1]); - denominator = simde_mm_fmadd_pd(denominator, t, q[0]); - - simde__m128d res = simde_mm_div_pd(numerator, denominator); - - retval = simde_mm_or_pd(retval, simde_mm_and_pd(mask, res)); - } - } - - { /* else if (a < 0.0) */ - simde__m128d mask = simde_mm_cmplt_pd(a, simde_mm_set1_pd(SIMDE_FLOAT64_C(0.0))); - mask = simde_mm_andnot_pd(matched, mask); - - if (!simde_mm_test_all_zeros(simde_mm_castpd_si128(mask), simde_x_mm_setone_si128())) { - matched = simde_mm_or_pd(matched, mask); - - /* t = 1/(sqrt(-log(a))) */ - simde__m128d t = simde_x_mm_negate_pd(simde_mm_log_pd(a)); - t = simde_mm_sqrt_pd(t); - t = simde_mm_div_pd(simde_mm_set1_pd(SIMDE_FLOAT64_C(1.0)), t); - - const simde__m128d p[] = { - simde_mm_set1_pd(SIMDE_FLOAT64_C( 0.00980456202915)), - simde_mm_set1_pd(SIMDE_FLOAT64_C( 0.36366788917100)), - simde_mm_set1_pd(SIMDE_FLOAT64_C( 
0.97302949837000)), - simde_mm_set1_pd(SIMDE_FLOAT64_C( -0.5374947401000)) - }; - - const simde__m128d q[] = { - simde_mm_set1_pd(SIMDE_FLOAT64_C( 0.00980451277802)), - simde_mm_set1_pd(SIMDE_FLOAT64_C( 0.36369997154400)), - simde_mm_set1_pd(SIMDE_FLOAT64_C( 1.00000000000000)) - }; - - /* float numerator = (p[0] / t + p[1] + t * (p[2] + t * p[3])) */ - simde__m128d numerator = simde_mm_fmadd_pd(p[3], t, p[2]); - numerator = simde_mm_fmadd_pd(numerator, t, p[1]); - numerator = simde_mm_add_pd(numerator, simde_mm_div_pd(p[0], t)); - - /* float denominator = (q[0] + t * (q[1] + t * (q[2]))) */ - simde__m128d denominator = simde_mm_fmadd_pd(q[2], t, q[1]); - denominator = simde_mm_fmadd_pd(denominator, t, q[0]); - - simde__m128d res = simde_mm_div_pd(numerator, denominator); - - retval = simde_mm_or_pd(retval, simde_mm_and_pd(mask, res)); - - if (simde_mm_test_all_ones(simde_mm_castpd_si128(matched))) { - return retval; - } - } - } - - { /* else if (a == 0.0) */ - simde__m128d mask = simde_mm_cmpeq_pd(a, simde_mm_set1_pd(SIMDE_FLOAT64_C(0.0))); - mask = simde_mm_andnot_pd(matched, mask); - matched = simde_mm_or_pd(matched, mask); - - simde__m128d res = simde_mm_set1_pd(SIMDE_MATH_INFINITY); - - retval = simde_mm_or_pd(retval, simde_mm_and_pd(mask, res)); - } - - { /* else */ - /* (a >= 2.0) */ - retval = simde_mm_or_pd(retval, simde_mm_andnot_pd(matched, simde_mm_set1_pd(-SIMDE_MATH_INFINITY))); - } - - return retval; - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_erfcinv(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_erfcinv_pd - #define _mm_erfcinv_pd(a) simde_mm_erfcinv_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_erfcinv_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_erfcinv_ps(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(256) - simde__m256 matched, retval = simde_mm256_setzero_ps(); - - { /* if (a < 2.0f && a > 0.0625f) */ - matched = simde_mm256_cmp_ps(a, simde_mm256_set1_ps(SIMDE_FLOAT32_C(2.0)), SIMDE_CMP_LT_OQ); - matched = simde_mm256_and_ps(matched, simde_mm256_cmp_ps(a, simde_mm256_set1_ps(SIMDE_FLOAT32_C(0.0625)), SIMDE_CMP_GT_OQ)); - - if (!simde_mm256_testz_ps(matched, matched)) { - retval = simde_mm256_erfinv_ps(simde_mm256_sub_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C(1.0)), a)); - } - - if (simde_x_mm256_test_all_ones(simde_mm256_castps_si256(matched))) { - return retval; - } - } - - { /* else if (a < 0.0625f && a > 0.0f) */ - simde__m256 mask = simde_mm256_cmp_ps(a, simde_mm256_set1_ps(SIMDE_FLOAT32_C(0.0625)), SIMDE_CMP_LT_OQ); - mask = simde_mm256_and_ps(mask, simde_mm256_cmp_ps(a, simde_mm256_set1_ps(SIMDE_FLOAT32_C(0.0)), SIMDE_CMP_GT_OQ)); - mask = simde_mm256_andnot_ps(matched, mask); - - if (!simde_mm256_testz_ps(mask, mask)) { - matched = simde_mm256_or_ps(matched, mask); - - /* t = 1/(sqrt(-log(a))) */ - simde__m256 t = simde_x_mm256_negate_ps(simde_mm256_log_ps(a)); - t = simde_mm256_sqrt_ps(t); - t = simde_mm256_div_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C(1.0)), t); - - const simde__m256 p[] = { - simde_mm256_set1_ps(SIMDE_FLOAT32_C( 0.1550470003116)), - simde_mm256_set1_ps(SIMDE_FLOAT32_C( 1.382719649631)), - simde_mm256_set1_ps(SIMDE_FLOAT32_C( 0.690969348887)), - simde_mm256_set1_ps(SIMDE_FLOAT32_C(-1.128081391617)), - simde_mm256_set1_ps(SIMDE_FLOAT32_C( 
0.680544246825)), - simde_mm256_set1_ps(SIMDE_FLOAT32_C(-0.16444156791)) - }; - - const simde__m256 q[] = { - simde_mm256_set1_ps(SIMDE_FLOAT32_C( 0.155024849822)), - simde_mm256_set1_ps(SIMDE_FLOAT32_C( 1.385228141995)), - simde_mm256_set1_ps(SIMDE_FLOAT32_C( 1.000000000000)) - }; - - /* float numerator = p[0] / t + p[1] + t * (p[2] + t * (p[3] + t * (p[4] + t * p[5])))) */ - simde__m256 numerator = simde_mm256_fmadd_ps(p[5], t, p[4]); - numerator = simde_mm256_fmadd_ps(numerator, t, p[3]); - numerator = simde_mm256_fmadd_ps(numerator, t, p[2]); - numerator = simde_mm256_fmadd_ps(numerator, t, p[1]); - numerator = simde_mm256_add_ps(numerator, simde_mm256_div_ps(p[0], t)); - - /* float denominator = (q[0] + t * (q[1] + t * (q[2]))) */ - simde__m256 denominator = simde_mm256_fmadd_ps(q[2], t, q[1]); - denominator = simde_mm256_fmadd_ps(denominator, t, q[0]); - - simde__m256 res = simde_mm256_div_ps(numerator, denominator); - - retval = simde_mm256_or_ps(retval, simde_mm256_and_ps(mask, res)); - } - } - - { /* else if (a < 0.0f) */ - simde__m256 mask = simde_mm256_cmp_ps(a, simde_mm256_set1_ps(SIMDE_FLOAT32_C(0.0)), SIMDE_CMP_LT_OQ); - mask = simde_mm256_andnot_ps(matched, mask); - - if (!simde_mm256_testz_ps(mask, mask)) { - matched = simde_mm256_or_ps(matched, mask); - - /* t = 1/(sqrt(-log(a))) */ - simde__m256 t = simde_x_mm256_negate_ps(simde_mm256_log_ps(a)); - t = simde_mm256_sqrt_ps(t); - t = simde_mm256_div_ps(simde_mm256_set1_ps(SIMDE_FLOAT32_C(1.0)), t); - - const simde__m256 p[] = { - simde_mm256_set1_ps(SIMDE_FLOAT32_C( 0.00980456202915)), - simde_mm256_set1_ps(SIMDE_FLOAT32_C( 0.36366788917100)), - simde_mm256_set1_ps(SIMDE_FLOAT32_C( 0.97302949837000)), - simde_mm256_set1_ps(SIMDE_FLOAT32_C(-0.5374947401000)) - }; - - const simde__m256 q[] = { - simde_mm256_set1_ps(SIMDE_FLOAT32_C( 0.00980451277802)), - simde_mm256_set1_ps(SIMDE_FLOAT32_C( 0.36369997154400)), - simde_mm256_set1_ps(SIMDE_FLOAT32_C( 1.00000000000000)) - }; - - /* float numerator = (p[0] / t + p[1] + t * (p[2] + t * p[3])) */ - simde__m256 numerator = simde_mm256_fmadd_ps(p[3], t, p[2]); - numerator = simde_mm256_fmadd_ps(numerator, t, p[1]); - numerator = simde_mm256_add_ps(numerator, simde_mm256_div_ps(p[0], t)); - - /* float denominator = (q[0] + t * (q[1] + t * (q[2]))) */ - simde__m256 denominator = simde_mm256_fmadd_ps(q[2], t, q[1]); - denominator = simde_mm256_fmadd_ps(denominator, t, q[0]); - - simde__m256 res = simde_mm256_div_ps(numerator, denominator); - - retval = simde_mm256_or_ps(retval, simde_mm256_and_ps(mask, res)); - - if (simde_x_mm256_test_all_ones(simde_mm256_castps_si256(matched))) { - return retval; - } - } - } - - { /* else if (a == 0.0f) */ - simde__m256 mask = simde_mm256_cmp_ps(a, simde_mm256_set1_ps(SIMDE_FLOAT32_C(0.0)), SIMDE_CMP_EQ_OQ); - mask = simde_mm256_andnot_ps(matched, mask); - matched = simde_mm256_or_ps(matched, mask); - - simde__m256 res = simde_mm256_set1_ps(SIMDE_MATH_INFINITYF); - - retval = simde_mm256_or_ps(retval, simde_mm256_and_ps(mask, res)); - } - - { /* else */ - /* (a >= 2.0f) */ - retval = simde_mm256_or_ps(retval, simde_mm256_andnot_ps(matched, simde_mm256_set1_ps(-SIMDE_MATH_INFINITYF))); - } - - return retval; - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_erfcinv_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = 
simde_math_erfcinvf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_erfcinv_ps - #define _mm256_erfcinv_ps(a) simde_mm256_erfcinv_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_erfcinv_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_erfcinv_pd(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_GE(256) - simde__m256d matched, retval = simde_mm256_setzero_pd(); - - { /* if (a < 2.0 && a > 0.0625) */ - matched = simde_mm256_cmp_pd(a, simde_mm256_set1_pd(SIMDE_FLOAT64_C(2.0)), SIMDE_CMP_LT_OQ); - matched = simde_mm256_and_pd(matched, simde_mm256_cmp_pd(a, simde_mm256_set1_pd(SIMDE_FLOAT64_C(0.0625)), SIMDE_CMP_GT_OQ)); - - if (!simde_mm256_testz_pd(matched, matched)) { - retval = simde_mm256_erfinv_pd(simde_mm256_sub_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(1.0)), a)); - } - - if (simde_x_mm256_test_all_ones(simde_mm256_castpd_si256(matched))) { - return retval; - } - } - - { /* else if (a < 0.0625 && a > 0.0) */ - simde__m256d mask = simde_mm256_cmp_pd(a, simde_mm256_set1_pd(SIMDE_FLOAT64_C(0.0625)), SIMDE_CMP_LT_OQ); - mask = simde_mm256_and_pd(mask, simde_mm256_cmp_pd(a, simde_mm256_set1_pd(SIMDE_FLOAT64_C(0.0)), SIMDE_CMP_GT_OQ)); - mask = simde_mm256_andnot_pd(matched, mask); - - if (!simde_mm256_testz_pd(mask, mask)) { - matched = simde_mm256_or_pd(matched, mask); - - /* t = 1/(sqrt(-log(a))) */ - simde__m256d t = simde_x_mm256_negate_pd(simde_mm256_log_pd(a)); - t = simde_mm256_sqrt_pd(t); - t = simde_mm256_div_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(1.0)), t); - - const simde__m256d p[] = { - simde_mm256_set1_pd(SIMDE_FLOAT64_C( 0.1550470003116)), - simde_mm256_set1_pd(SIMDE_FLOAT64_C( 1.382719649631)), - simde_mm256_set1_pd(SIMDE_FLOAT64_C( 0.690969348887)), - simde_mm256_set1_pd(SIMDE_FLOAT64_C(-1.128081391617)), - simde_mm256_set1_pd(SIMDE_FLOAT64_C( 0.680544246825)), - simde_mm256_set1_pd(SIMDE_FLOAT64_C(-0.16444156791)) - }; - - const simde__m256d q[] = { - simde_mm256_set1_pd(SIMDE_FLOAT64_C( 0.155024849822)), - simde_mm256_set1_pd(SIMDE_FLOAT64_C( 1.385228141995)), - simde_mm256_set1_pd(SIMDE_FLOAT64_C( 1.000000000000)) - }; - - /* float numerator = p[0] / t + p[1] + t * (p[2] + t * (p[3] + t * (p[4] + t * p[5])))) */ - simde__m256d numerator = simde_mm256_fmadd_pd(p[5], t, p[4]); - numerator = simde_mm256_fmadd_pd(numerator, t, p[3]); - numerator = simde_mm256_fmadd_pd(numerator, t, p[2]); - numerator = simde_mm256_fmadd_pd(numerator, t, p[1]); - numerator = simde_mm256_add_pd(numerator, simde_mm256_div_pd(p[0], t)); - - /* float denominator = (q[0] + t * (q[1] + t * (q[2]))) */ - simde__m256d denominator = simde_mm256_fmadd_pd(q[2], t, q[1]); - denominator = simde_mm256_fmadd_pd(denominator, t, q[0]); - - simde__m256d res = simde_mm256_div_pd(numerator, denominator); - - retval = simde_mm256_or_pd(retval, simde_mm256_and_pd(mask, res)); - } - } - - { /* else if (a < 0.0) */ - simde__m256d mask = simde_mm256_cmp_pd(a, simde_mm256_set1_pd(SIMDE_FLOAT64_C(0.0)), SIMDE_CMP_LT_OQ); - mask = simde_mm256_andnot_pd(matched, mask); - - if (!simde_mm256_testz_pd(mask, mask)) { - matched = simde_mm256_or_pd(matched, mask); - - /* t = 1/(sqrt(-log(a))) */ - simde__m256d t = simde_x_mm256_negate_pd(simde_mm256_log_pd(a)); - t = simde_mm256_sqrt_pd(t); - t = simde_mm256_div_pd(simde_mm256_set1_pd(SIMDE_FLOAT64_C(1.0)), t); - - const simde__m256d p[] = { - simde_mm256_set1_pd(SIMDE_FLOAT64_C( 0.00980456202915)), - 
simde_mm256_set1_pd(SIMDE_FLOAT64_C( 0.36366788917100)), - simde_mm256_set1_pd(SIMDE_FLOAT64_C( 0.97302949837000)), - simde_mm256_set1_pd(SIMDE_FLOAT64_C(-0.5374947401000)) - }; - - const simde__m256d q[] = { - simde_mm256_set1_pd(SIMDE_FLOAT64_C( 0.00980451277802)), - simde_mm256_set1_pd(SIMDE_FLOAT64_C( 0.36369997154400)), - simde_mm256_set1_pd(SIMDE_FLOAT64_C( 1.00000000000000)) - }; - - /* float numerator = (p[0] / t + p[1] + t * (p[2] + t * p[3])) */ - simde__m256d numerator = simde_mm256_fmadd_pd(p[3], t, p[2]); - numerator = simde_mm256_fmadd_pd(numerator, t, p[1]); - numerator = simde_mm256_add_pd(numerator, simde_mm256_div_pd(p[0], t)); - - /* float denominator = (q[0] + t * (q[1] + t * (q[2]))) */ - simde__m256d denominator = simde_mm256_fmadd_pd(q[2], t, q[1]); - denominator = simde_mm256_fmadd_pd(denominator, t, q[0]); - - simde__m256d res = simde_mm256_div_pd(numerator, denominator); - - retval = simde_mm256_or_pd(retval, simde_mm256_and_pd(mask, res)); - - if (simde_x_mm256_test_all_ones(simde_mm256_castpd_si256(matched))) { - return retval; - } - } - } - - { /* else if (a == 0.0) */ - simde__m256d mask = simde_mm256_cmp_pd(a, simde_mm256_set1_pd(SIMDE_FLOAT64_C(0.0)), SIMDE_CMP_EQ_OQ); - mask = simde_mm256_andnot_pd(matched, mask); - matched = simde_mm256_or_pd(matched, mask); - - simde__m256d res = simde_mm256_set1_pd(SIMDE_MATH_INFINITY); - - retval = simde_mm256_or_pd(retval, simde_mm256_and_pd(mask, res)); - } - - { /* else */ - /* (a >= 2.0) */ - retval = simde_mm256_or_pd(retval, simde_mm256_andnot_pd(matched, simde_mm256_set1_pd(-SIMDE_MATH_INFINITY))); - } - - return retval; - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_erfcinv_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_erfcinv(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_erfcinv_pd - #define _mm256_erfcinv_pd(a) simde_mm256_erfcinv_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_erfcinv_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_erfcinv_ps(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) && (!defined(SIMDE_ARCH_ARM) || defined(SIMDE_ARCH_AARCH64)) - /* The results on Arm are *slightly* off, which causes problems for - * the edge cases; for example, if you pass 2.0 sqrt will be called - * with a value of -0.0 instead of 0.0, resulting in a NaN. 
*/ - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_erfcinv_ps(a_.m256[i]); - } - return simde__m512_from_private(r_); - #else - simde__m512 retval = simde_mm512_setzero_ps(); - simde__mmask16 matched; - - { /* if (a < 2.0f && a > 0.0625f) */ - matched = simde_mm512_cmp_ps_mask(a, simde_mm512_set1_ps(SIMDE_FLOAT32_C(2.0)), SIMDE_CMP_LT_OQ); - matched &= simde_mm512_cmp_ps_mask(a, simde_mm512_set1_ps(SIMDE_FLOAT32_C(0.0625)), SIMDE_CMP_GT_OQ); - - if (matched != 0) { - retval = simde_mm512_erfinv_ps(simde_mm512_sub_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C(1.0)), a)); - } - - if (matched == 1) { - return retval; - } - } - - { /* else if (a < 0.0625f && a > 0.0f) */ - simde__mmask16 mask = simde_mm512_cmp_ps_mask(a, simde_mm512_set1_ps(SIMDE_FLOAT32_C(0.0625)), SIMDE_CMP_LT_OQ); - mask &= simde_mm512_cmp_ps_mask(a, simde_mm512_set1_ps(SIMDE_FLOAT32_C(0.0)), SIMDE_CMP_GT_OQ); - mask = ~matched & mask; - - if (mask != 0) { - matched = matched | mask; - - /* t = 1/(sqrt(-log(a))) */ - simde__m512 t = simde_x_mm512_negate_ps(simde_mm512_log_ps(a)); - t = simde_mm512_sqrt_ps(t); - t = simde_mm512_div_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C(1.0)), t); - - const simde__m512 p[] = { - simde_mm512_set1_ps(SIMDE_FLOAT32_C( 0.1550470003116)), - simde_mm512_set1_ps(SIMDE_FLOAT32_C( 1.382719649631)), - simde_mm512_set1_ps(SIMDE_FLOAT32_C( 0.690969348887)), - simde_mm512_set1_ps(SIMDE_FLOAT32_C(-1.128081391617)), - simde_mm512_set1_ps(SIMDE_FLOAT32_C( 0.680544246825)), - simde_mm512_set1_ps(SIMDE_FLOAT32_C(-0.16444156791)) - }; - - const simde__m512 q[] = { - simde_mm512_set1_ps(SIMDE_FLOAT32_C( 0.155024849822)), - simde_mm512_set1_ps(SIMDE_FLOAT32_C( 1.385228141995)), - simde_mm512_set1_ps(SIMDE_FLOAT32_C( 1.000000000000)) - }; - - /* float numerator = p[0] / t + p[1] + t * (p[2] + t * (p[3] + t * (p[4] + t * p[5])))) */ - simde__m512 numerator = simde_mm512_fmadd_ps(p[5], t, p[4]); - numerator = simde_mm512_fmadd_ps(numerator, t, p[3]); - numerator = simde_mm512_fmadd_ps(numerator, t, p[2]); - numerator = simde_mm512_fmadd_ps(numerator, t, p[1]); - numerator = simde_mm512_add_ps(numerator, simde_mm512_div_ps(p[0], t)); - - /* float denominator = (q[0] + t * (q[1] + t * (q[2]))) */ - simde__m512 denominator = simde_mm512_fmadd_ps(q[2], t, q[1]); - denominator = simde_mm512_fmadd_ps(denominator, t, q[0]); - - simde__m512 res = simde_mm512_div_ps(numerator, denominator); - - retval = simde_mm512_or_ps(retval, simde_mm512_maskz_mov_ps(mask, res)); - } - } - - { /* else if (a < 0.0f) */ - simde__mmask16 mask = simde_mm512_cmp_ps_mask(a, simde_mm512_set1_ps(SIMDE_FLOAT32_C(0.0)), SIMDE_CMP_LT_OQ); - mask = ~matched & mask; - - if (mask != 0) { - matched = matched | mask; - - /* t = 1/(sqrt(-log(a))) */ - simde__m512 t = simde_x_mm512_negate_ps(simde_mm512_log_ps(a)); - t = simde_mm512_sqrt_ps(t); - t = simde_mm512_div_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C(1.0)), t); - - const simde__m512 p[] = { - simde_mm512_set1_ps(SIMDE_FLOAT32_C( 0.00980456202915)), - simde_mm512_set1_ps(SIMDE_FLOAT32_C( 0.36366788917100)), - simde_mm512_set1_ps(SIMDE_FLOAT32_C( 0.97302949837000)), - simde_mm512_set1_ps(SIMDE_FLOAT32_C( -0.5374947401000)) - }; - - const simde__m512 q[] = { - simde_mm512_set1_ps(SIMDE_FLOAT32_C( 0.00980451277802)), - simde_mm512_set1_ps(SIMDE_FLOAT32_C( 0.36369997154400)), - simde_mm512_set1_ps(SIMDE_FLOAT32_C( 1.00000000000000)) - }; - - /* float numerator = (p[0] / t + p[1] + t * (p[2] + t * 
p[3])) */ - simde__m512 numerator = simde_mm512_fmadd_ps(p[3], t, p[2]); - numerator = simde_mm512_fmadd_ps(numerator, t, p[1]); - numerator = simde_mm512_add_ps(numerator, simde_mm512_div_ps(p[0], t)); - - /* float denominator = (q[0] + t * (q[1] + t * (q[2]))) */ - simde__m512 denominator = simde_mm512_fmadd_ps(q[2], t, q[1]); - denominator = simde_mm512_fmadd_ps(denominator, t, q[0]); - - simde__m512 res = simde_mm512_div_ps(numerator, denominator); - - retval = simde_mm512_or_ps(retval, simde_mm512_maskz_mov_ps(mask, res)); - - if (matched == 1) { - return retval; - } - } - } - - { /* else if (a == 0.0f) */ - simde__mmask16 mask = simde_mm512_cmp_ps_mask(a, simde_mm512_set1_ps(SIMDE_FLOAT32_C(0.0)), SIMDE_CMP_EQ_OQ); - mask = ~matched & mask; - matched = matched | mask; - - simde__m512 res = simde_mm512_set1_ps(SIMDE_MATH_INFINITYF); - - retval = simde_mm512_or_ps(retval, simde_mm512_maskz_mov_ps(mask, res)); - } - - { /* else */ - /* (a >= 2.0f) */ - retval = simde_mm512_or_ps(retval, simde_mm512_maskz_mov_ps(~matched, simde_mm512_set1_ps(-SIMDE_MATH_INFINITYF))); - } - - return retval; - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_erfcinv_ps - #define _mm512_erfcinv_ps(a) simde_mm512_erfcinv_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_erfcinv_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_erfcinv_pd(a); - #elif SIMDE_NATURAL_VECTOR_SIZE_LE(256) - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_erfcinv_pd(a_.m256d[i]); - } - return simde__m512d_from_private(r_); - #else - simde__m512d retval = simde_mm512_setzero_pd(); - simde__mmask8 matched; - - { /* if (a < 2.0f && a > 0.0625f) */ - matched = simde_mm512_cmp_pd_mask(a, simde_mm512_set1_pd(SIMDE_FLOAT64_C(2.0)), SIMDE_CMP_LT_OQ); - matched &= simde_mm512_cmp_pd_mask(a, simde_mm512_set1_pd(SIMDE_FLOAT64_C(0.0625)), SIMDE_CMP_GT_OQ); - - if (matched != 0) { - retval = simde_mm512_erfinv_pd(simde_mm512_sub_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C(1.0)), a)); - } - - if (matched == 1) { - return retval; - } - } - - { /* else if (a < 0.0625f && a > 0.0f) */ - simde__mmask8 mask = simde_mm512_cmp_pd_mask(a, simde_mm512_set1_pd(SIMDE_FLOAT64_C(0.0625)), SIMDE_CMP_LT_OQ); - mask &= simde_mm512_cmp_pd_mask(a, simde_mm512_set1_pd(SIMDE_FLOAT64_C(0.0)), SIMDE_CMP_GT_OQ); - mask = ~matched & mask; - - if (mask != 0) { - matched = matched | mask; - - /* t = 1/(sqrt(-log(a))) */ - simde__m512d t = simde_x_mm512_negate_pd(simde_mm512_log_pd(a)); - t = simde_mm512_sqrt_pd(t); - t = simde_mm512_div_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C(1.0)), t); - - const simde__m512d p[] = { - simde_mm512_set1_pd(SIMDE_FLOAT64_C( 0.1550470003116)), - simde_mm512_set1_pd(SIMDE_FLOAT64_C( 1.382719649631)), - simde_mm512_set1_pd(SIMDE_FLOAT64_C( 0.690969348887)), - simde_mm512_set1_pd(SIMDE_FLOAT64_C(-1.128081391617)), - simde_mm512_set1_pd(SIMDE_FLOAT64_C( 0.680544246825)), - simde_mm512_set1_pd(SIMDE_FLOAT64_C(-0.16444156791)) - }; - - const simde__m512d q[] = { - simde_mm512_set1_pd(SIMDE_FLOAT64_C( 0.155024849822)), - simde_mm512_set1_pd(SIMDE_FLOAT64_C( 1.385228141995)), - simde_mm512_set1_pd(SIMDE_FLOAT64_C( 1.000000000000)) - }; - - /* float numerator = p[0] / t + p[1] + t * (p[2] + t * (p[3] + t * (p[4] + t * p[5])))) */ - simde__m512d numerator = simde_mm512_fmadd_pd(p[5], t, p[4]); - numerator = 
simde_mm512_fmadd_pd(numerator, t, p[3]); - numerator = simde_mm512_fmadd_pd(numerator, t, p[2]); - numerator = simde_mm512_fmadd_pd(numerator, t, p[1]); - numerator = simde_mm512_add_pd(numerator, simde_mm512_div_pd(p[0], t)); - - /* float denominator = (q[0] + t * (q[1] + t * (q[2]))) */ - simde__m512d denominator = simde_mm512_fmadd_pd(q[2], t, q[1]); - denominator = simde_mm512_fmadd_pd(denominator, t, q[0]); - - simde__m512d res = simde_mm512_div_pd(numerator, denominator); - - retval = simde_mm512_or_pd(retval, simde_mm512_maskz_mov_pd(mask, res)); - } - } - - { /* else if (a < 0.0f) */ - simde__mmask8 mask = simde_mm512_cmp_pd_mask(a, simde_mm512_set1_pd(SIMDE_FLOAT64_C(0.0)), SIMDE_CMP_LT_OQ); - mask = ~matched & mask; - - if (mask != 0) { - matched = matched | mask; - - /* t = 1/(sqrt(-log(a))) */ - simde__m512d t = simde_x_mm512_negate_pd(simde_mm512_log_pd(a)); - t = simde_mm512_sqrt_pd(t); - t = simde_mm512_div_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C(1.0)), t); - - const simde__m512d p[] = { - simde_mm512_set1_pd(SIMDE_FLOAT64_C( 0.00980456202915)), - simde_mm512_set1_pd(SIMDE_FLOAT64_C( 0.36366788917100)), - simde_mm512_set1_pd(SIMDE_FLOAT64_C( 0.97302949837000)), - simde_mm512_set1_pd(SIMDE_FLOAT64_C( -0.5374947401000)) - }; - - const simde__m512d q[] = { - simde_mm512_set1_pd(SIMDE_FLOAT64_C( 0.00980451277802)), - simde_mm512_set1_pd(SIMDE_FLOAT64_C( 0.36369997154400)), - simde_mm512_set1_pd(SIMDE_FLOAT64_C( 1.00000000000000)) - }; - - /* float numerator = (p[0] / t + p[1] + t * (p[2] + t * p[3])) */ - simde__m512d numerator = simde_mm512_fmadd_pd(p[3], t, p[2]); - numerator = simde_mm512_fmadd_pd(numerator, t, p[1]); - numerator = simde_mm512_add_pd(numerator, simde_mm512_div_pd(p[0], t)); - - /* float denominator = (q[0] + t * (q[1] + t * (q[2]))) */ - simde__m512d denominator = simde_mm512_fmadd_pd(q[2], t, q[1]); - denominator = simde_mm512_fmadd_pd(denominator, t, q[0]); - - simde__m512d res = simde_mm512_div_pd(numerator, denominator); - - retval = simde_mm512_or_pd(retval, simde_mm512_maskz_mov_pd(mask, res)); - - if (matched == 1) { - return retval; - } - } - } - - { /* else if (a == 0.0f) */ - simde__mmask8 mask = simde_mm512_cmp_pd_mask(a, simde_mm512_set1_pd(SIMDE_FLOAT64_C(0.0)), SIMDE_CMP_EQ_OQ); - mask = ~matched & mask; - matched = matched | mask; - - simde__m512d res = simde_mm512_set1_pd(SIMDE_MATH_INFINITY); - - retval = simde_mm512_or_pd(retval, simde_mm512_maskz_mov_pd(mask, res)); - } - - { /* else */ - /* (a >= 2.0f) */ - retval = simde_mm512_or_pd(retval, simde_mm512_maskz_mov_pd(~matched, simde_mm512_set1_pd(-SIMDE_MATH_INFINITY))); - } - - return retval; - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_erfcinv_pd - #define _mm512_erfcinv_pd(a) simde_mm512_erfcinv_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_erfcinv_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_erfcinv_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_erfcinv_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_erfcinv_ps - #define _mm512_mask_erfcinv_ps(src, k, a) simde_mm512_mask_erfcinv_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_erfcinv_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_erfcinv_pd(src, k, a); - #else - 
return simde_mm512_mask_mov_pd(src, k, simde_mm512_erfcinv_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_erfcinv_pd - #define _mm512_mask_erfcinv_pd(src, k, a) simde_mm512_mask_erfcinv_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_logb_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_logb_ps(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_logbf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_logb_ps - #define _mm_logb_ps(a) simde_mm_logb_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_logb_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_logb_pd(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_logb(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_logb_pd - #define _mm_logb_pd(a) simde_mm_logb_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_logb_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_logb_ps(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_logb_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_logbf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_logb_ps - #define _mm256_logb_ps(a) simde_mm256_logb_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_logb_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_logb_pd(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_logb_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_logb(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_logb_pd - #define _mm256_logb_pd(a) simde_mm256_logb_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_logb_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_logb_ps(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_logb_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_logbf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if 
defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_logb_ps - #define _mm512_logb_ps(a) simde_mm512_logb_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_logb_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_logb_pd(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_logb_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_logb(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_logb_pd - #define _mm512_logb_pd(a) simde_mm512_logb_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_logb_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_logb_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_logb_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_logb_ps - #define _mm512_mask_logb_ps(src, k, a) simde_mm512_mask_logb_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_logb_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_logb_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_logb_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_logb_pd - #define _mm512_mask_logb_pd(src, k, a) simde_mm512_mask_logb_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_log2_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_log2_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_MATH_SLEEF_VERSION_CHECK(3,4,0) && (SIMDE_ACCURACY_PREFERENCE <= 1) - return Sleef_log2f4_u35(a); - #else - return Sleef_log2f4_u10(a); - #endif - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_log2f(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_log2_ps - #define _mm_log2_ps(a) simde_mm_log2_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_log2_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_log2_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_MATH_SLEEF_VERSION_CHECK(3,4,0) && (SIMDE_ACCURACY_PREFERENCE <= 1) - return Sleef_log2d2_u35(a); - #else - return Sleef_log2d2_u10(a); - #endif - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_log2(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_log2_pd - #define _mm_log2_pd(a) simde_mm_log2_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_log2_ps (simde__m256 a) { - #if 
defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_log2_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_MATH_SLEEF_VERSION_CHECK(3,4,0) && (SIMDE_ACCURACY_PREFERENCE <= 1) - return Sleef_log2f8_u35(a); - #else - return Sleef_log2f8_u10(a); - #endif - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_log2_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_log2f(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_log2_ps - #define _mm256_log2_ps(a) simde_mm256_log2_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_log2_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_log2_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_MATH_SLEEF_VERSION_CHECK(3,4,0) && (SIMDE_ACCURACY_PREFERENCE <= 1) - return Sleef_log2d4_u35(a); - #else - return Sleef_log2d4_u10(a); - #endif - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_log2_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_log2(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_log2_pd - #define _mm256_log2_pd(a) simde_mm256_log2_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_log2_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_log2_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_MATH_SLEEF_VERSION_CHECK(3,4,0) && (SIMDE_ACCURACY_PREFERENCE <= 1) - return Sleef_log2f16_u35(a); - #else - return Sleef_log2f16_u10(a); - #endif - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_log2_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_log2f(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_log2_ps - #define _mm512_log2_ps(a) simde_mm512_log2_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_log2_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_log2_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_MATH_SLEEF_VERSION_CHECK(3,4,0) && (SIMDE_ACCURACY_PREFERENCE <= 1) - return Sleef_log2d8_u35(a); - #else - return Sleef_log2d8_u10(a); - #endif - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_log2_pd(a_.m256d[i]); 
- } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_log2(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_log2_pd - #define _mm512_log2_pd(a) simde_mm512_log2_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_log2_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_log2_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_log2_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_log2_ps - #define _mm512_mask_log2_ps(src, k, a) simde_mm512_mask_log2_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_log2_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_log2_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_log2_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_log2_pd - #define _mm512_mask_log2_pd(src, k, a) simde_mm512_mask_log2_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_log1p_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_log1p_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_log1pf4_u10(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_log1pf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_log1p_ps - #define _mm_log1p_ps(a) simde_mm_log1p_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_log1p_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_log1p_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_log1pd2_u10(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_log1p(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_log1p_pd - #define _mm_log1p_pd(a) simde_mm_log1p_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_log1p_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_log1p_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_log1pf8_u10(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_log1p_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_log1pf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_log1p_ps - #define _mm256_log1p_ps(a) simde_mm256_log1p_ps(a) -#endif - - 
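For context, the SVML polyfills removed in this hunk fall back to a per-lane scalar libm call whenever neither native SVML nor Sleef is available, and the vectorized _mm*_erfinv_* paths above all evaluate the same Winitzki-style approximation with a ≈ 0.147. A minimal scalar sketch of that formula, assuming only <math.h>; the helper name is illustrative:

    #include <math.h>

    /* Scalar form of what the removed simde erfinv polyfills compute per lane:
     * sign(x) * sqrt(sqrt(t1^2 - t2) - t1), where
     * t1 = 2/(pi*a) + 0.5*ln(1 - x^2), t2 = ln(1 - x^2)/a, a = 0.147. */
    static float erfinv_approx(float x) {
      const float a   = 0.147f;
      const float lnx = logf((1.0f - x) * (1.0f + x));          /* ln(1 - x^2) */
      const float t1  = 2.0f / (3.14159265358979f * a) + 0.5f * lnx;
      const float t2  = lnx / a;
      return copysignf(sqrtf(sqrtf(t1 * t1 - t2) - t1), x);
    }
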
-SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_log1p_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_log1p_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_log1pd4_u10(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_log1p_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_log1p(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_log1p_pd - #define _mm256_log1p_pd(a) simde_mm256_log1p_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_log1p_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_log1p_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_log1pf16_u10(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_log1p_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_log1pf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_log1p_ps - #define _mm512_log1p_ps(a) simde_mm512_log1p_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_log1p_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_log1p_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_log1pd8_u10(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_log1p_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_log1p(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_log1p_pd - #define _mm512_log1p_pd(a) simde_mm512_log1p_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_log1p_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_log1p_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_log1p_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_log1p_ps - #define _mm512_mask_log1p_ps(src, k, a) simde_mm512_mask_log1p_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_log1p_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_log1p_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_log1p_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_log1p_pd - #define 
_mm512_mask_log1p_pd(src, k, a) simde_mm512_mask_log1p_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_log10_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_log10_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_log10f4_u10(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_log10f(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_log10_ps - #define _mm_log10_ps(a) simde_mm_log10_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_log10_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_log10_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_log10d2_u10(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_log10(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_log10_pd - #define _mm_log10_pd(a) simde_mm_log10_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_log10_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_log10_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_log10f8_u10(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_log10_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_log10f(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_log10_ps - #define _mm256_log10_ps(a) simde_mm256_log10_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_log10_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_log10_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_log10d4_u10(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_log10_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_log10(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_log10_pd - #define _mm256_log10_pd(a) simde_mm256_log10_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_log10_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_log10_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_log10f16_u10(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if 
SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_log10_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_log10f(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_log10_ps - #define _mm512_log10_ps(a) simde_mm512_log10_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_log10_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_log10_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_log10d8_u10(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_log10_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_log10(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_log10_pd - #define _mm512_log10_pd(a) simde_mm512_log10_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_log10_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_log10_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_log10_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_log10_ps - #define _mm512_mask_log10_ps(src, k, a) simde_mm512_mask_log10_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_log10_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_log10_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_log10_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_log10_pd - #define _mm512_mask_log10_pd(src, k, a) simde_mm512_mask_log10_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_nearbyint_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_nearbyint_ps(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_nearbyintf(a_.f32[i]); - } - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_nearbyint_ps - #define _mm512_nearbyint_ps(a) simde_mm512_nearbyint_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_nearbyint_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_nearbyint_pd(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_nearbyint(a_.f64[i]); - } - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_nearbyint_pd - #define 
_mm512_nearbyint_pd(a) simde_mm512_nearbyint_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_nearbyint_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_nearbyint_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_nearbyint_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_nearbyint_ps - #define _mm512_mask_nearbyint_ps(src, k, a) simde_mm512_mask_nearbyint_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_nearbyint_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_nearbyint_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_nearbyint_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_nearbyint_pd - #define _mm512_mask_nearbyint_pd(src, k, a) simde_mm512_mask_nearbyint_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_pow_ps (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_pow_ps(a, b); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_powf4_u10(a, b); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_powf(a_.f32[i], b_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_pow_ps - #define _mm_pow_ps(a, b) simde_mm_pow_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_pow_pd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_pow_pd(a, b); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_powd2_u10(a, b); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_pow(a_.f64[i], b_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_pow_pd - #define _mm_pow_pd(a, b) simde_mm_pow_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_pow_ps (simde__m256 a, simde__m256 b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_pow_ps(a, b); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_powf8_u10(a, b); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_powf(a_.f32[i], b_.f32[i]); - } - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_pow_ps - #define _mm256_pow_ps(a, b) simde_mm256_pow_ps(a, b) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_pow_pd (simde__m256d a, simde__m256d b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_pow_pd(a, b); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && 
defined(SIMDE_X86_AVX_NATIVE) - return Sleef_powd4_u10(a, b); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_pow(a_.f64[i], b_.f64[i]); - } - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_pow_pd - #define _mm256_pow_pd(a, b) simde_mm256_pow_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_pow_ps (simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_pow_ps(a, b); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_powf16_u10(a, b); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a), - b_ = simde__m512_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_powf(a_.f32[i], b_.f32[i]); - } - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_pow_ps - #define _mm512_pow_ps(a, b) simde_mm512_pow_ps(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_pow_pd (simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_pow_pd(a, b); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_powd8_u10(a, b); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a), - b_ = simde__m512d_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_pow(a_.f64[i], b_.f64[i]); - } - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_pow_pd - #define _mm512_pow_pd(a, b) simde_mm512_pow_pd(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_pow_ps(simde__m512 src, simde__mmask16 k, simde__m512 a, simde__m512 b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_pow_ps(src, k, a, b); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_pow_ps(a, b)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_pow_ps - #define _mm512_mask_pow_ps(src, k, a, b) simde_mm512_mask_pow_ps(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_pow_pd(simde__m512d src, simde__mmask8 k, simde__m512d a, simde__m512d b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_pow_pd(src, k, a, b); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_pow_pd(a, b)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_pow_pd - #define _mm512_mask_pow_pd(src, k, a, b) simde_mm512_mask_pow_pd(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_clog_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_clog_ps(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - simde__m128_private pow_res_ = simde__m128_to_private(simde_mm_pow_ps(a, simde_mm_set1_ps(SIMDE_FLOAT32_C(2.0)))); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) { - r_.f32[ i ] = 
simde_math_logf(simde_math_sqrtf(pow_res_.f32[i] + pow_res_.f32[i+1])); - r_.f32[i + 1] = simde_math_atan2f(a_.f32[i + 1], a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_clog_ps - #define _mm_clog_ps(a) simde_mm_clog_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_clog_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_clog_ps(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - simde__m256_private pow_res_ = simde__m256_to_private(simde_mm256_pow_ps(a, simde_mm256_set1_ps(SIMDE_FLOAT32_C(2.0)))); - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i += 2) { - r_.f32[ i ] = simde_math_logf(simde_math_sqrtf(pow_res_.f32[i] + pow_res_.f32[i + 1])); - r_.f32[i + 1] = simde_math_atan2f(a_.f32[i + 1], a_.f32[i]); - } - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_clog_ps - #define _mm256_clog_ps(a) simde_mm256_clog_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_csqrt_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_csqrt_ps(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - simde__m128 pow_res= simde_mm_pow_ps(a,simde_mm_set1_ps(SIMDE_FLOAT32_C(2.0))); - simde__m128_private pow_res_=simde__m128_to_private(pow_res); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i+=2) { - simde_float32 sign = simde_math_copysignf(SIMDE_FLOAT32_C(1.0), a_.f32[i + 1]); - simde_float32 temp = simde_math_sqrtf(pow_res_.f32[i] + pow_res_.f32[i+1]); - - r_.f32[ i ] = simde_math_sqrtf(( a_.f32[i] + temp) / SIMDE_FLOAT32_C(2.0)); - r_.f32[i + 1] = sign * simde_math_sqrtf((-a_.f32[i] + temp) / SIMDE_FLOAT32_C(2.0)); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_csqrt_ps - #define _mm_csqrt_ps(a) simde_mm_csqrt_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_csqrt_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_csqrt_ps(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - simde__m256 pow_res= simde_mm256_pow_ps(a,simde_mm256_set1_ps(SIMDE_FLOAT32_C(2.0))); - simde__m256_private pow_res_=simde__m256_to_private(pow_res); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i+=2) { - simde_float32 sign = simde_math_copysignf(SIMDE_FLOAT32_C(1.0), a_.f32[i + 1]); - simde_float32 temp = simde_math_sqrtf(pow_res_.f32[i] + pow_res_.f32[i+1]); - - r_.f32[ i ] = simde_math_sqrtf(( a_.f32[i] + temp) / SIMDE_FLOAT32_C(2.0)); - r_.f32[i + 1] = sign * simde_math_sqrtf((-a_.f32[i] + temp) / SIMDE_FLOAT32_C(2.0)); - } - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_csqrt_ps - #define _mm256_csqrt_ps(a) simde_mm256_csqrt_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_rem_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) - return _mm_rem_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.i8 = a_.i8 % b_.i8; - 
#else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] % b_.i8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_rem_epi8 - #define _mm_rem_epi8(a, b) simde_mm_rem_epi8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_rem_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) - return _mm_rem_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.i16 = a_.i16 % b_.i16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] % b_.i16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_rem_epi16 - #define _mm_rem_epi16(a, b) simde_mm_rem_epi16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_rem_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) - return _mm_rem_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.i32 = a_.i32 % b_.i32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] % b_.i32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#define simde_mm_irem_epi32(a, b) simde_mm_rem_epi32(a, b) -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_rem_epi32 - #define _mm_rem_epi32(a, b) simde_mm_rem_epi32(a, b) - #undef _mm_irem_epi32 - #define _mm_irem_epi32(a, b) simde_mm_rem_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_rem_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) - return _mm_rem_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.i64 = a_.i64 % b_.i64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] % b_.i64[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_rem_epi64 - #define _mm_rem_epi64(a, b) simde_mm_rem_epi64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_rem_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) - return _mm_rem_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.u8 = a_.u8 % b_.u8; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = a_.u8[i] % b_.u8[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_rem_epu8 - #define _mm_rem_epu8(a, b) simde_mm_rem_epu8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_rem_epu16 (simde__m128i a, 
simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) - return _mm_rem_epu16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.u16 = a_.u16 % b_.u16; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = a_.u16[i] % b_.u16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_rem_epu16 - #define _mm_rem_epu16(a, b) simde_mm_rem_epu16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_rem_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) - return _mm_rem_epu32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.u32 = a_.u32 % b_.u32; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] % b_.u32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#define simde_mm_urem_epi32(a, b) simde_mm_rem_epu32(a, b) -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_rem_epu32 - #define _mm_rem_epu32(a, b) simde_mm_rem_epu32(a, b) - #undef _mm_urem_epi32 - #define _mm_urem_epi32(a, b) simde_mm_rem_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_rem_epu64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) - return _mm_rem_epu64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.u64 = a_.u64 % b_.u64; - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] % b_.u64[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_rem_epu64 - #define _mm_rem_epu64(a, b) simde_mm_rem_epu64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_rem_epi8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_rem_epi8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.i8 = a_.i8 % b_.i8; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_rem_epi8(a_.m128i[i], b_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] % b_.i8[i]; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_rem_epi8 - #define _mm256_rem_epi8(a, b) simde_mm256_rem_epi8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_rem_epi16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_rem_epi16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = 
simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.i16 = a_.i16 % b_.i16; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_rem_epi16(a_.m128i[i], b_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] % b_.i16[i]; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_rem_epi16 - #define _mm256_rem_epi16(a, b) simde_mm256_rem_epi16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_rem_epi32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_rem_epi32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.i32 = a_.i32 % b_.i32; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_rem_epi32(a_.m128i[i], b_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] % b_.i32[i]; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#define simde_mm256_irem_epi32(a, b) simde_mm256_rem_epi32(a, b) -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_rem_epi32 - #define _mm256_rem_epi32(a, b) simde_mm256_rem_epi32(a, b) - #undef _mm256_irem_epi32 - #define _mm256_irem_epi32(a, b) simde_mm256_rem_epi32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_rem_epi64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_rem_epi64(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.i64 = a_.i64 % b_.i64; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_rem_epi64(a_.m128i[i], b_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] % b_.i64[i]; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_rem_epi64 - #define _mm256_rem_epi64(a, b) simde_mm256_rem_epi64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_rem_epu8 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_rem_epu8(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.u8 = a_.u8 % b_.u8; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_rem_epu8(a_.m128i[i], b_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = a_.u8[i] % b_.u8[i]; - } - #endif - #endif - - return 
simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_rem_epu8 - #define _mm256_rem_epu8(a, b) simde_mm256_rem_epu8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_rem_epu16 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_rem_epu16(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.u16 = a_.u16 % b_.u16; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_rem_epu16(a_.m128i[i], b_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = a_.u16[i] % b_.u16[i]; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_rem_epu16 - #define _mm256_rem_epu16(a, b) simde_mm256_rem_epu16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_rem_epu32 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_rem_epu32(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.u32 = a_.u32 % b_.u32; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_rem_epu32(a_.m128i[i], b_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] % b_.u32[i]; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#define simde_mm256_urem_epi32(a, b) simde_mm256_rem_epu32(a, b) -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_rem_epu32 - #define _mm256_rem_epu32(a, b) simde_mm256_rem_epu32(a, b) - #undef _mm256_urem_epi32 - #define _mm256_urem_epi32(a, b) simde_mm256_rem_epu32(a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_rem_epu64 (simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_rem_epu64(a, b); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.u64 = a_.u64 % b_.u64; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_rem_epu64(a_.m128i[i], b_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] % b_.u64[i]; - } - #endif - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_rem_epu64 - #define _mm256_rem_epu64(a, b) simde_mm256_rem_epu64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_rem_epi8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_rem_epi8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); 
- - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.i8 = a_.i8 % b_.i8; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_rem_epi8(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = a_.i8[i] % b_.i8[i]; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_rem_epi8 - #define _mm512_rem_epi8(a, b) simde_mm512_rem_epi8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_rem_epi16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_rem_epi16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.i16 = a_.i16 % b_.i16; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_rem_epi16(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = a_.i16[i] % b_.i16[i]; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_rem_epi16 - #define _mm512_rem_epi16(a, b) simde_mm512_rem_epi16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_rem_epi32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_rem_epi32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.i32 = a_.i32 % b_.i32; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_rem_epi32(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = a_.i32[i] % b_.i32[i]; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_rem_epi32 - #define _mm512_rem_epi32(a, b) simde_mm512_rem_epi32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_rem_epi32(simde__m512i src, simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_rem_epi32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_rem_epi32(a, b)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_rem_epi32 - #define _mm512_mask_rem_epi32(src, k, a, b) simde_mm512_mask_rem_epi32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_rem_epi64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_rem_epi64(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.i64 = a_.i64 % b_.i64; - #else 
- #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_rem_epi64(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = a_.i64[i] % b_.i64[i]; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_rem_epi64 - #define _mm512_rem_epi64(a, b) simde_mm512_rem_epi64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_rem_epu8 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_rem_epu8(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.u8 = a_.u8 % b_.u8; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_rem_epu8(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = a_.u8[i] % b_.u8[i]; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_rem_epu8 - #define _mm512_rem_epu8(a, b) simde_mm512_rem_epu8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_rem_epu16 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_rem_epu16(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.u16 = a_.u16 % b_.u16; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_rem_epu16(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = a_.u16[i] % b_.u16[i]; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_rem_epu16 - #define _mm512_rem_epu16(a, b) simde_mm512_rem_epu16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_rem_epu32 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_rem_epu32(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.u32 = a_.u32 % b_.u32; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_rem_epu32(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = a_.u32[i] % b_.u32[i]; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_rem_epu32 - #define _mm512_rem_epu32(a, b) simde_mm512_rem_epu32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_mask_rem_epu32(simde__m512i src, 
simde__mmask16 k, simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_rem_epu32(src, k, a, b); - #else - return simde_mm512_mask_mov_epi32(src, k, simde_mm512_rem_epu32(a, b)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_rem_epu32 - #define _mm512_mask_rem_epu32(src, k, a, b) simde_mm512_mask_rem_epu32(src, k, a, b) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512i -simde_mm512_rem_epu64 (simde__m512i a, simde__m512i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_rem_epu64(a, b); - #else - simde__m512i_private - r_, - a_ = simde__m512i_to_private(a), - b_ = simde__m512i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) && !defined(SIMDE_BUG_PGI_30104) - r_.u64 = a_.u64 % b_.u64; - #else - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256i) / sizeof(r_.m256i[0])) ; i++) { - r_.m256i[i] = simde_mm256_rem_epu64(a_.m256i[i], b_.m256i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = a_.u64[i] % b_.u64[i]; - } - #endif - #endif - - return simde__m512i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_rem_epu64 - #define _mm512_rem_epu64(a, b) simde_mm512_rem_epu64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_recip_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_recip_ps(a); - #else - return simde_mm512_div_ps(simde_mm512_set1_ps(SIMDE_FLOAT32_C(1.0)), a); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_recip_ps - #define _mm512_recip_ps(a) simde_mm512_recip_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_recip_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_recip_pd(a); - #else - return simde_mm512_div_pd(simde_mm512_set1_pd(SIMDE_FLOAT64_C(1.0)), a); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_recip_pd - #define _mm512_recip_pd(a) simde_mm512_recip_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_recip_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_recip_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_recip_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_recip_ps - #define _mm512_mask_recip_ps(src, k, a) simde_mm512_mask_recip_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_recip_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_recip_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_recip_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_recip_pd - #define _mm512_mask_recip_pd(src, k, a) simde_mm512_mask_recip_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_rint_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_rint_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_rintf16(a); - #else - 
simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_rintf(a_.f32[i]); - } - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_rint_ps - #define _mm512_rint_ps(a) simde_mm512_rint_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_rint_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_rint_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_rintd8(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_rint(a_.f64[i]); - } - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_rint_pd - #define _mm512_rint_pd(a) simde_mm512_rint_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_rint_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_rint_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_rint_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_rint_ps - #define _mm512_mask_rint_ps(src, k, a) simde_mm512_mask_rint_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_rint_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_rint_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_rint_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_rint_pd - #define _mm512_mask_rint_pd(src, k, a) simde_mm512_mask_rint_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_sin_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_sin_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_sinf4_u10(a); - #else - return Sleef_sinf4_u35(a); - #endif - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_sinf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_sin_ps - #define _mm_sin_ps(a) simde_mm_sin_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_sin_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_sin_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_sind2_u10(a); - #else - return Sleef_sind2_u35(a); - #endif - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_sin(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_sin_pd - #define _mm_sin_pd(a) simde_mm_sin_pd(a) -#endif - 
-SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_sin_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_sin_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_sinf8_u10(a); - #else - return Sleef_sinf8_u35(a); - #endif - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_sin_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_sinf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_sin_ps - #define _mm256_sin_ps(a) simde_mm256_sin_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_sin_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_sin_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_sind4_u10(a); - #else - return Sleef_sind4_u35(a); - #endif - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_sin_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_sin(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_sin_pd - #define _mm256_sin_pd(a) simde_mm256_sin_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_sin_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_sin_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_sinf16_u10(a); - #else - return Sleef_sinf16_u35(a); - #endif - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_sin_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_sinf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_sin_ps - #define _mm512_sin_ps(a) simde_mm512_sin_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_sin_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_sin_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_sind8_u10(a); - #else - return Sleef_sind8_u35(a); - #endif - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_sin_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = 
simde_math_sin(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_sin_pd - #define _mm512_sin_pd(a) simde_mm512_sin_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_sin_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_sin_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_sin_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_sin_ps - #define _mm512_mask_sin_ps(src, k, a) simde_mm512_mask_sin_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_sin_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_sin_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_sin_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_sin_pd - #define _mm512_mask_sin_pd(src, k, a) simde_mm512_mask_sin_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_sincos_ps (simde__m128* mem_addr, simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_sincos_ps(HEDLEY_REINTERPRET_CAST(__m128*, mem_addr), a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - Sleef___m128_2 temp; - - #if SIMDE_ACCURACY_PREFERENCE > 1 - temp = Sleef_sincosf4_u10(a); - #else - temp = Sleef_sincosf4_u35(a); - #endif - - *mem_addr = temp.y; - return temp.x; - #else - simde__m128 r; - - r = simde_mm_sin_ps(a); - *mem_addr = simde_mm_cos_ps(a); - - return r; - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_sincos_ps - #define _mm_sincos_ps(mem_addr, a) simde_mm_sincos_ps((mem_addr),(a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_sincos_pd (simde__m128d* mem_addr, simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_sincos_pd(HEDLEY_REINTERPRET_CAST(__m128d*, mem_addr), a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - Sleef___m128d_2 temp; - - #if SIMDE_ACCURACY_PREFERENCE > 1 - temp = Sleef_sincosd2_u10(a); - #else - temp = Sleef_sincosd2_u35(a); - #endif - - *mem_addr = temp.y; - return temp.x; - #else - simde__m128d r; - - r = simde_mm_sin_pd(a); - *mem_addr = simde_mm_cos_pd(a); - - return r; - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_sincos_pd - #define _mm_sincos_pd(mem_addr, a) simde_mm_sincos_pd((mem_addr),(a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_sincos_ps (simde__m256* mem_addr, simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_sincos_ps(HEDLEY_REINTERPRET_CAST(__m256*, mem_addr), a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - Sleef___m256_2 temp; - - #if SIMDE_ACCURACY_PREFERENCE > 1 - temp = Sleef_sincosf8_u10(a); - #else - temp = Sleef_sincosf8_u35(a); - #endif - - *mem_addr = temp.y; - return temp.x; - #else - simde__m256 r; - - r = simde_mm256_sin_ps(a); - *mem_addr = simde_mm256_cos_ps(a); - - return r; - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_sincos_ps - #define _mm256_sincos_ps(mem_addr, a) simde_mm256_sincos_ps((mem_addr),(a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES 
-simde__m256d -simde_mm256_sincos_pd (simde__m256d* mem_addr, simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_sincos_pd(HEDLEY_REINTERPRET_CAST(__m256d*, mem_addr), a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - Sleef___m256d_2 temp; - - #if SIMDE_ACCURACY_PREFERENCE > 1 - temp = Sleef_sincosd4_u10(a); - #else - temp = Sleef_sincosd4_u35(a); - #endif - - *mem_addr = temp.y; - return temp.x; - #else - simde__m256d r; - - r = simde_mm256_sin_pd(a); - *mem_addr = simde_mm256_cos_pd(a); - - return r; - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_sincos_pd - #define _mm256_sincos_pd(mem_addr, a) simde_mm256_sincos_pd((mem_addr),(a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_sincos_ps (simde__m512* mem_addr, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_sincos_ps(HEDLEY_REINTERPRET_CAST(__m512*, mem_addr), a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - Sleef___m512_2 temp; - - #if SIMDE_ACCURACY_PREFERENCE > 1 - temp = Sleef_sincosf16_u10(a); - #else - temp = Sleef_sincosf16_u35(a); - #endif - - *mem_addr = temp.y; - return temp.x; - #else - simde__m512 r; - - r = simde_mm512_sin_ps(a); - *mem_addr = simde_mm512_cos_ps(a); - - return r; - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_sincos_ps - #define _mm512_sincos_ps(mem_addr, a) simde_mm512_sincos_ps((mem_addr),(a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_sincos_pd (simde__m512d* mem_addr, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_sincos_pd(HEDLEY_REINTERPRET_CAST(__m512d*, mem_addr), a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - Sleef___m512d_2 temp; - - #if SIMDE_ACCURACY_PREFERENCE > 1 - temp = Sleef_sincosd8_u10(a); - #else - temp = Sleef_sincosd8_u35(a); - #endif - - *mem_addr = temp.y; - return temp.x; - #else - simde__m512d r; - - r = simde_mm512_sin_pd(a); - *mem_addr = simde_mm512_cos_pd(a); - - return r; - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_sincos_pd - #define _mm512_sincos_pd(mem_addr, a) simde_mm512_sincos_pd((mem_addr),(a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_sincos_ps(simde__m512* mem_addr, simde__m512 sin_src, simde__m512 cos_src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_sincos_ps(mem_addr, sin_src, cos_src, k, a); - #else - simde__m512 cos_res, sin_res; - sin_res = simde_mm512_sincos_ps(&cos_res, a); - *mem_addr = simde_mm512_mask_mov_ps(cos_src, k, cos_res); - return simde_mm512_mask_mov_ps(sin_src, k, sin_res); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_sincos_ps - #define _mm512_mask_sincos_ps(mem_addr, sin_src, cos_src, k, a) simde_mm512_mask_sincos_ps(mem_addr, sin_src, cos_src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_sincos_pd(simde__m512d* mem_addr, simde__m512d sin_src, simde__m512d cos_src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_sincos_pd(mem_addr, sin_src, cos_src, k, a); - #else - simde__m512d cos_res, sin_res; - sin_res = simde_mm512_sincos_pd(&cos_res, a); - *mem_addr = 
simde_mm512_mask_mov_pd(cos_src, k, cos_res); - return simde_mm512_mask_mov_pd(sin_src, k, sin_res); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_sincos_pd - #define _mm512_mask_sincos_pd(mem_addr, sin_src, cos_src, k, a) simde_mm512_mask_sincos_pd(mem_addr, sin_src, cos_src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_sind_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_sind_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_sinf4_u10(simde_x_mm_deg2rad_ps(a)); - #else - return Sleef_sinf4_u35(simde_x_mm_deg2rad_ps(a)); - #endif - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_sinf(simde_math_deg2radf(a_.f32[i])); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_sind_ps - #define _mm_sind_ps(a) simde_mm_sind_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_sind_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_sind_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_sind2_u10(simde_x_mm_deg2rad_pd(a)); - #else - return Sleef_sind2_u35(simde_x_mm_deg2rad_pd(a)); - #endif - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_sin(simde_math_deg2rad(a_.f64[i])); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_sind_pd - #define _mm_sind_pd(a) simde_mm_sind_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_sind_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_sind_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_sinf8_u10(simde_x_mm256_deg2rad_ps(a)); - #else - return Sleef_sinf8_u35(simde_x_mm256_deg2rad_ps(a)); - #endif - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_sind_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_sinf(simde_math_deg2radf(a_.f32[i])); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_sind_ps - #define _mm256_sind_ps(a) simde_mm256_sind_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_sind_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_sind_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_sind4_u10(simde_x_mm256_deg2rad_pd(a)); - #else - return Sleef_sind4_u35(simde_x_mm256_deg2rad_pd(a)); - #endif - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / 
sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_sind_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_sin(simde_math_deg2rad(a_.f64[i])); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_sind_pd - #define _mm256_sind_pd(a) simde_mm256_sind_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_sind_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_sind_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_sinf16_u10(simde_x_mm512_deg2rad_ps(a)); - #else - return Sleef_sinf16_u35(simde_x_mm512_deg2rad_ps(a)); - #endif - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_sind_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_sinf(simde_math_deg2radf(a_.f32[i])); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_sind_ps - #define _mm512_sind_ps(a) simde_mm512_sind_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_sind_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_sind_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_sind8_u10(simde_x_mm512_deg2rad_pd(a)); - #else - return Sleef_sind8_u35(simde_x_mm512_deg2rad_pd(a)); - #endif - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_sind_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_sin(simde_math_deg2rad(a_.f64[i])); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_sind_pd - #define _mm512_sind_pd(a) simde_mm512_sind_pd(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_sind_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_sind_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_sind_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_sind_ps - #define _mm512_mask_sind_ps(src, k, a) simde_mm512_mask_sind_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_sind_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_sind_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_sind_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_sind_pd - #define _mm512_mask_sind_pd(src, k, a) simde_mm512_mask_sind_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_sinh_ps (simde__m128 a) { - 
#if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_sinh_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_sinhf4_u10(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_sinhf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_sinh_ps - #define _mm_sinh_ps(a) simde_mm_sinh_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_sinh_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_sinh_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_sinhd2_u10(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_sinh(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_sinh_pd - #define _mm_sinh_pd(a) simde_mm_sinh_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_sinh_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_sinh_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_sinhf8_u10(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_sinh_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_sinhf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_sinh_ps - #define _mm256_sinh_ps(a) simde_mm256_sinh_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_sinh_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_sinh_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_sinhd4_u10(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_sinh_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_sinh(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_sinh_pd - #define _mm256_sinh_pd(a) simde_mm256_sinh_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_sinh_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_sinh_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_sinhf16_u10(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_sinh_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 
0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_sinhf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_sinh_ps - #define _mm512_sinh_ps(a) simde_mm512_sinh_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_sinh_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_sinh_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_sinhd8_u10(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_sinh_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_sinh(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_sinh_pd - #define _mm512_sinh_pd(a) simde_mm512_sinh_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_sinh_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_sinh_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_sinh_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_sinh_ps - #define _mm512_mask_sinh_ps(src, k, a) simde_mm512_mask_sinh_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_sinh_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_sinh_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_sinh_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_sinh_pd - #define _mm512_mask_sinh_pd(src, k, a) simde_mm512_mask_sinh_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_svml_ceil_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_svml_ceil_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_ceilf4(a); - #else - return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_POS_INF); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_svml_ceil_ps - #define _mm_svml_ceil_ps(a) simde_mm_svml_ceil_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_svml_ceil_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_svml_ceil_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_ceild2(a); - #else - return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_POS_INF); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_svml_ceil_pd - #define _mm_svml_ceil_pd(a) simde_mm_svml_ceil_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_svml_ceil_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_svml_ceil_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_ceilf8(a); - #else - return simde_mm256_round_ps(a, SIMDE_MM_FROUND_TO_POS_INF); - #endif -} -#if 
defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_svml_ceil_ps - #define _mm256_svml_ceil_ps(a) simde_mm256_svml_ceil_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_svml_ceil_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_svml_ceil_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_ceild4(a); - #else - return simde_mm256_round_pd(a, SIMDE_MM_FROUND_TO_POS_INF); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_svml_ceil_pd - #define _mm256_svml_ceil_pd(a) simde_mm256_svml_ceil_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_ceil_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_ceil_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_ceilf16(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_ceil_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_ceilf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_ceil_ps - #define _mm512_ceil_ps(a) simde_mm512_ceil_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_ceil_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_ceil_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_ceild8(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_ceil_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_ceil(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_ceil_pd - #define _mm512_ceil_pd(a) simde_mm512_ceil_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_ceil_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_ceil_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_ceil_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_ceil_ps - #define _mm512_mask_ceil_ps(src, k, a) simde_mm512_mask_ceil_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_ceil_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_ceil_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_ceil_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_ceil_pd - #define _mm512_mask_ceil_pd(src, k, a) simde_mm512_mask_ceil_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_svml_floor_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return 
_mm_svml_floor_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_floorf4(a); - #else - return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_NEG_INF); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_svml_floor_ps - #define _mm_svml_floor_ps(a) simde_mm_svml_floor_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_svml_floor_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_svml_floor_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_floord2(a); - #else - return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_NEG_INF); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_svml_floor_pd - #define _mm_svml_floor_pd(a) simde_mm_svml_floor_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_svml_floor_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_svml_floor_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_floorf8(a); - #else - return simde_mm256_round_ps(a, SIMDE_MM_FROUND_TO_NEG_INF); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_svml_floor_ps - #define _mm256_svml_floor_ps(a) simde_mm256_svml_floor_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_svml_floor_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_svml_floor_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_floord4(a); - #else - return simde_mm256_round_pd(a, SIMDE_MM_FROUND_TO_NEG_INF); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_svml_floor_pd - #define _mm256_svml_floor_pd(a) simde_mm256_svml_floor_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_floor_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_floor_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_floorf16(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_floor_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_floorf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_floor_ps - #define _mm512_floor_ps(a) simde_mm512_floor_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_floor_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_floor_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_floord8(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_floor_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_floor(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if 
defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_floor_pd - #define _mm512_floor_pd(a) simde_mm512_floor_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_floor_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_floor_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_floor_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_floor_ps - #define _mm512_mask_floor_ps(src, k, a) simde_mm512_mask_floor_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_floor_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_floor_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_floor_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_floor_pd - #define _mm512_mask_floor_pd(src, k, a) simde_mm512_mask_floor_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_svml_round_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_svml_round_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_roundf4(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_roundf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_svml_round_ps - #define _mm_svml_round_ps(a) simde_mm_svml_round_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_svml_round_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_svml_round_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_roundd2(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_round(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_svml_round_pd - #define _mm_svml_round_pd(a) simde_mm_svml_round_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_svml_round_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_svml_round_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_roundf8(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_svml_round_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_roundf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_svml_round_ps - #define _mm256_svml_round_ps(a) simde_mm256_svml_round_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_svml_round_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && 
defined(SIMDE_X86_AVX_NATIVE) - return _mm256_svml_round_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_roundd4(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_svml_round_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_round(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_svml_round_pd - #define _mm256_svml_round_pd(a) simde_mm256_svml_round_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_svml_round_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_svml_round_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_roundd8(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_svml_round_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_round(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_svml_round_pd - #define _mm512_svml_round_pd(a) simde_mm512_svml_round_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_svml_round_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_svml_round_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_svml_round_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_svml_round_pd - #define _mm512_mask_svml_round_pd(src, k, a) simde_mm512_mask_svml_round_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_svml_sqrt_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_svml_sqrt_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_sqrtf4(a); - #else - return simde_mm_sqrt_ps(a); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_svml_sqrt_ps - #define _mm_svml_sqrt_ps(a) simde_mm_svml_sqrt_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_svml_sqrt_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_svml_sqrt_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_sqrtd2(a); - #else - return simde_mm_sqrt_pd(a); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_svml_sqrt_pd - #define _mm_svml_sqrt_pd(a) simde_mm_svml_sqrt_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_svml_sqrt_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_svml_sqrt_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_sqrtf8(a); - #else - return simde_mm256_sqrt_ps(a); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - 
#undef _mm256_svml_sqrt_ps - #define _mm256_svml_sqrt_ps(a) simde_mm256_svml_sqrt_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_svml_sqrt_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_svml_sqrt_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_sqrtd4(a); - #else - return simde_mm256_sqrt_pd(a); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_svml_sqrt_pd - #define _mm256_svml_sqrt_pd(a) simde_mm256_svml_sqrt_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_svml_sqrt_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_svml_sqrt_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_sqrtf16(a); - #else - return simde_mm512_sqrt_ps(a); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_svml_sqrt_ps - #define _mm512_svml_sqrt_ps(a) simde_mm512_svml_sqrt_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_svml_sqrt_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_svml_sqrt_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_sqrtd8(a); - #else - return simde_mm512_sqrt_pd(a); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_svml_sqrt_pd - #define _mm512_svml_sqrt_pd(a) simde_mm512_svml_sqrt_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_tan_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_tan_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_tanf4_u10(a); - #else - return Sleef_tanf4_u35(a); - #endif - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_tanf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_tan_ps - #define _mm_tan_ps(a) simde_mm_tan_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_tan_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_tan_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_tand2_u10(a); - #else - return Sleef_tand2_u35(a); - #endif - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_tan(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_tan_pd - #define _mm_tan_pd(a) simde_mm_tan_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_tan_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_tan_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_tanf8_u10(a); - #else - return Sleef_tanf8_u35(a); - #endif - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for 
(size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_tan_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_tanf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_tan_ps - #define _mm256_tan_ps(a) simde_mm256_tan_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_tan_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_tan_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_tand4_u10(a); - #else - return Sleef_tand4_u35(a); - #endif - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_tan_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_tan(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_tan_pd - #define _mm256_tan_pd(a) simde_mm256_tan_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_tan_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_tan_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_tanf16_u10(a); - #else - return Sleef_tanf16_u35(a); - #endif - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_tan_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_tanf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_tan_ps - #define _mm512_tan_ps(a) simde_mm512_tan_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_tan_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_tan_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_tand8_u10(a); - #else - return Sleef_tand8_u35(a); - #endif - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_tan_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_tan(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_tan_pd - #define _mm512_tan_pd(a) simde_mm512_tan_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_tan_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_tan_ps(src, k, a); - #else - return 
simde_mm512_mask_mov_ps(src, k, simde_mm512_tan_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_tan_ps - #define _mm512_mask_tan_ps(src, k, a) simde_mm512_mask_tan_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_tan_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_tan_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_tan_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_tan_pd - #define _mm512_mask_tan_pd(src, k, a) simde_mm512_mask_tan_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_tand_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_tand_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_tanf4_u10(simde_x_mm_deg2rad_ps(a)); - #else - return Sleef_tanf4_u35(simde_x_mm_deg2rad_ps(a)); - #endif - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_tanf(simde_math_deg2radf(a_.f32[i])); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_tand_ps - #define _mm_tand_ps(a) simde_mm_tand_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_tand_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_tand_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_tand2_u10(simde_x_mm_deg2rad_pd(a)); - #else - return Sleef_tand2_u35(simde_x_mm_deg2rad_pd(a)); - #endif - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_tan(simde_math_deg2rad(a_.f64[i])); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_tand_pd - #define _mm_tand_pd(a) simde_mm_tand_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_tand_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_tand_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_tanf8_u10(simde_x_mm256_deg2rad_ps(a)); - #else - return Sleef_tanf8_u35(simde_x_mm256_deg2rad_ps(a)); - #endif - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_tand_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_tanf(simde_math_deg2radf(a_.f32[i])); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_tand_ps - #define _mm256_tand_ps(a) simde_mm256_tand_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_tand_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_tand_pd(a); - #elif 
defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_tand4_u10(simde_x_mm256_deg2rad_pd(a)); - #else - return Sleef_tand4_u35(simde_x_mm256_deg2rad_pd(a)); - #endif - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_tand_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_tan(simde_math_deg2rad(a_.f64[i])); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_tand_pd - #define _mm256_tand_pd(a) simde_mm256_tand_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_tand_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_tand_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_tanf16_u10(simde_x_mm512_deg2rad_ps(a)); - #else - return Sleef_tanf16_u35(simde_x_mm512_deg2rad_ps(a)); - #endif - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_tand_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_tanf(simde_math_deg2radf(a_.f32[i])); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_tand_ps - #define _mm512_tand_ps(a) simde_mm512_tand_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_tand_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_tand_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - #if SIMDE_ACCURACY_PREFERENCE > 1 - return Sleef_tand8_u10(simde_x_mm512_deg2rad_pd(a)); - #else - return Sleef_tand8_u35(simde_x_mm512_deg2rad_pd(a)); - #endif - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_tand_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_tan(simde_math_deg2rad(a_.f64[i])); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_tand_pd - #define _mm512_tand_pd(a) simde_mm512_tand_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_tand_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_tand_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_tand_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_tand_ps - #define _mm512_mask_tand_ps(src, k, a) simde_mm512_mask_tand_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_tand_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && 
defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_tand_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_tand_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_tand_pd - #define _mm512_mask_tand_pd(src, k, a) simde_mm512_mask_tand_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_tanh_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_tanh_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_tanhf4_u10(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_tanhf(a_.f32[i]); - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_tanh_ps - #define _mm_tanh_ps(a) simde_mm_tanh_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_tanh_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_tanh_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_tanhd2_u10(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_tanh(a_.f64[i]); - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_tanh_pd - #define _mm_tanh_pd(a) simde_mm_tanh_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_tanh_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_tanh_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_tanhf8_u10(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_tanh_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_tanhf(a_.f32[i]); - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_tanh_ps - #define _mm256_tanh_ps(a) simde_mm256_tanh_ps(a) -#endif - - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_tanh_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_tanh_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_tanhd4_u10(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_tanh_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_tanh(a_.f64[i]); - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_tanh_pd - #define _mm256_tanh_pd(a) simde_mm256_tanh_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_tanh_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return 
_mm512_tanh_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_tanhf16_u10(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_tanh_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_tanhf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_tanh_ps - #define _mm512_tanh_ps(a) simde_mm512_tanh_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_tanh_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_tanh_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_tanhd8_u10(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_tanh_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_tanh(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_tanh_pd - #define _mm512_tanh_pd(a) simde_mm512_tanh_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_tanh_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_tanh_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_tanh_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_tanh_ps - #define _mm512_mask_tanh_ps(src, k, a) simde_mm512_mask_tanh_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_tanh_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_tanh_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_tanh_pd(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_tanh_pd - #define _mm512_mask_tanh_pd(src, k, a) simde_mm512_mask_tanh_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_trunc_ps (simde__m128 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_trunc_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_truncf4(a); - #else - return simde_mm_round_ps(a, SIMDE_MM_FROUND_TO_ZERO); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_trunc_ps - #define _mm_trunc_ps(a) simde_mm_trunc_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_trunc_pd (simde__m128d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE_NATIVE) - return _mm_trunc_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_SSE_NATIVE) - return Sleef_truncd2(a); - #else - return simde_mm_round_pd(a, SIMDE_MM_FROUND_TO_ZERO); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_trunc_pd - #define _mm_trunc_pd(a) simde_mm_trunc_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES 
-simde__m256 -simde_mm256_trunc_ps (simde__m256 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_trunc_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_truncf8(a); - #else - return simde_mm256_round_ps(a, SIMDE_MM_FROUND_TO_ZERO); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_trunc_ps - #define _mm256_trunc_ps(a) simde_mm256_trunc_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_trunc_pd (simde__m256d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_trunc_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX_NATIVE) - return Sleef_truncd4(a); - #else - return simde_mm256_round_pd(a, SIMDE_MM_FROUND_TO_ZERO); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_trunc_pd - #define _mm256_trunc_pd(a) simde_mm256_trunc_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_trunc_ps (simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_trunc_ps(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_truncf16(a); - #else - simde__m512_private - r_, - a_ = simde__m512_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256) / sizeof(r_.m256[0])) ; i++) { - r_.m256[i] = simde_mm256_trunc_ps(a_.m256[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - r_.f32[i] = simde_math_truncf(a_.f32[i]); - } - #endif - - return simde__m512_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_trunc_ps - #define _mm512_trunc_ps(a) simde_mm512_trunc_ps(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_trunc_pd (simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_trunc_pd(a); - #elif defined(SIMDE_MATH_SLEEF_ENABLE) && defined(SIMDE_X86_AVX512F_NATIVE) - return Sleef_truncd8(a); - #else - simde__m512d_private - r_, - a_ = simde__m512d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(256) - for (size_t i = 0 ; i < (sizeof(r_.m256d) / sizeof(r_.m256d[0])) ; i++) { - r_.m256d[i] = simde_mm256_trunc_pd(a_.m256d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - r_.f64[i] = simde_math_trunc(a_.f64[i]); - } - #endif - - return simde__m512d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_trunc_pd - #define _mm512_trunc_pd(a) simde_mm512_trunc_pd(a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512 -simde_mm512_mask_trunc_ps(simde__m512 src, simde__mmask16 k, simde__m512 a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_trunc_ps(src, k, a); - #else - return simde_mm512_mask_mov_ps(src, k, simde_mm512_trunc_ps(a)); - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_trunc_ps - #define _mm512_mask_trunc_ps(src, k, a) simde_mm512_mask_trunc_ps(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m512d -simde_mm512_mask_trunc_pd(simde__m512d src, simde__mmask8 k, simde__m512d a) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX512F_NATIVE) - return _mm512_mask_trunc_pd(src, k, a); - #else - return simde_mm512_mask_mov_pd(src, k, simde_mm512_trunc_pd(a)); - #endif -} -#if 
defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm512_mask_trunc_pd - #define _mm512_mask_trunc_pd(src, k, a) simde_mm512_mask_trunc_pd(src, k, a) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_udivrem_epi32 (simde__m128i * mem_addr, simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_SSE2_NATIVE) - return _mm_udivrem_epi32(mem_addr, a, b); - #else - simde__m128i r; - - r = simde_mm_div_epu32(a, b); - *mem_addr = simde_x_mm_sub_epu32(a, simde_x_mm_mullo_epu32(r, b)); - - return r; - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm_udivrem_epi32 - #define _mm_udivrem_epi32(mem_addr, a, b) simde_mm_udivrem_epi32((mem_addr),(a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_udivrem_epi32 (simde__m256i* mem_addr, simde__m256i a, simde__m256i b) { - #if defined(SIMDE_X86_SVML_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) - return _mm256_udivrem_epi32(HEDLEY_REINTERPRET_CAST(__m256i*, mem_addr), a, b); - #else - simde__m256i r; - - r = simde_mm256_div_epu32(a, b); - *mem_addr = simde_x_mm256_sub_epu32(a, simde_x_mm256_mullo_epu32(r, b)); - - return r; - #endif -} -#if defined(SIMDE_X86_SVML_ENABLE_NATIVE_ALIASES) - #undef _mm256_udivrem_epi32 - #define _mm256_udivrem_epi32(mem_addr, a, b) simde_mm256_udivrem_epi32((mem_addr),(a), (b)) -#endif - -SIMDE_END_DECLS_ - -HEDLEY_DIAGNOSTIC_POP - -#endif /* !defined(SIMDE_X86_SVML_H) */ diff --git a/ffi-deps/simde/simde/x86/xop.h b/ffi-deps/simde/simde/x86/xop.h deleted file mode 100644 index 8b83ed2..0000000 --- a/ffi-deps/simde/simde/x86/xop.h +++ /dev/null @@ -1,3740 +0,0 @@ -/* SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- * - * Copyright: - * 2020 Evan Nemerson - */ - -#if !defined(SIMDE_X86_XOP_H) -#define SIMDE_X86_XOP_H - -#include "avx2.h" - -#if !defined(SIMDE_X86_XOP_NATIVE) && defined(SIMDE_ENABLE_NATIVE_ALIASES) -# define SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES -#endif - -HEDLEY_DIAGNOSTIC_PUSH -SIMDE_DISABLE_UNWANTED_DIAGNOSTICS -SIMDE_BEGIN_DECLS_ - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_cmov_si128 (simde__m128i a, simde__m128i b, simde__m128i c) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_cmov_si128(a, b, c); - #elif defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm_ternarylogic_epi32(a, b, c, 0xe4); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return _mm_or_si128(_mm_and_si128(c, a), _mm_andnot_si128(c, b)); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - c_ = simde__m128i_to_private(c); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vbslq_s8(c_.neon_u8, a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_v128_bitselect(a_.wasm_v128, b_.wasm_v128, c_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P7_NATIVE) - r_.altivec_i32 = vec_sel(b_.altivec_i32, a_.altivec_i32, c_.altivec_u32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32f = (c_.i32f & a_.i32f) | (~c_.i32f & b_.i32f); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = (c_.i32f[i] & a_.i32f[i]) | (~c_.i32f[i] & b_.i32f[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_cmov_si128(a, b, c) simde_mm_cmov_si128((a), (b), (c)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256i -simde_mm256_cmov_si256 (simde__m256i a, simde__m256i b, simde__m256i c) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(SIMDE_X86_AVX_NATIVE) && !defined(SIMDE_BUG_GCC_98521) && !defined(SIMDE_BUG_MCST_LCC_MISSING_CMOV_M256) - return _mm256_cmov_si256(a, b, c); - #elif defined(SIMDE_X86_AVX512VL_NATIVE) - return _mm256_ternarylogic_epi32(a, b, c, 0xe4); - #elif defined(SIMDE_X86_AVX2_NATIVE) - return _mm256_or_si256(_mm256_and_si256(c, a), _mm256_andnot_si256(c, b)); - #else - simde__m256i_private - r_, - a_ = simde__m256i_to_private(a), - b_ = simde__m256i_to_private(b), - c_ = simde__m256i_to_private(c); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128i) / sizeof(r_.m128i[0])) ; i++) { - r_.m128i[i] = simde_mm_cmov_si128(a_.m128i[i], b_.m128i[i], c_.m128i[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32f) / sizeof(r_.i32f[0])) ; i++) { - r_.i32f[i] = (c_.i32f[i] & a_.i32f[i]) | (~c_.i32f[i] & b_.i32f[i]); - } - #endif - - return simde__m256i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm256_cmov_si256(a, b, c) simde_mm256_cmov_si256((a), (b), (c)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comeq_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_EQ) - return _mm_com_epi8(a, b, _MM_PCOMCTRL_EQ); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comeq_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vceqq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), a_.i8 == b_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < 
(sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] == b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comeq_epi8(a, b) simde_mm_comeq_epi8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comeq_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_EQ) - return _mm_com_epi16(a, b, _MM_PCOMCTRL_EQ); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comeq_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vceqq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), a_.i16 == b_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] == b_.i16[i]) ? ~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comeq_epi16(a, b) simde_mm_comeq_epi16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comeq_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_EQ) - return _mm_com_epi32(a, b, _MM_PCOMCTRL_EQ); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comeq_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vceqq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 == b_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] == b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comeq_epi32(a, b) simde_mm_comeq_epi32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comeq_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_EQ) - return _mm_com_epi64(a, b, _MM_PCOMCTRL_EQ); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comeq_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vceqq_s64(a_.neon_i64, b_.neon_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 == b_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] == b_.i64[i]) ? 
~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comeq_epi64(a, b) simde_mm_comeq_epi64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comeq_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_EQ) - return _mm_com_epu8(a, b, _MM_PCOMCTRL_EQ); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comeq_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vceqq_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u8), a_.u8 == b_.u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] == b_.u8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comeq_epu8(a, b) simde_mm_comeq_epu8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comeq_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_EQ) - return _mm_com_epu16(a, b, _MM_PCOMCTRL_EQ); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comeq_epu16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vceqq_u16(a_.neon_u16, b_.neon_u16); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), a_.u16 == b_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] == b_.u16[i]) ? ~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comeq_epu16(a, b) simde_mm_comeq_epu16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comeq_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_EQ) - return _mm_com_epu32(a, b, _MM_PCOMCTRL_EQ); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comeq_epu32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vceqq_u32(a_.neon_u32, b_.neon_u32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), a_.u32 == b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] == b_.u32[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comeq_epu32(a, b) simde_mm_comeq_epu32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comeq_epu64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_EQ) - return _mm_com_epu64(a, b, _MM_PCOMCTRL_EQ); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comeq_epu64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vceqq_u64(a_.neon_u64, b_.neon_u64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u64 == b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] == b_.u64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comeq_epu64(a, b) simde_mm_comeq_epu64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comge_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_GE) - return _mm_com_epi8(a, b, _MM_PCOMCTRL_GE); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comge_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vcgeq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), a_.i8 >= b_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] >= b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comge_epi8(a, b) simde_mm_comge_epi8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comge_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_GE) - return _mm_com_epi16(a, b, _MM_PCOMCTRL_GE); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comge_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vcgeq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), a_.i16 >= b_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] >= b_.i16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comge_epi16(a, b) simde_mm_comge_epi16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comge_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_GE) - return _mm_com_epi32(a, b, _MM_PCOMCTRL_GE); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comge_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcgeq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 >= b_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] >= b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comge_epi32(a, b) simde_mm_comge_epi32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comge_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_GE) - return _mm_com_epi64(a, b, _MM_PCOMCTRL_GE); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comge_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcgeq_s64(a_.neon_i64, b_.neon_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 >= b_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] >= b_.i64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comge_epi64(a, b) simde_mm_comge_epi64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comge_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_GE) - return _mm_com_epu8(a, b, _MM_PCOMCTRL_GE); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comge_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vcgeq_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u8), a_.u8 >= b_.u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] >= b_.u8[i]) ? 
~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comge_epu8(a, b) simde_mm_comge_epu8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comge_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_GE) - return _mm_com_epu16(a, b, _MM_PCOMCTRL_GE); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comge_epu16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vcgeq_u16(a_.neon_u16, b_.neon_u16); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), a_.u16 >= b_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] >= b_.u16[i]) ? ~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comge_epu16(a, b) simde_mm_comge_epu16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comge_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_GE) - return _mm_com_epu32(a, b, _MM_PCOMCTRL_GE); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comge_epu32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcgeq_u32(a_.neon_u32, b_.neon_u32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), a_.u32 >= b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] >= b_.u32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comge_epu32(a, b) simde_mm_comge_epu32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comge_epu64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_GE) - return _mm_com_epu64(a, b, _MM_PCOMCTRL_GE); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comge_epu64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcgeq_u64(a_.neon_u64, b_.neon_u64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u64 >= b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] >= b_.u64[i]) ? 
~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comge_epu64(a, b) simde_mm_comge_epu64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comgt_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_GT) - return _mm_com_epi8(a, b, _MM_PCOMCTRL_GT); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comgt_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vcgtq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), a_.i8 > b_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] > b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comgt_epi8(a, b) simde_mm_comgt_epi8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comgt_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_GT) - return _mm_com_epi16(a, b, _MM_PCOMCTRL_GT); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comgt_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vcgtq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), a_.i16 > b_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] > b_.i16[i]) ? ~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comgt_epi16(a, b) simde_mm_comgt_epi16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comgt_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_GT) - return _mm_com_epi32(a, b, _MM_PCOMCTRL_GT); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comgt_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcgtq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 > b_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] > b_.i32[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comgt_epi32(a, b) simde_mm_comgt_epi32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comgt_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_GT) - return _mm_com_epi64(a, b, _MM_PCOMCTRL_GT); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comgt_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcgtq_s64(a_.neon_i64, b_.neon_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 > b_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] > b_.i64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comgt_epi64(a, b) simde_mm_comgt_epi64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comgt_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_GT) - return _mm_com_epu8(a, b, _MM_PCOMCTRL_GT); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comgt_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vcgtq_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u8), a_.u8 > b_.u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] > b_.u8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comgt_epu8(a, b) simde_mm_comgt_epu8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comgt_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_GT) - return _mm_com_epu16(a, b, _MM_PCOMCTRL_GT); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comgt_epu16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vcgtq_u16(a_.neon_u16, b_.neon_u16); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), a_.u16 > b_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] > b_.u16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comgt_epu16(a, b) simde_mm_comgt_epu16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comgt_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_GT) - return _mm_com_epu32(a, b, _MM_PCOMCTRL_GT); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comgt_epu32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcgtq_u32(a_.neon_u32, b_.neon_u32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), a_.u32 > b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] > b_.u32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comgt_epu32(a, b) simde_mm_comgt_epu32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comgt_epu64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_GT) - return _mm_com_epu64(a, b, _MM_PCOMCTRL_GT); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comgt_epu64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcgtq_u64(a_.neon_u64, b_.neon_u64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u64 > b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] > b_.u64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comgt_epu64(a, b) simde_mm_comgt_epu64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comle_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_LE) - return _mm_com_epi8(a, b, _MM_PCOMCTRL_LE); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comle_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vcleq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), a_.i8 <= b_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] <= b_.i8[i]) ? 
~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comle_epi8(a, b) simde_mm_comle_epi8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comle_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_LE) - return _mm_com_epi16(a, b, _MM_PCOMCTRL_LE); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comle_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), a_.i16 <= b_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] <= b_.i16[i]) ? ~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comle_epi16(a, b) simde_mm_comle_epi16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comle_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_LE) - return _mm_com_epi32(a, b, _MM_PCOMCTRL_LE); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comle_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcleq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 <= b_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] <= b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comle_epi32(a, b) simde_mm_comle_epi32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comle_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_LE) - return _mm_com_epi64(a, b, _MM_PCOMCTRL_LE); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comle_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcleq_s64(a_.neon_i64, b_.neon_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 <= b_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] <= b_.i64[i]) ? 
~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comle_epi64(a, b) simde_mm_comle_epi64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comle_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_LE) - return _mm_com_epu8(a, b, _MM_PCOMCTRL_LE); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comle_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vcleq_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u8), a_.u8 <= b_.u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] <= b_.u8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comle_epu8(a, b) simde_mm_comle_epu8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comle_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_LE) - return _mm_com_epu16(a, b, _MM_PCOMCTRL_LE); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comle_epu16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vcleq_u16(a_.neon_u16, b_.neon_u16); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), a_.u16 <= b_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] <= b_.u16[i]) ? ~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comle_epu16(a, b) simde_mm_comle_epu16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comle_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_LE) - return _mm_com_epu32(a, b, _MM_PCOMCTRL_LE); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comle_epu32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcleq_u32(a_.neon_u32, b_.neon_u32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), a_.u32 <= b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] <= b_.u32[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comle_epu32(a, b) simde_mm_comle_epu32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comle_epu64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_LE) - return _mm_com_epu64(a, b, _MM_PCOMCTRL_LE); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comle_epu64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcleq_u64(a_.neon_u64, b_.neon_u64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u64 <= b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] <= b_.u64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comle_epu64(a, b) simde_mm_comle_epu64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comlt_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_LT) - return _mm_com_epi8(a, b, _MM_PCOMCTRL_LT); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comlt_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vcltq_s8(a_.neon_i8, b_.neon_i8); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), a_.i8 < b_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] < b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comlt_epi8(a, b) simde_mm_comlt_epi8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comlt_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_LT) - return _mm_com_epi16(a, b, _MM_PCOMCTRL_LT); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comlt_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vcltq_s16(a_.neon_i16, b_.neon_i16); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), a_.i16 < b_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] < b_.i16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comlt_epi16(a, b) simde_mm_comlt_epi16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comlt_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_LT) - return _mm_com_epi32(a, b, _MM_PCOMCTRL_LT); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comlt_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcltq_s32(a_.neon_i32, b_.neon_i32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 < b_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] < b_.i32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comlt_epi32(a, b) simde_mm_comlt_epi32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comlt_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_LT) - return _mm_com_epi64(a, b, _MM_PCOMCTRL_LT); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comlt_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcltq_s64(a_.neon_i64, b_.neon_i64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 < b_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] < b_.i64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comlt_epi64(a, b) simde_mm_comlt_epi64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comlt_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_LT) - return _mm_com_epu8(a, b, _MM_PCOMCTRL_LT); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comlt_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vcltq_u8(a_.neon_u8, b_.neon_u8); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u8), a_.u8 < b_.u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] < b_.u8[i]) ? 
~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comlt_epu8(a, b) simde_mm_comlt_epu8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comlt_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_LT) - return _mm_com_epu16(a, b, _MM_PCOMCTRL_LT); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comlt_epu16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vcltq_u16(a_.neon_u16, b_.neon_u16); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), a_.u16 < b_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] < b_.u16[i]) ? ~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comlt_epu16(a, b) simde_mm_comlt_epu16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comlt_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_LT) - return _mm_com_epu32(a, b, _MM_PCOMCTRL_LT); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comlt_epu32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vcltq_u32(a_.neon_u32, b_.neon_u32); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), a_.u32 < b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] < b_.u32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comlt_epu32(a, b) simde_mm_comlt_epu32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comlt_epu64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_LT) - return _mm_com_epu64(a, b, _MM_PCOMCTRL_LT); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comlt_epu64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u64 = vcltq_u64(a_.neon_u64, b_.neon_u64); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u64 < b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] < b_.u64[i]) ? 
~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comlt_epu64(a, b) simde_mm_comlt_epu64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comneq_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_NEQ) - return _mm_com_epi8(a, b, _MM_PCOMCTRL_NEQ); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comneq_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vmvnq_u8(vceqq_s8(a_.neon_i8, b_.neon_i8)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i8), a_.i8 != b_.i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - r_.i8[i] = (a_.i8[i] != b_.i8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comneq_epi8(a, b) simde_mm_comneq_epi8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comneq_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_NEQ) - return _mm_com_epi16(a, b, _MM_PCOMCTRL_NEQ); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comneq_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vmvnq_u16(vceqq_s16(a_.neon_i16, b_.neon_i16)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i16), a_.i16 != b_.i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] != b_.i16[i]) ? ~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comneq_epi16(a, b) simde_mm_comneq_epi16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comneq_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_NEQ) - return _mm_com_epi32(a, b, _MM_PCOMCTRL_NEQ); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comneq_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vmvnq_u32(vceqq_s32(a_.neon_i32, b_.neon_i32)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i32), a_.i32 != b_.i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] != b_.i32[i]) ? 
~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comneq_epi32(a, b) simde_mm_comneq_epi32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comneq_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_NEQ) - return _mm_com_epi64(a, b, _MM_PCOMCTRL_NEQ); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comneq_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u32 = vmvnq_u32(vreinterpretq_u32_u64(vceqq_s64(a_.neon_i64, b_.neon_i64))); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.i64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.i64), a_.i64 != b_.i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (a_.i64[i] != b_.i64[i]) ? ~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comneq_epi64(a, b) simde_mm_comneq_epi64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comneq_epu8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_NEQ) - return _mm_com_epu8(a, b, _MM_PCOMCTRL_NEQ); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comneq_epu8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vmvnq_u8(vceqq_u8(a_.neon_u8, b_.neon_u8)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u8 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u8), a_.u8 != b_.u8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (a_.u8[i] != b_.u8[i]) ? ~INT8_C(0) : INT8_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comneq_epu8(a, b) simde_mm_comneq_epu8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comneq_epu16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_NEQ) - return _mm_com_epu16(a, b, _MM_PCOMCTRL_NEQ); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comneq_epu16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vmvnq_u16(vceqq_u16(a_.neon_u16, b_.neon_u16)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u16 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u16), a_.u16 != b_.u16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (a_.u16[i] != b_.u16[i]) ? 
~INT16_C(0) : INT16_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comneq_epu16(a, b) simde_mm_comneq_epu16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comneq_epu32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_NEQ) - return _mm_com_epu32(a, b, _MM_PCOMCTRL_NEQ); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comneq_epu32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vmvnq_u32(vceqq_u32(a_.neon_u32, b_.neon_u32)); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u32 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u32), a_.u32 != b_.u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (a_.u32[i] != b_.u32[i]) ? ~INT32_C(0) : INT32_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comneq_epu32(a, b) simde_mm_comneq_epu32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comneq_epu64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_NEQ) - return _mm_com_epu64(a, b, _MM_PCOMCTRL_NEQ); - #elif defined(SIMDE_X86_XOP_NATIVE) - return _mm_comneq_epu64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - r_.neon_u32 = vmvnq_u32(vreinterpretq_u32_u64(vceqq_u64(a_.neon_u64, b_.neon_u64))); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_OPS) - r_.u64 = HEDLEY_REINTERPRET_CAST(__typeof__(r_.u64), a_.u64 != b_.u64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (a_.u64[i] != b_.u64[i]) ? 
~INT64_C(0) : INT64_C(0); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comneq_epu64(a, b) simde_mm_comneq_epu64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comfalse_epi8 (simde__m128i a, simde__m128i b) { - (void) a; - (void) b; - return simde_mm_setzero_si128(); -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comfalse_epi8(a, b) simde_mm_comfalse_epi8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comfalse_epi16 (simde__m128i a, simde__m128i b) { - (void) a; - (void) b; - return simde_mm_setzero_si128(); -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comfalse_epi16(a, b) simde_mm_comfalse_epi16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comfalse_epi32 (simde__m128i a, simde__m128i b) { - (void) a; - (void) b; - return simde_mm_setzero_si128(); -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comfalse_epi32(a, b) simde_mm_comfalse_epi32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comfalse_epi64 (simde__m128i a, simde__m128i b) { - (void) a; - (void) b; - return simde_mm_setzero_si128(); -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comfalse_epi64(a, b) simde_mm_comfalse_epi64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comfalse_epu8 (simde__m128i a, simde__m128i b) { - (void) a; - (void) b; - return simde_mm_setzero_si128(); -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comfalse_epu8(a, b) simde_mm_comfalse_epu8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comfalse_epu16 (simde__m128i a, simde__m128i b) { - (void) a; - (void) b; - return simde_mm_setzero_si128(); -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comfalse_epu16(a, b) simde_mm_comfalse_epu16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comfalse_epu32 (simde__m128i a, simde__m128i b) { - (void) a; - (void) b; - return simde_mm_setzero_si128(); -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comfalse_epu32(a, b) simde_mm_comfalse_epu32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comfalse_epu64 (simde__m128i a, simde__m128i b) { - (void) a; - (void) b; - return simde_mm_setzero_si128(); -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comfalse_epu64(a, b) simde_mm_comfalse_epu64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comtrue_epi8 (simde__m128i a, simde__m128i b) { - (void) a; - (void) b; - return simde_x_mm_setone_si128(); -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comtrue_epi8(a, b) simde_mm_comtrue_epi8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comtrue_epi16 (simde__m128i a, simde__m128i b) { - (void) a; - (void) b; - return simde_x_mm_setone_si128(); -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comtrue_epi16(a, b) simde_mm_comtrue_epi16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comtrue_epi32 (simde__m128i a, simde__m128i b) { - (void) a; - (void) b; - return simde_x_mm_setone_si128(); -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comtrue_epi32(a, b) simde_mm_comtrue_epi32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comtrue_epi64 (simde__m128i a, simde__m128i b) { - (void) a; - (void) b; - return 
simde_x_mm_setone_si128(); -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comtrue_epi64(a, b) simde_mm_comtrue_epi64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comtrue_epu8 (simde__m128i a, simde__m128i b) { - (void) a; - (void) b; - return simde_x_mm_setone_si128(); -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comtrue_epu8(a, b) simde_mm_comtrue_epu8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comtrue_epu16 (simde__m128i a, simde__m128i b) { - (void) a; - (void) b; - return simde_x_mm_setone_si128(); -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comtrue_epu16(a, b) simde_mm_comtrue_epu16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comtrue_epu32 (simde__m128i a, simde__m128i b) { - (void) a; - (void) b; - return simde_x_mm_setone_si128(); -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comtrue_epu32(a, b) simde_mm_comtrue_epu32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_comtrue_epu64 (simde__m128i a, simde__m128i b) { - (void) a; - (void) b; - return simde_x_mm_setone_si128(); -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_comtrue_epu64(a, b) simde_mm_comtrue_epu64((a), (b)) -#endif - -#if defined(SIMDE_X86_XOP_NATIVE) && defined(_MM_PCOMCTRL_LT) - #define SIMDE_X86_XOP_HAVE_COM_ 1 - #define SIMDE_MM_PCOMCTRL_LT _MM_PCOMCTRL_LT - #define SIMDE_MM_PCOMCTRL_LE _MM_PCOMCTRL_LE - #define SIMDE_MM_PCOMCTRL_GT _MM_PCOMCTRL_GT - #define SIMDE_MM_PCOMCTRL_GE _MM_PCOMCTRL_GE - #define SIMDE_MM_PCOMCTRL_EQ _MM_PCOMCTRL_EQ - #define SIMDE_MM_PCOMCTRL_NEQ _MM_PCOMCTRL_NEQ - #define SIMDE_MM_PCOMCTRL_FALSE _MM_PCOMCTRL_FALSE - #define SIMDE_MM_PCOMCTRL_TRUE _MM_PCOMCTRL_TRUE -#else - #define SIMDE_MM_PCOMCTRL_LT 0 - #define SIMDE_MM_PCOMCTRL_LE 1 - #define SIMDE_MM_PCOMCTRL_GT 2 - #define SIMDE_MM_PCOMCTRL_GE 3 - #define SIMDE_MM_PCOMCTRL_EQ 4 - #define SIMDE_MM_PCOMCTRL_NEQ 5 - #define SIMDE_MM_PCOMCTRL_FALSE 6 - #define SIMDE_MM_PCOMCTRL_TRUE 7 - - #if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _MM_PCOMCTRL_LT SIMDE_MM_PCOMCTRL_LT - #define _MM_PCOMCTRL_LE SIMDE_MM_PCOMCTRL_LE - #define _MM_PCOMCTRL_GT SIMDE_MM_PCOMCTRL_GT - #define _MM_PCOMCTRL_GE SIMDE_MM_PCOMCTRL_GE - #define _MM_PCOMCTRL_EQ SIMDE_MM_PCOMCTRL_EQ - #define _MM_PCOMCTRL_NEQ SIMDE_MM_PCOMCTRL_NEQ - #define _MM_PCOMCTRL_FALSE SIMDE_MM_PCOMCTRL_FALSE - #define _MM_PCOMCTRL_TRUE SIMDE_MM_PCOMCTRL_TRUE - #endif -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_com_epi8 (simde__m128i a, simde__m128i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - switch (imm8) { - case SIMDE_MM_PCOMCTRL_LT: - return simde_mm_comlt_epi8(a, b); - case SIMDE_MM_PCOMCTRL_LE: - return simde_mm_comle_epi8(a, b); - case SIMDE_MM_PCOMCTRL_GT: - return simde_mm_comgt_epi8(a, b); - case SIMDE_MM_PCOMCTRL_GE: - return simde_mm_comge_epi8(a, b); - case SIMDE_MM_PCOMCTRL_EQ: - return simde_mm_comeq_epi8(a, b); - case SIMDE_MM_PCOMCTRL_NEQ: - return simde_mm_comneq_epi8(a, b); - case SIMDE_MM_PCOMCTRL_FALSE: - return simde_mm_comfalse_epi8(a, b); - case SIMDE_MM_PCOMCTRL_TRUE: - return simde_mm_comtrue_epi8(a, b); - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_setzero_si128()); - } -} -#if defined(SIMDE_X86_XOP_NATIVE) && defined(SIMDE_X86_XOP_HAVE_COM_) - #define simde_mm_com_epi8(a, b, imm8) _mm_com_epi8((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_com_epi8(a, b, imm8) 
simde_mm_com_epi8((a), (b), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_com_epi16 (simde__m128i a, simde__m128i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - switch (imm8) { - case SIMDE_MM_PCOMCTRL_LT: - return simde_mm_comlt_epi16(a, b); - case SIMDE_MM_PCOMCTRL_LE: - return simde_mm_comle_epi16(a, b); - case SIMDE_MM_PCOMCTRL_GT: - return simde_mm_comgt_epi16(a, b); - case SIMDE_MM_PCOMCTRL_GE: - return simde_mm_comge_epi16(a, b); - case SIMDE_MM_PCOMCTRL_EQ: - return simde_mm_comeq_epi16(a, b); - case SIMDE_MM_PCOMCTRL_NEQ: - return simde_mm_comneq_epi16(a, b); - case SIMDE_MM_PCOMCTRL_FALSE: - return simde_mm_comfalse_epi16(a, b); - case SIMDE_MM_PCOMCTRL_TRUE: - return simde_mm_comtrue_epi16(a, b); - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_setzero_si128()); - } -} -#if defined(SIMDE_X86_XOP_NATIVE) && defined(SIMDE_X86_XOP_HAVE_COM_) - #define simde_mm_com_epi16(a, b, imm8) _mm_com_epi16((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_com_epi16(a, b, imm8) simde_mm_com_epi16((a), (b), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_com_epi32 (simde__m128i a, simde__m128i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - switch (imm8) { - case SIMDE_MM_PCOMCTRL_LT: - return simde_mm_comlt_epi32(a, b); - case SIMDE_MM_PCOMCTRL_LE: - return simde_mm_comle_epi32(a, b); - case SIMDE_MM_PCOMCTRL_GT: - return simde_mm_comgt_epi32(a, b); - case SIMDE_MM_PCOMCTRL_GE: - return simde_mm_comge_epi32(a, b); - case SIMDE_MM_PCOMCTRL_EQ: - return simde_mm_comeq_epi32(a, b); - case SIMDE_MM_PCOMCTRL_NEQ: - return simde_mm_comneq_epi32(a, b); - case SIMDE_MM_PCOMCTRL_FALSE: - return simde_mm_comfalse_epi32(a, b); - case SIMDE_MM_PCOMCTRL_TRUE: - return simde_mm_comtrue_epi32(a, b); - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_setzero_si128()); - } -} -#if defined(SIMDE_X86_XOP_NATIVE) && defined(SIMDE_X86_XOP_HAVE_COM_) - #define simde_mm_com_epi32(a, b, imm8) _mm_com_epi32((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_com_epi32(a, b, imm8) simde_mm_com_epi32((a), (b), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_com_epi64 (simde__m128i a, simde__m128i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - switch (imm8) { - case SIMDE_MM_PCOMCTRL_LT: - return simde_mm_comlt_epi64(a, b); - case SIMDE_MM_PCOMCTRL_LE: - return simde_mm_comle_epi64(a, b); - case SIMDE_MM_PCOMCTRL_GT: - return simde_mm_comgt_epi64(a, b); - case SIMDE_MM_PCOMCTRL_GE: - return simde_mm_comge_epi64(a, b); - case SIMDE_MM_PCOMCTRL_EQ: - return simde_mm_comeq_epi64(a, b); - case SIMDE_MM_PCOMCTRL_NEQ: - return simde_mm_comneq_epi64(a, b); - case SIMDE_MM_PCOMCTRL_FALSE: - return simde_mm_comfalse_epi64(a, b); - case SIMDE_MM_PCOMCTRL_TRUE: - return simde_mm_comtrue_epi64(a, b); - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_setzero_si128()); - } -} -#if defined(SIMDE_X86_XOP_NATIVE) && defined(SIMDE_X86_XOP_HAVE_COM_) - #define simde_mm_com_epi64(a, b, imm8) _mm_com_epi64((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_com_epi64(a, b, imm8) simde_mm_com_epi64((a), (b), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_com_epu8 (simde__m128i a, simde__m128i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - switch (imm8) { - case SIMDE_MM_PCOMCTRL_LT: - return simde_mm_comlt_epu8(a, b); - case SIMDE_MM_PCOMCTRL_LE: - return 
simde_mm_comle_epu8(a, b); - case SIMDE_MM_PCOMCTRL_GT: - return simde_mm_comgt_epu8(a, b); - case SIMDE_MM_PCOMCTRL_GE: - return simde_mm_comge_epu8(a, b); - case SIMDE_MM_PCOMCTRL_EQ: - return simde_mm_comeq_epu8(a, b); - case SIMDE_MM_PCOMCTRL_NEQ: - return simde_mm_comneq_epu8(a, b); - case SIMDE_MM_PCOMCTRL_FALSE: - return simde_mm_comfalse_epu8(a, b); - case SIMDE_MM_PCOMCTRL_TRUE: - return simde_mm_comtrue_epu8(a, b); - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_setzero_si128()); - } -} -#if defined(SIMDE_X86_XOP_NATIVE) && defined(SIMDE_X86_XOP_HAVE_COM_) - #define simde_mm_com_epu8(a, b, imm8) _mm_com_epu8((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_com_epu8(a, b, imm8) simde_mm_com_epu8((a), (b), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_com_epu16 (simde__m128i a, simde__m128i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - switch (imm8) { - case SIMDE_MM_PCOMCTRL_LT: - return simde_mm_comlt_epu16(a, b); - case SIMDE_MM_PCOMCTRL_LE: - return simde_mm_comle_epu16(a, b); - case SIMDE_MM_PCOMCTRL_GT: - return simde_mm_comgt_epu16(a, b); - case SIMDE_MM_PCOMCTRL_GE: - return simde_mm_comge_epu16(a, b); - case SIMDE_MM_PCOMCTRL_EQ: - return simde_mm_comeq_epu16(a, b); - case SIMDE_MM_PCOMCTRL_NEQ: - return simde_mm_comneq_epu16(a, b); - case SIMDE_MM_PCOMCTRL_FALSE: - return simde_mm_comfalse_epu16(a, b); - case SIMDE_MM_PCOMCTRL_TRUE: - return simde_mm_comtrue_epu16(a, b); - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_setzero_si128()); - } -} -#if defined(SIMDE_X86_XOP_NATIVE) && defined(SIMDE_X86_XOP_HAVE_COM_) - #define simde_mm_com_epu16(a, b, imm8) _mm_com_epu16((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_com_epu16(a, b, imm8) simde_mm_com_epu16((a), (b), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_com_epu32 (simde__m128i a, simde__m128i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - switch (imm8) { - case SIMDE_MM_PCOMCTRL_LT: - return simde_mm_comlt_epu32(a, b); - case SIMDE_MM_PCOMCTRL_LE: - return simde_mm_comle_epu32(a, b); - case SIMDE_MM_PCOMCTRL_GT: - return simde_mm_comgt_epu32(a, b); - case SIMDE_MM_PCOMCTRL_GE: - return simde_mm_comge_epu32(a, b); - case SIMDE_MM_PCOMCTRL_EQ: - return simde_mm_comeq_epu32(a, b); - case SIMDE_MM_PCOMCTRL_NEQ: - return simde_mm_comneq_epu32(a, b); - case SIMDE_MM_PCOMCTRL_FALSE: - return simde_mm_comfalse_epu32(a, b); - case SIMDE_MM_PCOMCTRL_TRUE: - return simde_mm_comtrue_epu32(a, b); - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_setzero_si128()); - } -} -#if defined(SIMDE_X86_XOP_NATIVE) && defined(SIMDE_X86_XOP_HAVE_COM_) - #define simde_mm_com_epu32(a, b, imm8) _mm_com_epu32((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_com_epu32(a, b, imm8) simde_mm_com_epu32((a), (b), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_com_epu64 (simde__m128i a, simde__m128i b, const int imm8) - SIMDE_REQUIRE_CONSTANT_RANGE(imm8, 0, 7) { - switch (imm8) { - case SIMDE_MM_PCOMCTRL_LT: - return simde_mm_comlt_epu64(a, b); - case SIMDE_MM_PCOMCTRL_LE: - return simde_mm_comle_epu64(a, b); - case SIMDE_MM_PCOMCTRL_GT: - return simde_mm_comgt_epu64(a, b); - case SIMDE_MM_PCOMCTRL_GE: - return simde_mm_comge_epu64(a, b); - case SIMDE_MM_PCOMCTRL_EQ: - return simde_mm_comeq_epu64(a, b); - case SIMDE_MM_PCOMCTRL_NEQ: - return simde_mm_comneq_epu64(a, b); - case SIMDE_MM_PCOMCTRL_FALSE: - return 
simde_mm_comfalse_epu64(a, b); - case SIMDE_MM_PCOMCTRL_TRUE: - return simde_mm_comtrue_epu64(a, b); - default: - HEDLEY_UNREACHABLE_RETURN(simde_mm_setzero_si128()); - } -} -#if defined(SIMDE_X86_XOP_NATIVE) && defined(SIMDE_X86_XOP_HAVE_COM_) - #define simde_mm_com_epu64(a, b, imm8) _mm_com_epu64((a), (b), (imm8)) -#endif -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_com_epu64(a, b, imm8) simde_mm_com_epu64((a), (b), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_frcz_ps (simde__m128 a) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_frcz_ps(a); - #else - simde__m128_private - r_, - a_ = simde__m128_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - #if defined(simde_math_modff) - simde_float32 integral; - r_.f32[i] = simde_math_modff(a_.f32[i], &integral); - #else - r_.f32[i] = (a_.f32[i] / 1.0f); - #endif - } - - return simde__m128_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_frcz_ps(a) simde_mm_frcz_ps((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_frcz_pd (simde__m128d a) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_frcz_pd(a); - #else - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - #if defined(simde_math_modf) - simde_float64 integral; - r_.f64[i] = simde_math_modf(a_.f64[i], &integral); - #else - r_.f64[i] = (a_.f64[i] / 1.0f); - #endif - } - - return simde__m128d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_frcz_ps(a) simde_mm_frcz_ps((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_frcz_ss (simde__m128 a, simde__m128 b) { - #if defined(SIMDE_X86_XOP_NATIVE) && !defined(SIMDE_BUG_CLANG_48673) - return _mm_frcz_ss(a, b); - #else - simde__m128_private - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - - #if defined(simde_math_modff) - simde_float32 integral; - a_.f32[0] = simde_math_modff(b_.f32[0], &integral); - #else - a_.f32[0] = (b_.f32[0] / 1.0f); - #endif - - return simde__m128_from_private(a_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_frcz_ss(a, b) simde_mm_frcz_ss((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_frcz_sd (simde__m128d a, simde__m128d b) { - #if defined(SIMDE_X86_XOP_NATIVE) && !defined(SIMDE_BUG_CLANG_48673) - return _mm_frcz_sd(a, b); - #else - simde__m128d_private - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - - #if defined(simde_math_modf) - simde_float64 integral; - a_.f64[0] = simde_math_modf(b_.f64[0], &integral); - #else - a_.f64[0] = (b_.f64[0] / 1.0f); - #endif - - return simde__m128d_from_private(a_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_frcz_sd(a, b) simde_mm_frcz_sd((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_frcz_ps (simde__m256 a) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm256_frcz_ps(a); - #else - simde__m256_private - r_, - a_ = simde__m256_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_frcz_ps(a_.m128[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - #if defined(simde_math_modff) - simde_float32 integral; - r_.f32[i] = simde_math_modff(a_.f32[i], &integral); - 
#else - r_.f32[i] = (a_.f32[i] / 1.0f); - #endif - } - #endif - - return simde__m256_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm256_frcz_ps(a) simde_mm256_frcz_ps((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_frcz_pd (simde__m256d a) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm256_frcz_pd(a); - #else - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_frcz_pd(a_.m128d[i]); - } - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - #if defined(simde_math_modf) - simde_float64 integral; - r_.f64[i] = simde_math_modf(a_.f64[i], &integral); - #else - r_.f64[i] = (a_.f64[i] / 1.0f); - #endif - } - #endif - - return simde__m256d_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm256_frcz_ps(a) simde_mm256_frcz_ps((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_haddw_epi8 (simde__m128i a) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_haddw_epi8(a); - #elif defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_maddubs_epi16(_mm_set1_epi8(INT8_C(1)), a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vpaddlq_s8(a_.neon_i8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i16x8_extadd_pairwise_i8x16(a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(signed char) one = vec_splat_s8(1); - r_.altivec_i16 = - vec_add( - vec_mule(a_.altivec_i8, one), - vec_mulo(a_.altivec_i8, one) - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i16 = - ((a_.i16 << 8) >> 8) + - ((a_.i16 >> 8) ); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i8[(i * 2)]) + HEDLEY_STATIC_CAST(int16_t, a_.i8[(i * 2) + 1]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_haddw_epi8(a) simde_mm_haddw_epi8((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_haddw_epu8 (simde__m128i a) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_haddw_epu8(a); - #elif defined(SIMDE_X86_SSSE3_NATIVE) - return _mm_maddubs_epi16(a, _mm_set1_epi8(INT8_C(1))); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vpaddlq_u8(a_.neon_u8); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u16x8_extadd_pairwise_u8x16(a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned char) one = vec_splat_u8(1); - r_.altivec_u16 = - vec_add( - vec_mule(a_.altivec_u8, one), - vec_mulo(a_.altivec_u8, one) - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u16 = - ((a_.u16 << 8) >> 8) + - ((a_.u16 >> 8) ); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, a_.u8[(i * 2)]) + HEDLEY_STATIC_CAST(uint16_t, a_.u8[(i * 2) + 1]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_haddw_epu8(a) simde_mm_haddw_epu8((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_haddd_epi8 (simde__m128i a) { - 
#if defined(SIMDE_X86_XOP_NATIVE) - return _mm_haddd_epi8(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vpaddlq_s16(vpaddlq_s8(a_.neon_i8)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = - HEDLEY_STATIC_CAST(int32_t, a_.i8[(i * 4) ]) + HEDLEY_STATIC_CAST(int32_t, a_.i8[(i * 4) + 1]) + - HEDLEY_STATIC_CAST(int32_t, a_.i8[(i * 4) + 2]) + HEDLEY_STATIC_CAST(int32_t, a_.i8[(i * 4) + 3]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_haddd_epi8(a) simde_mm_haddd_epi8((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_haddd_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_haddd_epi16(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return _mm_madd_epi16(a, _mm_set1_epi16(INT8_C(1))); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vpaddlq_s16(a_.neon_i16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_i32x4_extadd_pairwise_i16x8(a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(signed short) one = vec_splat_s16(1); - r_.altivec_i32 = - vec_add( - vec_mule(a_.altivec_i16, one), - vec_mulo(a_.altivec_i16, one) - ); - #elif defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.i32 = - ((a_.i32 << 16) >> 16) + - ((a_.i32 >> 16) ); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.i16[(i * 2)]) + HEDLEY_STATIC_CAST(int32_t, a_.i16[(i * 2) + 1]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_haddd_epi8(a) simde_mm_haddd_epi8((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_haddd_epu8 (simde__m128i a) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_haddd_epu8(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vpaddlq_u16(vpaddlq_u8(a_.neon_u8)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = - HEDLEY_STATIC_CAST(uint32_t, a_.u8[(i * 4) ]) + HEDLEY_STATIC_CAST(uint32_t, a_.u8[(i * 4) + 1]) + - HEDLEY_STATIC_CAST(uint32_t, a_.u8[(i * 4) + 2]) + HEDLEY_STATIC_CAST(uint32_t, a_.u8[(i * 4) + 3]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_haddd_epu8(a) simde_mm_haddd_epu8((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_haddd_epu16 (simde__m128i a) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_haddd_epu16(a); - #elif defined(SIMDE_X86_SSE2_NATIVE) - return - _mm_add_epi32( - _mm_srli_epi32(a, 16), - _mm_and_si128(a, _mm_set1_epi32(INT32_C(0x0000ffff))) - ); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vpaddlq_u16(a_.neon_u16); - #elif defined(SIMDE_WASM_SIMD128_NATIVE) - r_.wasm_v128 = wasm_u32x4_extadd_pairwise_u16x8(a_.wasm_v128); - #elif defined(SIMDE_POWER_ALTIVEC_P6_NATIVE) - SIMDE_POWER_ALTIVEC_VECTOR(unsigned short) one = vec_splat_u16(1); - r_.altivec_u32 = - vec_add( - vec_mule(a_.altivec_u16, one), - vec_mulo(a_.altivec_u16, one) - ); - #elif 
defined(SIMDE_VECTOR_SUBSCRIPT_SCALAR) - r_.u32 = - ((a_.u32 << 16) >> 16) + - ((a_.u32 >> 16) ); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, a_.u16[(i * 2)]) + HEDLEY_STATIC_CAST(uint32_t, a_.u16[(i * 2) + 1]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_haddd_epu8(a) simde_mm_haddd_epu8((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_haddq_epi8 (simde__m128i a) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_haddq_epi8(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vpaddlq_s32(vpaddlq_s16(vpaddlq_s8(a_.neon_i8))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = - HEDLEY_STATIC_CAST(int64_t, a_.i8[(i * 8) ]) + HEDLEY_STATIC_CAST(int64_t, a_.i8[(i * 8) + 1]) + - HEDLEY_STATIC_CAST(int64_t, a_.i8[(i * 8) + 2]) + HEDLEY_STATIC_CAST(int64_t, a_.i8[(i * 8) + 3]) + - HEDLEY_STATIC_CAST(int64_t, a_.i8[(i * 8) + 4]) + HEDLEY_STATIC_CAST(int64_t, a_.i8[(i * 8) + 5]) + - HEDLEY_STATIC_CAST(int64_t, a_.i8[(i * 8) + 6]) + HEDLEY_STATIC_CAST(int64_t, a_.i8[(i * 8) + 7]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_haddq_epi8(a) simde_mm_haddq_epi8((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_haddq_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_haddq_epi16(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vpaddlq_s32(vpaddlq_s16(a_.neon_i16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = - HEDLEY_STATIC_CAST(int64_t, a_.i16[(i * 4) ]) + HEDLEY_STATIC_CAST(int64_t, a_.i16[(i * 4) + 1]) + - HEDLEY_STATIC_CAST(int64_t, a_.i16[(i * 4) + 2]) + HEDLEY_STATIC_CAST(int64_t, a_.i16[(i * 4) + 3]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_haddq_epi16(a) simde_mm_haddq_epi16((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_haddq_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_haddq_epi32(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vpaddlq_s32(a_.neon_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = HEDLEY_STATIC_CAST(int64_t, a_.i32[(i * 2) ]) + HEDLEY_STATIC_CAST(int64_t, a_.i32[(i * 2) + 1]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_haddq_epi32(a) simde_mm_haddq_epi32((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_haddq_epu8 (simde__m128i a) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_haddq_epu8(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u64 = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(a_.neon_u8))); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.u64[i] = - HEDLEY_STATIC_CAST(uint64_t, a_.u8[(i * 8) ]) + HEDLEY_STATIC_CAST(uint64_t, 
a_.u8[(i * 8) + 1]) + - HEDLEY_STATIC_CAST(uint64_t, a_.u8[(i * 8) + 2]) + HEDLEY_STATIC_CAST(uint64_t, a_.u8[(i * 8) + 3]) + - HEDLEY_STATIC_CAST(uint64_t, a_.u8[(i * 8) + 4]) + HEDLEY_STATIC_CAST(uint64_t, a_.u8[(i * 8) + 5]) + - HEDLEY_STATIC_CAST(uint64_t, a_.u8[(i * 8) + 6]) + HEDLEY_STATIC_CAST(uint64_t, a_.u8[(i * 8) + 7]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_haddq_epu8(a) simde_mm_haddq_epu8((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_haddq_epu16 (simde__m128i a) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_haddq_epu16(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u64 = vpaddlq_u32(vpaddlq_u16(a_.neon_u16)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.u64[i] = - HEDLEY_STATIC_CAST(uint64_t, a_.u16[(i * 4) ]) + HEDLEY_STATIC_CAST(uint64_t, a_.u16[(i * 4) + 1]) + - HEDLEY_STATIC_CAST(uint64_t, a_.u16[(i * 4) + 2]) + HEDLEY_STATIC_CAST(uint64_t, a_.u16[(i * 4) + 3]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_haddq_epu16(a) simde_mm_haddq_epu16((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_haddq_epu32 (simde__m128i a) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_haddq_epu32(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u64 = vpaddlq_u32(a_.neon_u32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u32[(i * 2) ]) + HEDLEY_STATIC_CAST(uint64_t, a_.u32[(i * 2) + 1]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_haddq_epu32(a) simde_mm_haddq_epu32((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_hsubw_epi8 (simde__m128i a) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_hsubw_epi8(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i8[i * 2]) - HEDLEY_STATIC_CAST(int16_t, a_.i8[(i * 2) + 1]); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_hsubw_epi8(a) simde_mm_hsubw_epi8((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_hsubd_epi16 (simde__m128i a) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_hsubd_epi16(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = - HEDLEY_STATIC_CAST(int32_t, a_.i16[(i * 2) ]) - HEDLEY_STATIC_CAST(int32_t, a_.i16[(i * 2) + 1]); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_hsubd_epi8(a) simde_mm_hsubd_epi8((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_hsubq_epi32 (simde__m128i a) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_hsubq_epi32(a); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = 
HEDLEY_STATIC_CAST(int64_t, a_.i32[(i * 2) ]) - HEDLEY_STATIC_CAST(int64_t, a_.i32[(i * 2) + 1]); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_hsubq_epi32(a) simde_mm_hsubq_epi32((a)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_macc_epi16 (simde__m128i a, simde__m128i b, simde__m128i c) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_macc_epi16(a, b, c); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - c_ = simde__m128i_to_private(c); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vmlaq_s16(c_.neon_i16, a_.neon_i16, b_.neon_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - r_.i16[i] = (a_.i16[i] * b_.i16[i]) + c_.i16[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_macc_epi16(a, b, c) simde_mm_macc_epi16((a), (b), (c)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_macc_epi32 (simde__m128i a, simde__m128i b, simde__m128i c) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_macc_epi32(a, b, c); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - c_ = simde__m128i_to_private(c); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vmlaq_s32(c_.neon_i32, a_.neon_i32, b_.neon_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (a_.i32[i] * b_.i32[i]) + c_.i32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_macc_epi32(a, b, c) simde_mm_macc_epi32((a), (b), (c)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maccd_epi16 (simde__m128i a, simde__m128i b, simde__m128i c) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_maccd_epi16(a, b, c); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - c_ = simde__m128i_to_private(c); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - int16x8_t even = vuzp1q_s16(a_.neon_i16, b_.neon_i16); - int32x4_t a_even = vmovl_s16(vget_low_s16(even)); - int32x4_t b_even = vmovl_high_s16(even); - r_.neon_i32 = vmlaq_s32(c_.neon_i32, a_even, b_even); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = (HEDLEY_STATIC_CAST(int32_t, a_.i16[i * 2]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i * 2])) + c_.i32[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_maccd_epi16(a, b, c) simde_mm_maccd_epi16((a), (b), (c)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_macclo_epi32 (simde__m128i a, simde__m128i b, simde__m128i c) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_macclo_epi32(a, b, c); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - c_ = simde__m128i_to_private(c); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - int32x4_t even = vuzp1q_s32(a_.neon_i32, b_.neon_i32); - r_.neon_i64 = vaddq_s64(vmull_s32(vget_low_s32(even), vget_high_s32(even)), c_.neon_i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (HEDLEY_STATIC_CAST(int64_t, a_.i32[(i * 2) + 0]) * HEDLEY_STATIC_CAST(int64_t, b_.i32[(i * 
2) + 0])) + c_.i64[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_macclo_epi16(a, b, c) simde_mm_macclo_epi16((a), (b), (c)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_macchi_epi32 (simde__m128i a, simde__m128i b, simde__m128i c) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_macchi_epi32(a, b, c); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - c_ = simde__m128i_to_private(c); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - int32x4_t even = vuzp2q_s32(a_.neon_i32, b_.neon_i32); - r_.neon_i64 = vaddq_s64(vmull_s32(vget_low_s32(even), vget_high_s32(even)), c_.neon_i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - r_.i64[i] = (HEDLEY_STATIC_CAST(int64_t, a_.i32[(i * 2) + 1]) * HEDLEY_STATIC_CAST(int64_t, b_.i32[(i * 2) + 1])) + c_.i64[i]; - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_macchi_epi16(a, b, c) simde_mm_macchi_epi16((a), (b), (c)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maccs_epi16 (simde__m128i a, simde__m128i b, simde__m128i c) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_maccs_epi16(a, b, c); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - c_ = simde__m128i_to_private(c); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - int32x4_t c_lo = vmovl_s16(vget_low_s16(c_.neon_i16)); - int32x4_t c_hi = vmovl_high_s16(c_.neon_i16); - int32x4_t lo = vmlal_s16(c_lo, vget_low_s16(a_.neon_i16), vget_low_s16(b_.neon_i16)); - int32x4_t hi = vmlal_high_s16(c_hi, a_.neon_i16, b_.neon_i16); - r_.neon_i16 = vcombine_s16(vqmovn_s32(lo), vqmovn_s32(hi)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - int32_t tmp = HEDLEY_STATIC_CAST(int32_t, a_.i16[i]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i]); - tmp += c_.i16[i]; - if (tmp > INT16_MAX) - r_.i16[i] = INT16_MAX; - else if (tmp < INT16_MIN) - r_.i16[i] = INT16_MIN; - else - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, tmp); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_maccs_epi16(a, b, c) simde_mm_maccs_epi16((a), (b), (c)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maccs_epi32 (simde__m128i a, simde__m128i b, simde__m128i c) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_maccs_epi32(a, b, c); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - c_ = simde__m128i_to_private(c); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - int64x2_t c_lo = vmovl_s32(vget_low_s32(c_.neon_i32)); - int64x2_t c_hi = vmovl_high_s32(c_.neon_i32); - int64x2_t lo = vmlal_s32(c_lo, vget_low_s32(a_.neon_i32), vget_low_s32(b_.neon_i32)); - int64x2_t hi = vmlal_high_s32(c_hi, a_.neon_i32, b_.neon_i32); - r_.neon_i32 = vcombine_s32(vqmovn_s64(lo), vqmovn_s64(hi)); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - int64_t tmp = HEDLEY_STATIC_CAST(int64_t, a_.i32[i]) * HEDLEY_STATIC_CAST(int64_t, b_.i32[i]); - tmp += HEDLEY_STATIC_CAST(int64_t, c_.i32[i]); - if (tmp > INT32_MAX) - r_.i32[i] = INT32_MAX; - else if (tmp < INT32_MIN) - r_.i32[i] = INT32_MIN; - else - r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, tmp); - } - #endif - - return 
simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_maccs_epi32(a, b, c) simde_mm_maccs_epi32((a), (b), (c)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maccsd_epi16 (simde__m128i a, simde__m128i b, simde__m128i c) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_maccsd_epi16(a, b, c); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - c_ = simde__m128i_to_private(c); - - #if defined(SIMDE_ARM_NEON_A64V8_NATIVE) - int16x8_t even = vuzp1q_s16(a_.neon_i16, b_.neon_i16); - r_.neon_i32 = vqaddq_s32(vmull_s16(vget_low_s16(even), vget_high_s16(even)), c_.neon_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - int32_t prod = HEDLEY_STATIC_CAST(int32_t, a_.i16[i * 2]) * HEDLEY_STATIC_CAST(int32_t, b_.i16[i * 2]); - r_.i32[i] = simde_math_adds_i32(prod, c_.i32[i]); - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_maccsd_epi16(a, b, c) simde_mm_maccsd_epi16((a), (b), (c)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maccslo_epi32 (simde__m128i a, simde__m128i b, simde__m128i c) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_maccslo_epi32(a, b, c); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - c_ = simde__m128i_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - int64_t tmp = HEDLEY_STATIC_CAST(int64_t, a_.i32[(i * 2) + 0]) * HEDLEY_STATIC_CAST(int64_t, b_.i32[(i * 2) + 0]); - r_.i64[i] = simde_math_adds_i64(tmp, c_.i64[i]); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_maccslo_epi16(a, b, c) simde_mm_maccslo_epi16((a), (b), (c)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maccshi_epi32 (simde__m128i a, simde__m128i b, simde__m128i c) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_maccshi_epi32(a, b, c); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - c_ = simde__m128i_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - int64_t tmp = HEDLEY_STATIC_CAST(int64_t, a_.i32[(i * 2) + 1]) * HEDLEY_STATIC_CAST(int64_t, b_.i32[(i * 2) + 1]); - r_.i64[i] = simde_math_adds_i64(tmp, c_.i64[i]); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_maccshi_epi16(a, b, c) simde_mm_maccshi_epi16((a), (b), (c)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maddd_epi16 (simde__m128i a, simde__m128i b, simde__m128i c) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_maddd_epi16(a, b, c); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - c_ = simde__m128i_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - r_.i32[i] = - (a_.i16[(i * 2) + 0] * b_.i16[(i * 2) + 0]) + - (a_.i16[(i * 2) + 1] * b_.i16[(i * 2) + 1]); - r_.i32[i] += c_.i32[i]; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_maddd_epi16(a, b, c) simde_mm_maddd_epi16((a), (b), (c)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_maddsd_epi16 (simde__m128i a, 
simde__m128i b, simde__m128i c) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_maddsd_epi16(a, b, c); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - c_ = simde__m128i_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - /* The AMD64 Architecture Programmer's Manual says that "the" - * addition is saturated; I'm not sure whether that means - * the pairwise addition or the accumulate, or both. */ - r_.i32[i] = - (a_.i16[(i * 2) + 0] * b_.i16[(i * 2) + 0]) + - (a_.i16[(i * 2) + 1] * b_.i16[(i * 2) + 1]); - r_.i32[i] = simde_math_adds_i32(r_.i32[i], c_.i32[i]); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_maddsd_epi16(a, b, c) simde_mm_maddsd_epi16((a), (b), (c)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sha_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_sha_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i8 = vshlq_s8(a_.neon_i8, b_.neon_i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - if (b_.i8[i] < 0) { - r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, a_.i8[i] >> -b_.i8[i]); - } else { - r_.i8[i] = HEDLEY_STATIC_CAST(int8_t, a_.i8[i] << b_.i8[i]); - } - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_sha_epi8(a, b) simde_mm_sha_epi8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sha_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_sha_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i16 = vshlq_s16(a_.neon_i16, b_.neon_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i16) / sizeof(r_.i16[0])) ; i++) { - if (b_.i16[i] < 0) { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] >> -b_.i16[i]); - } else { - r_.i16[i] = HEDLEY_STATIC_CAST(int16_t, a_.i16[i] << b_.i16[i]); - } - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_sha_epi16(a, b) simde_mm_sha_epi16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sha_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_sha_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i32 = vshlq_s32(a_.neon_i32, b_.neon_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i32) / sizeof(r_.i32[0])) ; i++) { - if (b_.i32[i] < 0) { - r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.i32[i] >> -b_.i32[i]); - } else { - r_.i32[i] = HEDLEY_STATIC_CAST(int32_t, a_.i32[i] << b_.i32[i]); - } - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_sha_epi32(a, b) simde_mm_sha_epi32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_sha_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_sha_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = 
simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_i64 = vshlq_s64(a_.neon_i64, b_.neon_i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i64) / sizeof(r_.i64[0])) ; i++) { - if (b_.i64[i] < 0) { - r_.i64[i] = HEDLEY_STATIC_CAST(int64_t, a_.i64[i] >> -b_.i64[i]); - } else { - r_.i64[i] = HEDLEY_STATIC_CAST(int64_t, a_.i64[i] << b_.i64[i]); - } - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_sha_epi64(a, b) simde_mm_sha_epi64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_shl_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_shl_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u8 = vshlq_u8(a_.neon_u8, b_.neon_i8); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - if (HEDLEY_UNLIKELY(b_.i8[i] < -7 || b_.i8[i] > 7)) { - r_.u8[i] = 0; - } else { - if (b_.i8[i] < 0) { - r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, a_.u8[i] >> -b_.i8[i]); - } else { - r_.u8[i] = HEDLEY_STATIC_CAST(uint8_t, a_.u8[i] << b_.i8[i]); - } - } - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_shl_epi8(a, b) simde_mm_shl_epi8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_shl_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_shl_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u16 = vshlq_u16(a_.neon_u16, b_.neon_i16); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - if (HEDLEY_UNLIKELY(b_.i16[i] < -15 || b_.i16[i] > 15)) { - r_.u16[i] = 0; - } else { - if (b_.i16[i] < 0) { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i] >> -b_.i16[i]); - } else { - r_.u16[i] = HEDLEY_STATIC_CAST(uint16_t, a_.u16[i] << b_.i16[i]); - } - } - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_shl_epi16(a, b) simde_mm_shl_epi16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_shl_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_shl_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u32 = vshlq_u32(a_.neon_u32, b_.neon_i32); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - if (HEDLEY_UNLIKELY(b_.i32[i] < -31 || b_.i32[i] > 31)) { - r_.u32[i] = 0; - } else { - if (b_.i32[i] < 0) { - r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, a_.u32[i] >> -b_.i32[i]); - } else { - r_.u32[i] = HEDLEY_STATIC_CAST(uint32_t, a_.u32[i] << b_.i32[i]); - } - } - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_shl_epi32(a, b) simde_mm_shl_epi32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_shl_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_shl_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = 
simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - #if defined(SIMDE_ARM_NEON_A32V7_NATIVE) - r_.neon_u64 = vshlq_u64(a_.neon_u64, b_.neon_i64); - #else - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - if (HEDLEY_UNLIKELY(b_.i64[i] < -63 || b_.i64[i] > 63)) { - r_.u64[i] = 0; - } else { - if (b_.i64[i] < 0) { - r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u64[i] >> -b_.i64[i]); - } else { - r_.u64[i] = HEDLEY_STATIC_CAST(uint64_t, a_.u64[i] << b_.i64[i]); - } - } - } - #endif - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_shl_epi64(a, b) simde_mm_shl_epi64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_rot_epi8 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_rot_epi8(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (b_.i8[i] < 0) ? - HEDLEY_STATIC_CAST(uint8_t, ((a_.u8[i] >> -b_.i8[i]) | (a_.u8[i] << ( b_.i8[i] & 7)))) : - HEDLEY_STATIC_CAST(uint8_t, ((a_.u8[i] << b_.i8[i]) | (a_.u8[i] >> (-b_.i8[i] & 7)))); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_rot_epi8(a, b) simde_mm_rot_epi8((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_rot_epi16 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_rot_epi16(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (b_.i16[i] < 0) ? - HEDLEY_STATIC_CAST(uint16_t, ((a_.u16[i] >> -b_.i16[i]) | (a_.u16[i] << ( b_.i16[i] & 15)))) : - HEDLEY_STATIC_CAST(uint16_t, ((a_.u16[i] << b_.i16[i]) | (a_.u16[i] >> (-b_.i16[i] & 15)))); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_rot_epi16(a, b) simde_mm_rot_epi16((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_rot_epi32 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_rot_epi32(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (b_.i32[i] < 0) ? - HEDLEY_STATIC_CAST(uint32_t, ((a_.u32[i] >> -b_.i32[i]) | (a_.u32[i] << ( b_.i32[i] & 31)))) : - HEDLEY_STATIC_CAST(uint32_t, ((a_.u32[i] << b_.i32[i]) | (a_.u32[i] >> (-b_.i32[i] & 31)))); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_rot_epi32(a, b) simde_mm_rot_epi32((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_rot_epi64 (simde__m128i a, simde__m128i b) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_rot_epi64(a, b); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (b_.i64[i] < 0) ? 
- HEDLEY_STATIC_CAST(uint64_t, ((a_.u64[i] >> -b_.i64[i]) | (a_.u64[i] << ( b_.i64[i] & 63)))) : - HEDLEY_STATIC_CAST(uint64_t, ((a_.u64[i] << b_.i64[i]) | (a_.u64[i] >> (-b_.i64[i] & 63)))); - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_rot_epi64(a, b) simde_mm_rot_epi64((a), (b)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_roti_epi8 (simde__m128i a, const int count) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u8) / sizeof(r_.u8[0])) ; i++) { - r_.u8[i] = (count < 0) ? - HEDLEY_STATIC_CAST(uint8_t, ((a_.u8[i] >> -count) | (a_.u8[i] << ( count & 7)))) : - HEDLEY_STATIC_CAST(uint8_t, ((a_.u8[i] << count) | (a_.u8[i] >> (-count & 7)))); - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_XOP_NATIVE) - #define simde_mm_roti_epi8(a, count) _mm_roti_epi8((a), (count)) -#endif -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_roti_epi8(a, b) simde_mm_roti_epi8((a), (count)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_roti_epi16 (simde__m128i a, const int count) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u16) / sizeof(r_.u16[0])) ; i++) { - r_.u16[i] = (count < 0) ? - HEDLEY_STATIC_CAST(uint16_t, ((a_.u16[i] >> -count) | (a_.u16[i] << ( count & 15)))) : - HEDLEY_STATIC_CAST(uint16_t, ((a_.u16[i] << count) | (a_.u16[i] >> (-count & 15)))); - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_XOP_NATIVE) - #define simde_mm_roti_epi16(a, count) _mm_roti_epi16((a), (count)) -#endif -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_roti_epi16(a, count) simde_mm_roti_epi16((a), (count)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_roti_epi32 (simde__m128i a, const int count) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u32) / sizeof(r_.u32[0])) ; i++) { - r_.u32[i] = (count < 0) ? - HEDLEY_STATIC_CAST(uint32_t, ((a_.u32[i] >> -count) | (a_.u32[i] << ( count & 31)))) : - HEDLEY_STATIC_CAST(uint32_t, ((a_.u32[i] << count) | (a_.u32[i] >> (-count & 31)))); - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_XOP_NATIVE) - #define simde_mm_roti_epi32(a, count) _mm_roti_epi32((a), (count)) -#endif -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_roti_epi32(a, count) simde_mm_roti_epi32((a), (count)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_roti_epi64 (simde__m128i a, const int count) { - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.u64) / sizeof(r_.u64[0])) ; i++) { - r_.u64[i] = (count < 0) ? 
- HEDLEY_STATIC_CAST(uint64_t, ((a_.u64[i] >> -count) | (a_.u64[i] << ( count & 63)))) : - HEDLEY_STATIC_CAST(uint64_t, ((a_.u64[i] << count) | (a_.u64[i] >> (-count & 63)))); - } - - return simde__m128i_from_private(r_); -} -#if defined(SIMDE_X86_XOP_NATIVE) - #define simde_mm_roti_epi64(a, count) _mm_roti_epi64((a), (count)) -#endif -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_roti_epi64(a, count) simde_mm_roti_epi64((a), (count)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128i -simde_mm_perm_epi8 (simde__m128i a, simde__m128i b, simde__m128i c) { - #if defined(SIMDE_X86_XOP_NATIVE) - return _mm_perm_epi8(a, b, c); - #else - simde__m128i_private - r_, - a_ = simde__m128i_to_private(a), - b_ = simde__m128i_to_private(b), - c_ = simde__m128i_to_private(c); - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.i8) / sizeof(r_.i8[0])) ; i++) { - int8_t src = (c_.u8[i] & 0x10) ? b_.i8[c_.u8[i] & 0xf] : a_.i8[c_.u8[i] & 0xf]; - - switch (c_.u8[i] & 0xc0) { - case 0x40: - #if HEDLEY_HAS_BUILTIN(__builtin_bitreverse8) && !defined(HEDLEY_IBM_VERSION) - src = HEDLEY_STATIC_CAST(int8_t, __builtin_bitreverse8(HEDLEY_STATIC_CAST(uint8_t, src))); - #else - src = HEDLEY_STATIC_CAST(int8_t, ((HEDLEY_STATIC_CAST(uint8_t, src) * UINT64_C(0x80200802)) & UINT64_C(0x0884422110)) * UINT64_C(0x0101010101) >> 32); - #endif - break; - case 0x80: - src = 0; - break; - case 0xc0: - src >>= 7; - break; - } - - r_.i8[i] = (c_.u8[i] & 0x20) ? ~src : src; - } - - return simde__m128i_from_private(r_); - #endif -} -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_perm_epi8(a, b, c) simde_mm_perm_epi8((a), (b), (c)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128 -simde_mm_permute2_ps (simde__m128 a, simde__m128 b, simde__m128i c, const int imm8) { - simde__m128_private - r_, - a_ = simde__m128_to_private(a), - b_ = simde__m128_to_private(b); - simde__m128i_private c_ = simde__m128i_to_private(c); - - const int m2z = imm8 & 0x03; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - const int sel = c_.i32[i] & 0x07; - const int m = c_.i32[i] & 0x08; - - switch (m | m2z) { - case 0xa: - case 0x3: - r_.i32[i] = 0; - break; - default: - r_.i32[i] = (sel > 3) ? b_.i32[sel - 4] : a_.i32[sel]; - break; - } - } - - return simde__m128_from_private(r_); -} -#if defined(SIMDE_X86_XOP_NATIVE) - #if defined(HEDLEY_MCST_LCC_VERSION) - #define simde_mm_permute2_ps(a, b, c, imm8) (__extension__ ({ \ - SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS \ - _mm_permute2_ps((a), (b), (c), (imm8)); \ - SIMDE_LCC_REVERT_DEPRECATED_WARNINGS \ - })) - #else - #define simde_mm_permute2_ps(a, b, c, imm8) _mm_permute2_ps((a), (b), (c), (imm8)) - #endif -#endif -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_permute2_ps(a, b, c, imm8) simde_mm_permute2_ps((a), (b), (c), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m128d -simde_mm_permute2_pd (simde__m128d a, simde__m128d b, simde__m128i c, const int imm8) { - simde__m128d_private - r_, - a_ = simde__m128d_to_private(a), - b_ = simde__m128d_to_private(b); - simde__m128i_private c_ = simde__m128i_to_private(c); - - const int m2z = imm8 & 0x03; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - const int sel = (c_.i64[i] & 0x06) >> 1; - const int m = c_.i64[i] & 0x08; - - switch (m | m2z) { - case 0x0a: - case 0x03: - r_.i64[i] = 0; - break; - default: - r_.i64[i] = (sel > 1) ? 
b_.i64[sel - 2] : a_.i64[sel]; - break; - } - } - - return simde__m128d_from_private(r_); -} - -#if defined(SIMDE_X86_XOP_NATIVE) - #if defined(HEDLEY_MCST_LCC_VERSION) - #define simde_mm_permute2_pd(a, b, c, imm8) (__extension__ ({ \ - SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS \ - _mm_permute2_pd((a), (b), (c), (imm8)); \ - SIMDE_LCC_REVERT_DEPRECATED_WARNINGS \ - })) - #else - #define simde_mm_permute2_pd(a, b, c, imm8) _mm_permute2_pd((a), (b), (c), (imm8)) - #endif -#endif -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm_permute2_pd(a, b, c, imm8) simde_mm_permute2_pd((a), (b), (c), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256 -simde_mm256_permute2_ps (simde__m256 a, simde__m256 b, simde__m256i c, const int imm8) { - simde__m256_private - r_, - a_ = simde__m256_to_private(a), - b_ = simde__m256_to_private(b); - simde__m256i_private c_ = simde__m256i_to_private(c); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128) / sizeof(r_.m128[0])) ; i++) { - r_.m128[i] = simde_mm_permute2_ps(a_.m128[i], b_.m128[i], c_.m128i[i], imm8); - } - #else - const int m2z = imm8 & 0x03; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f32) / sizeof(r_.f32[0])) ; i++) { - const int sel = c_.i32[i] & 0x07; - const int m = c_.i32[i] & 0x08; - - switch (m | m2z) { - case 0xa: - case 0x3: - r_.i32[i] = 0; - break; - default: - r_.i32[i] = (sel > 3) ? b_.i32[sel + (HEDLEY_STATIC_CAST(int, i) & 4) - 4] : a_.i32[sel + (HEDLEY_STATIC_CAST(int, i) & 4)]; - break; - } - } - #endif - - return simde__m256_from_private(r_); -} - -#if defined(SIMDE_X86_XOP_NATIVE) - #if defined(HEDLEY_MCST_LCC_VERSION) - #define simde_mm256_permute2_ps(a, b, c, imm8) (__extension__ ({ \ - SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS \ - _mm256_permute2_ps((a), (b), (c), (imm8)); \ - SIMDE_LCC_REVERT_DEPRECATED_WARNINGS \ - })) - #else - #define simde_mm256_permute2_ps(a, b, c, imm8) _mm256_permute2_ps((a), (b), (c), (imm8)) - #endif -#endif -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm256_permute2_ps(a, b, c, imm8) simde_mm256_permute2_ps((a), (b), (c), (imm8)) -#endif - -SIMDE_FUNCTION_ATTRIBUTES -simde__m256d -simde_mm256_permute2_pd (simde__m256d a, simde__m256d b, simde__m256i c, const int imm8) { - simde__m256d_private - r_, - a_ = simde__m256d_to_private(a), - b_ = simde__m256d_to_private(b); - simde__m256i_private c_ = simde__m256i_to_private(c); - - #if SIMDE_NATURAL_VECTOR_SIZE_LE(128) - for (size_t i = 0 ; i < (sizeof(r_.m128d) / sizeof(r_.m128d[0])) ; i++) { - r_.m128d[i] = simde_mm_permute2_pd(a_.m128d[i], b_.m128d[i], c_.m128i[i], imm8); - } - #else - const int m2z = imm8 & 0x03; - - SIMDE_VECTORIZE - for (size_t i = 0 ; i < (sizeof(r_.f64) / sizeof(r_.f64[0])) ; i++) { - const int sel = (c_.i64[i] & 0x06) >> 1; - const int m = c_.i64[i] & 0x08; - - switch (m | m2z) { - case 0x0a: - case 0x03: - r_.i64[i] = 0; - break; - default: - r_.i64[i] = (sel > 1) ? 
b_.i64[sel + (HEDLEY_STATIC_CAST(int, i) & 2) - 2] : a_.i64[sel + (HEDLEY_STATIC_CAST(int, i) & 2)]; - break; - } - } - #endif - - return simde__m256d_from_private(r_); -} -#if defined(SIMDE_X86_XOP_NATIVE) - #if defined(HEDLEY_MCST_LCC_VERSION) - #define simde_mm256_permute2_pd(a, b, c, imm8) (__extension__ ({ \ - SIMDE_LCC_DISABLE_DEPRECATED_WARNINGS \ - _mm256_permute2_pd((a), (b), (c), (imm8)); \ - SIMDE_LCC_REVERT_DEPRECATED_WARNINGS \ - })) - #else - #define simde_mm256_permute2_pd(a, b, c, imm8) simde_undeprecated_mm256_permute2_pd((a), (b), (c), (imm8)) - #endif -#endif -#if defined(SIMDE_X86_XOP_ENABLE_NATIVE_ALIASES) - #define _mm256_permute2_pd(a, b, c, imm8) simde_mm256_permute2_pd((a), (b), (c), (imm8)) -#endif - -HEDLEY_DIAGNOSTIC_POP -SIMDE_END_DECLS_ - -#endif /* !defined(SIMDE_X86_XOP_H) */ diff --git a/identity/Cargo.toml b/identity/Cargo.toml index 11f1988..d32e274 100644 --- a/identity/Cargo.toml +++ b/identity/Cargo.toml @@ -5,7 +5,6 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -libc = "0.2.147" crypto = { path = '../crypto', features = ["encryption"] } logger = { path = '../logger' } diff --git a/identity/src/lib.rs b/identity/src/lib.rs index 7e56c77..80f5e1b 100644 --- a/identity/src/lib.rs +++ b/identity/src/lib.rs @@ -1,34 +1,7 @@ -extern crate libc; +#![allow(unused_assignments)] +#![allow(dead_code)] use crypto; -use std::str::Utf8Error; -use logger::error; - -extern { - fn getSubseed(seed: *const u8, subseed: *mut u8) -> bool; - fn getPrivateKey(subseed: *const u8, privateKey: *mut u8); - fn getPublicKey(privateKey: *const u8, publicKey: *mut u8); - fn getIdentity(publicKey: *const u8, identity: *const u8, isLowerCase: bool); - //bool getPublicKeyFromIdentity(const unsigned char* identity, unsigned char* publicKey) - fn getPublicKeyFromIdentity(identity: *const u8, publicKey: *mut u8); - - // bool getSharedKey(const unsigned char* privateKey, const unsigned char* publicKey, unsigned char* sharedKey) - //void sign(const unsigned char* subseed, const unsigned char* publicKey, const unsigned char* messageDigest, unsigned char* signature) - //bool verify(const unsigned char* publicKey, const unsigned char* messageDigest, const unsigned char* signature) -} - -fn identity(seed: &str) -> Vec { - let mut own_subseed: [u8; 32] = [0; 32]; - let mut private_key: [u8; 32] = [0; 32]; - let mut public_key: [u8; 32] = [0; 32]; - let mut identity: [u8; 60] = [0; 60]; - unsafe { - getSubseed(seed.as_ptr(), own_subseed.as_mut_ptr()); - getPrivateKey(own_subseed.as_ptr(), private_key.as_mut_ptr()); - getPublicKey(private_key.as_ptr(), public_key.as_mut_ptr()); - getIdentity(public_key.as_ptr(), identity.as_mut_ptr(), false); - } - identity.to_owned().to_vec() -} +use core::str::Utf8Error; fn identity_to_address(identity: &Vec) -> Result { match std::str::from_utf8(identity.as_slice()) { @@ -37,18 +10,6 @@ fn identity_to_address(identity: &Vec) -> Result { } } -pub fn get_identity_from_pub_key(pub_key: &[u8]) -> String { - let mut identity: [u8; 60] = [0; 60]; - unsafe { getIdentity(pub_key.as_ptr(), identity.as_mut_ptr(), false); } - std::str::from_utf8(&identity).unwrap().to_string() -} - -pub fn get_public_key_from_identity(identity: &str) -> Result, ()> { - let mut pub_key: [u8; 32] = [0; 32]; - unsafe { getPublicKeyFromIdentity(identity.as_ptr(), pub_key.as_mut_ptr()) }; - Ok(pub_key.to_owned().to_vec()) -} - #[derive(Debug)] pub struct Identity { pub seed: String, @@ -63,14 +24,13 @@ impl 
Identity { if !self.contains_seed() { Err("Invalid Seed! Can't Get Public Key!".to_string()) } else { - let mut own_subseed: [u8; 32] = [0; 32]; - let mut private_key: [u8; 32] = [0; 32]; + let mut own_subseed: Vec = vec![]; + let mut private_key: Vec = vec![]; let mut public_key: [u8; 32] = [0; 32]; - unsafe { - getSubseed(self.seed.as_str().as_ptr(), own_subseed.as_mut_ptr()); - getPrivateKey(own_subseed.as_ptr(), private_key.as_mut_ptr()); - getPublicKey(private_key.as_ptr(), public_key.as_mut_ptr()); - } + own_subseed = crypto::qubic_identities::get_subseed(self.seed.as_str()).expect("Failed To Get SubSeed!"); + private_key = crypto::qubic_identities::get_private_key(&own_subseed); + public_key = crypto::qubic_identities::get_public_key(&private_key); + own_subseed = crypto::qubic_identities::get_subseed(self.seed.as_str()).expect("Failed To Get SubSeed!"); Ok(Vec::from(public_key)) } } @@ -85,29 +45,16 @@ impl Identity { } pub fn contains_seed(&self) -> bool { self.seed.len() == 55} pub fn new(seed: &str) -> Self { - let id = identity(seed); - - - match identity_to_address(&id) { - Ok(address) => { - Identity { - seed: String::from(seed), - hash: String::from(""), - salt: String::from(""), - identity: address, - encrypted: false - } - }, - Err(err) => { - error!("Error Generating Identity! : {}", err.to_string()); - Identity { - seed: String::from(""), - hash: String::from(""), - salt: String::from(""), - identity: String::from(""), - encrypted: false - } - } + let subseed = crypto::qubic_identities::get_subseed(seed).expect("Failed To Get SubSeed!"); + let private_key = crypto::qubic_identities::get_private_key(&subseed); + let public_key = crypto::qubic_identities::get_public_key(&private_key); + let id = crypto::qubic_identities::get_identity(&public_key); + Identity { + seed: String::from(seed), + hash: String::from(""), + salt: String::from(""), + identity: id, + encrypted: false } } pub fn encrypt_identity(&mut self, password: &str) -> Result { @@ -173,7 +120,6 @@ mod create_identity { #[test] fn create_new_identity() { let id: Identity = Identity::new("lcehvbvddggkjfnokduyjuiyvkklrvrmsaozwbvjlzvgvfipqpnkkuf"); - println!("{:?}", &id); assert_eq!(id.identity.as_str(), "EPYWDREDNLHXOFYVGQUKPHJGOMPBSLDDGZDPKVQUMFXAIQYMZGEHPZTAAWON"); } From 85f0fe44b652b45aa0a366bba9a6b04ce74dc788 Mon Sep 17 00:00:00 2001 From: Matthew Darnell Date: Fri, 1 Mar 2024 14:25:21 -0500 Subject: [PATCH 2/5] Add get_public_key_from_identity function --- api/src/lib.rs | 4 ++-- api/src/transfer.rs | 16 +++++++++------- crypto/src/lib.rs | 45 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 55 insertions(+), 10 deletions(-) diff --git a/api/src/lib.rs b/api/src/lib.rs index 1ca8853..157fee7 100644 --- a/api/src/lib.rs +++ b/api/src/lib.rs @@ -5,7 +5,7 @@ extern crate crypto; extern crate identity; extern crate core; -use identity::get_public_key_from_identity; +use crypto::qubic_identities::get_public_key_from_identity; use crate::header::{ EntityType, RequestResponseHeader }; use crate::transfer::TransferTransaction; @@ -43,7 +43,7 @@ impl QubicApiPacket { let mut header = RequestResponseHeader::new(); header.set_type(EntityType::RequestEntity); - let data: Vec = get_public_key_from_identity(id).unwrap(); + let data: Vec = get_public_key_from_identity(&String::from(id)).unwrap().to_vec(); let size = std::mem::size_of::() + data.len(); header.set_size(size); QubicApiPacket { diff --git a/api/src/transfer.rs b/api/src/transfer.rs index a835c0d..b126a7c 100644 --- a/api/src/transfer.rs +++ 
b/api/src/transfer.rs @@ -1,6 +1,7 @@ use std::ffi::c_uchar; -use identity::{Identity, get_public_key_from_identity}; +use identity::Identity; use crypto::hash::k12_bytes; +use crypto::qubic_identities::{ get_subseed, get_public_key_from_identity }; use logger::info; extern { //extern ECCRYPTO_STATUS SchnorrQ_Sign(const unsigned char* SecretKey, const unsigned char* PublicKey, const unsigned char* Message, const unsigned int SizeMessage, unsigned char* Signature); @@ -33,18 +34,18 @@ impl TransferTransaction { if source_identity.seed.len() != 55 { panic!("Trying To Transfer From Corrupted Identity!"); } - let pub_key_src = match get_public_key_from_identity(source_identity.identity.as_str()) { + let pub_key_src = match get_public_key_from_identity(&source_identity.identity) { Ok(pub_key) => pub_key, Err(err) => panic!("{:?}", err) }; - let pub_key_dest = match get_public_key_from_identity(dest) { + let pub_key_dest = match get_public_key_from_identity(&String::from(dest)) { Ok(pub_key) => pub_key, Err(err) => panic!("{:?}", err) }; let mut t: TransferTransaction = TransferTransaction { - _source_public_key: pub_key_src.clone(), - _source_destination_public_key: pub_key_dest.clone(), + _source_public_key: pub_key_src.to_vec(), + _source_destination_public_key: pub_key_dest.to_vec(), _amount: amount, _tick: tick + TICK_OFFSET, _input_type: 0, @@ -53,13 +54,14 @@ impl TransferTransaction { }; info!("Setting Expiration Tick For Transaction To {}", tick + TICK_OFFSET); let digest: Vec = k12_bytes(&t.as_bytes_without_signature()); - let mut sub_seed: [u8; 32] = [0; 32]; + //let mut sub_seed: [u8; 32] = [0; 32]; + let mut sub_seed: Vec = get_subseed(source_identity.seed.as_str()).expect("Failed To Get SubSeed!"); unsafe { getSubseed(source_identity.seed.as_str().as_ptr(), sub_seed.as_mut_ptr()); } let mut sig: [u8; 64] = [0; 64]; unsafe { - sign(sub_seed.as_ptr(), pub_key_src.as_ptr(), digest.as_ptr(), sig.as_mut_ptr()); + sign(sub_seed.as_slice().as_ptr(), pub_key_src.as_ptr(), digest.as_ptr(), sig.as_mut_ptr()); //SchnorrQ_Sign(sub_seed.as_ptr(), pub_key_src.as_ptr(), digest.as_ptr(), 32, sig.as_mut_ptr()); } t._signature = sig.to_vec(); diff --git a/crypto/src/lib.rs b/crypto/src/lib.rs index 851b859..d358737 100644 --- a/crypto/src/lib.rs +++ b/crypto/src/lib.rs @@ -110,9 +110,41 @@ pub mod qubic_identities { String::from_utf8(identity.to_vec()).unwrap() } + pub fn get_public_key_from_identity(identity: &String) -> Result<[u8; 32], bool> { + let id: &[u8] = identity.as_bytes(); + let mut public_key: [u8; 32] = [0; 32]; + for i in 0..4 { + public_key[i << 3..((i<<3) + 8)].copy_from_slice(&u64::to_le_bytes(0u64)); + for j in 0..14 { + let index = 14 - j - 1; + if id[i * 14 + index] < b'A' || (id[i * 14 + index]) > b'Z' { + return Err(false); + } + let _bytes: [u8; 8] = public_key[i << 3..((i << 3) + 8)].try_into().unwrap(); + let temp: u64 = u64::from_le_bytes(_bytes) * 26u64 + + ((id[i * 14 + index] - b'A') as u64); + public_key[i << 3..((i<<3) + 8)].copy_from_slice(&u64::to_le_bytes(temp)); + + } + } + #[allow(unused_assignments)] + let mut identity_bytes_checksum: u32 = 0; + let hash: Vec = k12_bytes(&public_key.to_vec()); + let bytes: [u8; 4] = hash[0..4].try_into().unwrap(); + identity_bytes_checksum = u32::from_le_bytes(bytes); + identity_bytes_checksum &= 0x3FFFF; + for i in 0..4 { + if (identity_bytes_checksum % 26) as u8 + b'A' != identity.as_bytes()[56 + i] { + return Err(false) + } + identity_bytes_checksum /= 26; + } + Ok(public_key) + } + #[cfg(test)] pub mod 
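// How the 60-character identity decodes back to the 32-byte public key, per the
// decoder added above: the first 56 letters are four groups of 14, each group one
// little-endian u64 limb of the key written in base 26 ('A' = 0, with the first
// letter of a group as the least-significant digit; 26^14 > 2^64, so 14 letters
// always cover a limb). The trailing 4 letters are a checksum: the low 18 bits
// (& 0x3FFFF) of the first 4 bytes of the KangarooTwelve hash of the key, also
// written in base 26 (26^4 = 456,976 > 2^18). Decoding returns Err(false) on any
// non-uppercase character or on a checksum mismatch.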
qubic_identity_primitive_tests { - use crate::qubic_identities::{get_identity, get_private_key, get_public_key, get_subseed}; + use crate::qubic_identities::{get_identity, get_private_key, get_public_key, get_public_key_from_identity, get_subseed}; #[test] fn get_a_subseed() { let seed = "lcehvbvddggkjfnokduyjuiyvkklrvrmsaozwbvjlzvgvfipqpnkkuf"; @@ -146,6 +178,17 @@ pub mod qubic_identities { let identity = get_identity(&public_key); assert_eq!(identity, "EPYWDREDNLHXOFYVGQUKPHJGOMPBSLDDGZDPKVQUMFXAIQYMZGEHPZTAAWON".to_string()) } + #[test] + fn get_a_public_key_from_identity() { + let seed = "lcehvbvddggkjfnokduyjuiyvkklrvrmsaozwbvjlzvgvfipqpnkkuf"; + let subseed = get_subseed(seed).unwrap(); + let private_key = get_private_key(&subseed); + let public_key = get_public_key(&private_key); + let identity = get_identity(&public_key); + let pub_key_from_id = get_public_key_from_identity(&identity).unwrap(); + + assert_eq!(public_key, pub_key_from_id) + } } } From 84834d5e2e7d2a4f92df59b383e2b5161c601fff Mon Sep 17 00:00:00 2001 From: Matthew Darnell Date: Fri, 1 Mar 2024 16:42:37 -0500 Subject: [PATCH 3/5] Sign function native Rust --- api/src/lib.rs | 7 +-- api/src/transfer.rs | 45 ++++++++++--------- crypto/src/fourq/ops.rs | 6 +-- crypto/src/lib.rs | 97 +++++++++++++++++++++++++++++++++++++++-- network/src/bin/bin.rs | 35 --------------- 5 files changed, 126 insertions(+), 64 deletions(-) delete mode 100644 network/src/bin/bin.rs diff --git a/api/src/lib.rs b/api/src/lib.rs index 157fee7..22f12fa 100644 --- a/api/src/lib.rs +++ b/api/src/lib.rs @@ -3,7 +3,6 @@ pub mod response; pub mod transfer; extern crate crypto; extern crate identity; -extern crate core; use crypto::qubic_identities::get_public_key_from_identity; use crate::header::{ EntityType, RequestResponseHeader }; @@ -112,8 +111,10 @@ pub mod api_formatting_tests { let mut req = QubicApiPacket::get_identity_balance("EPYWDREDNLHXOFYVGQUKPHJGOMPBSLDDGZDPKVQUMFXAIQYMZGEHPZTAAWON"); req.header.zero_dejavu(); //Dejavu is random 3 byte value let bytes = req.as_bytes(); - assert_eq!(bytes.len(), 68); + println!("{:?}", &bytes); + assert_eq!(bytes.len(), 40); assert_eq!(bytes.as_slice(), - vec![68, 0, 0, 0, 0, 0, 0, 31, 69, 80, 89, 87, 68, 82, 69, 68, 78, 76, 72, 88, 79, 70, 89, 86, 71, 81, 85, 75, 80, 72, 74, 71, 79, 77, 80, 66, 83, 76, 68, 68, 71, 90, 68, 80, 75, 86, 81, 85, 77, 70, 88, 65, 73, 81, 89, 77, 90, 71, 69, 72, 80, 90, 84, 65, 65, 87, 79, 78]); + vec![40, 0, 0, 31, 0, 0, 0, 0, 170, 135, 62, 76, 253, 55, 228, 191, 82, 138, 42, 160, 30, 236, 239, 54, 84, 124, 153, 202, 170, 189, 27, 189, 247, 37, 58, 101, 176, 65, 119, 26] + ); } } \ No newline at end of file diff --git a/api/src/transfer.rs b/api/src/transfer.rs index b126a7c..03470fe 100644 --- a/api/src/transfer.rs +++ b/api/src/transfer.rs @@ -1,17 +1,7 @@ -use std::ffi::c_uchar; use identity::Identity; use crypto::hash::k12_bytes; -use crypto::qubic_identities::{ get_subseed, get_public_key_from_identity }; +use crypto::qubic_identities::{get_subseed, get_public_key_from_identity, sign_raw}; use logger::info; -extern { - //extern ECCRYPTO_STATUS SchnorrQ_Sign(const unsigned char* SecretKey, const unsigned char* PublicKey, const unsigned char* Message, const unsigned int SizeMessage, unsigned char* Signature); - fn sign(subseed: *const u8, publicKey: *const c_uchar, messageDigest: *const c_uchar, signature: *mut c_uchar); - //fn SchnorrQ_Sign(subseed: *const u8, publicKey: *const c_uchar, messageDigest: *const c_uchar, SizeMessage: u32, signature: *mut c_uchar); - fn 
getSubseed(seed: *const c_uchar, subseed: *mut c_uchar) -> bool; - //bool getSubseed(const unsigned char* seed, unsigned char* subseed) - //void sign(const unsigned char* subseed, const unsigned char* publicKey, const unsigned char* messageDigest, unsigned char* signature) -} - #[derive(Debug, Clone)] pub struct TransferTransaction { @@ -38,6 +28,7 @@ impl TransferTransaction { Ok(pub_key) => pub_key, Err(err) => panic!("{:?}", err) }; + println!("{} -> {:?}", source_identity.identity.as_str(), &pub_key_src); let pub_key_dest = match get_public_key_from_identity(&String::from(dest)) { Ok(pub_key) => pub_key, Err(err) => panic!("{:?}", err) @@ -56,14 +47,10 @@ impl TransferTransaction { let digest: Vec = k12_bytes(&t.as_bytes_without_signature()); //let mut sub_seed: [u8; 32] = [0; 32]; let mut sub_seed: Vec = get_subseed(source_identity.seed.as_str()).expect("Failed To Get SubSeed!"); - unsafe { - getSubseed(source_identity.seed.as_str().as_ptr(), sub_seed.as_mut_ptr()); - } let mut sig: [u8; 64] = [0; 64]; - unsafe { - sign(sub_seed.as_slice().as_ptr(), pub_key_src.as_ptr(), digest.as_ptr(), sig.as_mut_ptr()); - //SchnorrQ_Sign(sub_seed.as_ptr(), pub_key_src.as_ptr(), digest.as_ptr(), 32, sig.as_mut_ptr()); - } + + sig = sign_raw(&sub_seed, &pub_key_src, digest.as_slice().try_into().unwrap()); + println!("Signed Signature: {:?}", sig); t._signature = sig.to_vec(); t } @@ -132,9 +119,27 @@ impl TransferTransaction { #[test] fn create_transfer() { let id: Identity = Identity::new("lcehvbvddggkjfnokduyjuiyvkklrvrmsaozwbvjlzvgvfipqpnkkuf"); - let t: TransferTransaction = TransferTransaction::from_vars(&id, "EPYWDREDNLHXOFYVGQUKPHJGOMPBSLDDGZDPKVQUMFXAIQYMZGEHPZTAAWON", 100, 100); - let expected: Vec = vec![170, 135, 62, 76, 253, 55, 228, 191, 82, 138, 42, 160, 30, 236, 239, 54, 84, 124, 153, 202, 170, 189, 27, 189, 247, 37, 58, 101, 176, 65, 119, 26, 170, 135, 62, 76, 253, 55, 228, 191, 82, 138, 42, 160, 30, 236, 239, 54, 84, 124, 153, 202, 170, 189, 27, 189, 247, 37, 58, 101, 176, 65, 119, 26, 100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 110, 0, 0, 0, 105, 55, 108, 214, 255, 246, 151, 81, 6, 214, 129, 65, 96, 14, 146, 66, 206, 140, 212, 149, 217, 230, 189, 217, 106, 16, 216, 3, 208, 51, 185, 179, 25, 89, 215, 168, 85, 62, 9, 204, 52, 238, 245, 199, 48, 2, 43, 52, 117, 72, 109, 119, 84, 236, 135, 240, 56, 179, 194, 36, 96, 124, 32, 0]; + let t: TransferTransaction = TransferTransaction::from_vars(&id, "EPYWDREDNLHXOFYVGQUKPHJGOMPBSLDDGZDPKVQUMFXAIQYMZGEHPZTAAWON", 100, 80); + let expected: Vec = vec![ + //source pub key: u32 + 170, 135, 62, 76, 253, 55, 228, 191, 82, 138, 42, 160, 30, 236, 239, 54, 84, 124, 153, 202, 170, 189, 27, 189, 247, 37, 58, 101, 176, 65, 119, 26, + //dest pub key: u32 + 170, 135, 62, 76, 253, 55, 228, 191, 82, 138, 42, 160, 30, 236, 239, 54, 84, 124, 153, 202, 170, 189, 27, 189, 247, 37, 58, 101, 176, 65, 119, 26, + //amount: u64 + 100, 0, 0, 0, 0, 0, 0, 0, + //tick: u32 + 110, 0, 0, 0, + //input type: u16 + 0, 0, + //input size: u16 + 0, 0, + //signature: u64 + 179, 108, 100, 1, 209, 21, 45, 198, 110, 190, 137, 194, 107, 157, 36, 76, 124, 94, 142, 45, 125, 220, 238, 70, 17, 253, 181, 125, 147, 192, 126, + 93, 7, 155, 196, 186, 185, 143, 220, 131, 215, 170, 241, 92, 83, 71, 181, 143, 107, 62, 90, 232, 10, 164, 55, 202, 24, 189, 84, 156, 203, 51, 27, 0 + ]; + assert_eq!(t.as_bytes().as_slice(), expected.as_slice()); + } diff --git a/crypto/src/fourq/ops.rs b/crypto/src/fourq/ops.rs index afde1a3..9aab9cb 100644 --- a/crypto/src/fourq/ops.rs +++ b/crypto/src/fourq/ops.rs @@ 
-35,7 +35,7 @@ use crate::{ }}; #[inline(always)] -fn addcarry_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 { +pub fn addcarry_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 { #[cfg(target_arch = "x86_64")] unsafe { _addcarry_u64(c_in, a, b, out) @@ -53,7 +53,7 @@ fn addcarry_u64(c_in: u8, a: u64, b: u64, out: &mut u64) -> u8 { } #[inline(always)] -fn subborrow_u64(b_in: u8, a: u64, b: u64, out: &mut u64) -> u8 { +pub fn subborrow_u64(b_in: u8, a: u64, b: u64, out: &mut u64) -> u8 { #[cfg(target_arch = "x86_64")] unsafe { _subborrow_u64(b_in, a, b, out) @@ -212,7 +212,7 @@ pub fn fp2div1271(a: &mut F2elmT) { let mut mask: u64; let mut temp = [0u64; 2]; - mask = 0 - (1 & a[0][0]); + mask = 0u64.wrapping_sub((1 & a[0][0])); addcarry_u64(addcarry_u64(0, a[0][0], mask, &mut temp[0]), a[0][1], mask >> 1, &mut temp[1]); a[0][0] = __shiftright128(temp[0], temp[1], 1); a[0][1] = temp[1] >> 1; diff --git a/crypto/src/lib.rs index d358737..cc60066 100644 --- a/crypto/src/lib.rs +++ b/crypto/src/lib.rs @@ -1,6 +1,5 @@ #![feature(ascii_char)] #![feature(ascii_char_variants)] - mod fourq; const A_LOWERCASE_ASCII: u8 = 97u8; @@ -37,9 +36,12 @@ pub mod hash { pub mod qubic_identities { + use core::{ptr::copy_nonoverlapping, fmt::{Debug, Display}, str::FromStr}; + use tiny_keccak::{Hasher, IntoXof, KangarooTwelve, Xof}; use crate::{A_LOWERCASE_ASCII, hash}; use hash::k12_bytes; - use crate::fourq::ops::{ecc_mul_fixed, encode}; + use crate::fourq::consts::{CURVE_ORDER_0, CURVE_ORDER_1, CURVE_ORDER_2, CURVE_ORDER_3, MONTGOMERY_R_PRIME, ONE}; + use crate::fourq::ops::{addcarry_u64, ecc_mul_fixed, encode, montgomery_multiply_mod_order, subborrow_u64}; use crate::fourq::types::{PointAffine}; // fn getPublicKey(privateKey: *const u8, publicKey: *mut u8); @@ -142,9 +144,81 @@ pub mod qubic_identities { Ok(public_key) } + pub fn sign_raw(subseed: &Vec<u8>, public_key: &[u8; 32], message_digest: [u8; 32]) -> [u8; 64] { + + println!("Got Subseed: {:?}", subseed); + println!("Got Public Key: {:?}", public_key); + println!("Got Message Digest: {:?}", &message_digest); + + let mut r_a = PointAffine::default(); + let (mut k, mut h, mut temp) = ([0u8; 64], [0u8; 64], [0u8; 96]); + let mut r = [0u8; 64]; + + + let mut kg = KangarooTwelve::new(b""); + kg.update(subseed.as_slice()); + kg.into_xof().squeeze(&mut k); + + let mut signature = [0u8; 64]; + + unsafe { + copy_nonoverlapping(k.as_ptr().offset(32), temp.as_mut_ptr().offset(32), 32); + copy_nonoverlapping(message_digest.as_ptr(), temp.as_mut_ptr().offset(64), 32); + + + let mut kg = KangarooTwelve::new(b""); + kg.update(&temp[32..]); + let mut im = [0u8; 64]; + kg.into_xof().squeeze(&mut im); + + copy_nonoverlapping(im.as_ptr(), r.as_mut_ptr(), 64); + let k: [u64; 8] = k.chunks_exact(8).map(|c| u64::from_le_bytes(c.try_into().unwrap())).collect::<Vec<u64>>().try_into().unwrap(); + let mut r: [u64; 8] = r.chunks_exact(8).map(|c| u64::from_le_bytes(c.try_into().unwrap())).collect::<Vec<u64>>().try_into().unwrap(); + ecc_mul_fixed(&r, &mut r_a); + + encode(&mut r_a, &mut signature); + let mut signature_i: [u64; 8] = signature.chunks_exact(8).map(|c| u64::from_le_bytes(c.try_into().unwrap())).collect::<Vec<u64>>().try_into().unwrap(); + + copy_nonoverlapping(signature_i.as_ptr() as *mut u8, temp.as_mut_ptr(), 32); + copy_nonoverlapping(public_key.as_ptr(), temp.as_mut_ptr().offset(32), 32); + + + let mut kg = KangarooTwelve::new(b""); + kg.update(&temp); + kg.into_xof().squeeze(&mut h); + + let mut h: [u64; 8] = h.chunks_exact(8).map(|c| u64::from_le_bytes(c.try_into().unwrap())).collect::<Vec<u64>>().try_into().unwrap(); + let r_i = r; + montgomery_multiply_mod_order(&r_i, &MONTGOMERY_R_PRIME, &mut r); + let r_i = r; + montgomery_multiply_mod_order(&r_i, &ONE, &mut r); + let h_i = h; + montgomery_multiply_mod_order(&h_i, &MONTGOMERY_R_PRIME, &mut h); + let h_i = h; + montgomery_multiply_mod_order(&h_i, &ONE, &mut h); + montgomery_multiply_mod_order(&k, &MONTGOMERY_R_PRIME, &mut signature_i[4..]); + let h_i = h; + montgomery_multiply_mod_order(&h_i, &MONTGOMERY_R_PRIME, &mut h); + let mut s_i = [0u64; 4]; + s_i.copy_from_slice(&signature_i[4..]); + montgomery_multiply_mod_order(&s_i, &h, &mut signature_i[4..]); + s_i.copy_from_slice(&signature_i[4..]); + montgomery_multiply_mod_order(&s_i, &ONE, &mut signature_i[4..]); + + if subborrow_u64(subborrow_u64(subborrow_u64(subborrow_u64(0, r[0], signature_i[4], &mut signature_i[4]), r[1], signature_i[5], &mut signature_i[5]), r[2], signature_i[6], &mut signature_i[6]), r[3], signature_i[7], &mut signature_i[7]) != 0 { + addcarry_u64(addcarry_u64(addcarry_u64(addcarry_u64(0, signature_i[4], CURVE_ORDER_0, &mut signature_i[4]), signature_i[5], CURVE_ORDER_1, &mut signature_i[5]), signature_i[6], CURVE_ORDER_2, &mut signature_i[6]),signature_i[7], CURVE_ORDER_3, &mut signature_i[7]); + } + + signature = signature_i.into_iter().flat_map(u64::to_le_bytes).collect::<Vec<u8>>().try_into().unwrap(); + } + signature + } + + #[cfg(test)] pub mod qubic_identity_primitive_tests { - use crate::qubic_identities::{get_identity, get_private_key, get_public_key, get_public_key_from_identity, get_subseed}; + use crate::hash::k12_bytes; + use crate::qubic_identities::{get_identity, get_private_key, get_public_key, get_public_key_from_identity, get_subseed, sign_raw}; #[test] fn get_a_subseed() { let seed = "lcehvbvddggkjfnokduyjuiyvkklrvrmsaozwbvjlzvgvfipqpnkkuf"; @@ -189,6 +263,23 @@ pub mod qubic_identities { assert_eq!(public_key, pub_key_from_id) } + + #[test] + fn test_sign_a_message() { + let seed = "lcehvbvddggkjfnokduyjuiyvkklrvrmsaozwbvjlzvgvfipqpnkkuf"; + let message: [u8; 32] = [1; 32]; + let digest = k12_bytes(&message.to_vec()); + let subseed = get_subseed(seed).unwrap(); + let private_key = get_private_key(&subseed); + let public_key = get_public_key(&private_key); + let identity = get_identity(&public_key); + let pub_key_from_id = get_public_key_from_identity(&identity).unwrap(); + let result = sign_raw(&subseed, &public_key, <[u8; 32]>::try_from(digest.as_slice()).expect("Failed!")); + println!("{:?}", result); + assert_eq!(public_key, pub_key_from_id) + } + + } } diff --git a/network/src/bin/bin.rs b/network/src/bin/bin.rs deleted file mode 100644 index 820c243..0000000 --- a/network/src/bin/bin.rs +++ /dev/null @@ -1,35 +0,0 @@ -use std::time::Duration; -use std::thread::sleep; -use std::io::prelude::*; -use std::net::TcpStream; -extern crate api; -extern crate network; -extern crate crypto; -use api::qubic_api_t; -use network::peers::{PeerStrategy, PeerSet}; - -fn main() { - //let peer_ips = vec!["85.10.199.154:21841", "148.251.184.163:21841"]; - let peer_ips = vec!["85.10.199.154:21841"]; - println!("Creating Peer Set"); - let mut peer_set = PeerSet::new(PeerStrategy::RANDOM); - for ip in peer_ips { - println!("Adding Peer {}", ip); - peer_set.add_peer(ip); - println!("Peer Added"); - } - println!("Number Of Peers: {}", peer_set.num_peers()); - let delay = Duration::from_secs(3); - - - loop { - let mut request = 
qubic_api_t::get_identity_balance("BAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAARMID"); - match peer_set.make_request(request) { - Ok(_) => {}, - //Ok(_) => println!("{:?}", request.response_data), - Err(err) => println!("{}", err) - } - sleep(delay); - } - -} \ No newline at end of file From 7183cdf9e3e64b0122b83e7a919025ff862ad0dc Mon Sep 17 00:00:00 2001 From: Matthew Darnell Date: Fri, 1 Mar 2024 16:48:05 -0500 Subject: [PATCH 4/5] Clean up logs --- api/src/lib.rs | 3 +-- api/src/transfer.rs | 2 -- crypto/src/lib.rs | 5 ----- 3 files changed, 1 insertion(+), 9 deletions(-) diff --git a/api/src/lib.rs b/api/src/lib.rs index 22f12fa..482789c 100644 --- a/api/src/lib.rs +++ b/api/src/lib.rs @@ -103,7 +103,7 @@ pub mod api_formatting_tests { #[test] fn create_identity_balance_request_entity() { let req = QubicApiPacket::get_identity_balance("EPYWDREDNLHXOFYVGQUKPHJGOMPBSLDDGZDPKVQUMFXAIQYMZGEHPZTAAWON"); - println!("{:?}", req); + assert_eq!(req.header._size[0], 40u8); } #[test] @@ -111,7 +111,6 @@ pub mod api_formatting_tests { let mut req = QubicApiPacket::get_identity_balance("EPYWDREDNLHXOFYVGQUKPHJGOMPBSLDDGZDPKVQUMFXAIQYMZGEHPZTAAWON"); req.header.zero_dejavu(); //Dejavu is random 3 byte value let bytes = req.as_bytes(); - println!("{:?}", &bytes); assert_eq!(bytes.len(), 40); assert_eq!(bytes.as_slice(), vec![40, 0, 0, 31, 0, 0, 0, 0, 170, 135, 62, 76, 253, 55, 228, 191, 82, 138, 42, 160, 30, 236, 239, 54, 84, 124, 153, 202, 170, 189, 27, 189, 247, 37, 58, 101, 176, 65, 119, 26] diff --git a/api/src/transfer.rs b/api/src/transfer.rs index 03470fe..bd0a19c 100644 --- a/api/src/transfer.rs +++ b/api/src/transfer.rs @@ -28,7 +28,6 @@ impl TransferTransaction { Ok(pub_key) => pub_key, Err(err) => panic!("{:?}", err) }; - println!("{} -> {:?}", source_identity.identity.as_str(), &pub_key_src); let pub_key_dest = match get_public_key_from_identity(&String::from(dest)) { Ok(pub_key) => pub_key, Err(err) => panic!("{:?}", err) @@ -50,7 +49,6 @@ impl TransferTransaction { let mut sig: [u8; 64] = [0; 64]; sig = sign_raw(&sub_seed, &pub_key_src, digest.as_slice().try_into().unwrap()); - println!("Signed Signature: {:?}", sig); t._signature = sig.to_vec(); t } diff --git a/crypto/src/lib.rs b/crypto/src/lib.rs index cc60066..a39d744 100644 --- a/crypto/src/lib.rs +++ b/crypto/src/lib.rs @@ -145,11 +145,6 @@ pub mod qubic_identities { } pub fn sign_raw(subseed: &Vec<u8>, public_key: &[u8; 32], message_digest: [u8; 32]) -> [u8; 64] { - - println!("Got Subseed: {:?}", subseed); - println!("Got Public Key: {:?}", public_key); - println!("Got Message Digest: {:?}", &message_digest); - let mut r_a = PointAffine::default(); let (mut k, mut h, mut temp) = ([0u8; 64], [0u8; 64], [0u8; 96]); let mut r = [0u8; 64]; From 5985bf37055f21a3d3311178b09c11ea6a675533 Mon Sep 17 00:00:00 2001 From: Matthew Darnell Date: Fri, 1 Mar 2024 17:14:02 -0500 Subject: [PATCH 5/5] Don't worry about extra bytes on TickInfo... 
...4 bytes were added to include epoch info Cut down on starting peers --- api/src/response.rs | 7 +++---- src/main.rs | 9 +-------- 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/api/src/response.rs b/api/src/response.rs index 7c37713..9e690cc 100644 --- a/api/src/response.rs +++ b/api/src/response.rs @@ -17,7 +17,9 @@ pub fn get_formatted_response(response: &mut QubicApiPacket) { match response.api_type { EntityType::RespondCurrentTickInfo => { if let Some(peer_id) = &response.peer { - if response.data.len() == 12 { + if response.data.len() < 12 { + println!("Malformed Current Tick Response."); + } else { let mut data: [u8; 4] = [0; 4]; data[0] = response.data[4]; data[1] = response.data[5]; @@ -28,9 +30,6 @@ pub fn get_formatted_response(response: &mut QubicApiPacket) { Ok(_) => {}, Err(_err) => {} } - } else { - println!("{:?}", response); - println!("Malformed Current Tick Response."); } } }, diff --git a/src/main.rs b/src/main.rs index b7b7f57..7cdeefd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -44,14 +44,7 @@ async fn main() { crud::peer::set_all_peers_disconnected(path.as_str()).unwrap(); let peer_ips = vec![ "62.2.98.75:21841", - "45.67.139.81:21841", - "176.9.20.10:21841", - "136.243.41.109:21841", - "65.21.194.226:21841", - "135.181.246.92:21841", - "85.10.199.154:21841", - "148.251.184.163:21841", - "193.135.9.63:21841", + "185.117.0.116:21841", "144.2.106.163:21841" ]; debug!("Creating Peer Set");
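
For reference, the native signing path introduced in PATCH 3/5 can be exercised end-to-end without any FFI. The sketch below is illustrative only and is not part of the patch series: it reuses the seed and identity test vectors shown in the tests above, assumes a consumer binary crate that depends on the `crypto` workspace member, and requires a nightly toolchain because crypto/src/lib.rs enables the unstable ascii_char features. Only functions that the patches already export across crate boundaries (get_subseed, get_public_key_from_identity, sign_raw, k12_bytes) are used.

// Illustrative sketch only (assumed consumer crate with `crypto` as a path dependency).
extern crate crypto;

use crypto::hash::k12_bytes;
use crypto::qubic_identities::{get_public_key_from_identity, get_subseed, sign_raw};

fn main() {
    // Seed and its 60-character identity, taken from the test vectors above.
    let seed = "lcehvbvddggkjfnokduyjuiyvkklrvrmsaozwbvjlzvgvfipqpnkkuf";
    let identity = String::from("EPYWDREDNLHXOFYVGQUKPHJGOMPBSLDDGZDPKVQUMFXAIQYMZGEHPZTAAWON");

    // The subseed is the signing secret; the 32-byte public key is recovered
    // from the identity string, as transfer.rs now does for source and dest.
    let subseed = get_subseed(seed).expect("Failed To Get SubSeed!");
    let public_key = get_public_key_from_identity(&identity).expect("Invalid identity");

    // KangarooTwelve digest of an arbitrary 32-byte message, then a 64-byte
    // SchnorrQ signature produced entirely in native Rust.
    let message = [1u8; 32];
    let digest = k12_bytes(&message.to_vec());
    let signature: [u8; 64] = sign_raw(
        &subseed,
        &public_key,
        <[u8; 32]>::try_from(digest.as_slice()).expect("digest must be 32 bytes"),
    );
    println!("Signature: {:?}", signature);
}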