diff --git a/Makefile.am b/Makefile.am index ccb19304d4..f6d86b09d9 100755 --- a/Makefile.am +++ b/Makefile.am @@ -90,12 +90,6 @@ src_libbitcoin_system_la_SOURCES = \ src/hash/accumulator.cpp \ src/hash/checksum.cpp \ src/hash/siphash.cpp \ - src/hash/vectorization/sha256_1_native.cpp \ - src/hash/vectorization/sha256_2_shani.cpp \ - src/hash/vectorization/sha256_4_neon.cpp \ - src/hash/vectorization/sha256_4_sse4.cpp \ - src/hash/vectorization/sha256_4_sse41.cpp \ - src/hash/vectorization/sha256_8_avx2.cpp \ src/math/math.cpp \ src/radix/base_10.cpp \ src/radix/base_2048.cpp \ @@ -622,6 +616,7 @@ include_bitcoin_system_impl_hash_sha_HEADERS = \ include/bitcoin/system/impl/hash/sha/algorithm_double.ipp \ include/bitcoin/system/impl/hash/sha/algorithm_functions.ipp \ include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp \ + include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp \ include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp \ include/bitcoin/system/impl/hash/sha/algorithm_native.ipp \ include/bitcoin/system/impl/hash/sha/algorithm_padding.ipp \ diff --git a/builds/cmake/CMakeLists.txt b/builds/cmake/CMakeLists.txt index 7547906182..4ebdf5b59b 100644 --- a/builds/cmake/CMakeLists.txt +++ b/builds/cmake/CMakeLists.txt @@ -529,12 +529,6 @@ add_library( ${CANONICAL_LIB_NAME} "../../src/hash/accumulator.cpp" "../../src/hash/checksum.cpp" "../../src/hash/siphash.cpp" - "../../src/hash/vectorization/sha256_1_native.cpp" - "../../src/hash/vectorization/sha256_2_shani.cpp" - "../../src/hash/vectorization/sha256_4_neon.cpp" - "../../src/hash/vectorization/sha256_4_sse4.cpp" - "../../src/hash/vectorization/sha256_4_sse41.cpp" - "../../src/hash/vectorization/sha256_8_avx2.cpp" "../../src/math/math.cpp" "../../src/radix/base_10.cpp" "../../src/radix/base_2048.cpp" diff --git a/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj b/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj index a70cf70ca2..69f4210f14 100644 --- a/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj +++ b/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj @@ -155,12 +155,6 @@ - - - - - - @@ -548,6 +542,7 @@ + diff --git a/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj.filters b/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj.filters index 5596a34380..9f54b85f9c 100644 --- a/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj.filters +++ b/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj.filters @@ -8,157 +8,157 @@ - {39F60708-FF48-4C22-0000-000000000009} + {39F60708-FF48-4C22-0000-000000000008} - {39F60708-FF48-4C22-0000-000000000010} + {39F60708-FF48-4C22-0000-000000000009} - {39F60708-FF48-4C22-0000-0000000000A1} + {39F60708-FF48-4C22-0000-000000000010} - {39F60708-FF48-4C22-0000-0000000000B1} + {39F60708-FF48-4C22-0000-0000000000A1} - {39F60708-FF48-4C22-0000-0000000000C2} + {39F60708-FF48-4C22-0000-0000000000B2} - {39F60708-FF48-4C22-0000-0000000000C1} + {39F60708-FF48-4C22-0000-0000000000B1} - {39F60708-FF48-4C22-0000-0000000000D1} + {39F60708-FF48-4C22-0000-0000000000C1} - {39F60708-FF48-4C22-0000-0000000000E1} + {39F60708-FF48-4C22-0000-0000000000D1} - {39F60708-FF48-4C22-0000-0000000000F1} + {39F60708-FF48-4C22-0000-0000000000E1} - {39F60708-FF48-4C22-0000-000000000002} + {39F60708-FF48-4C22-0000-0000000000F1} - {39F60708-FF48-4C22-0000-000000000003} + {39F60708-FF48-4C22-0000-000000000002} - {39F60708-FF48-4C22-0000-0000000000D2} + {39F60708-FF48-4C22-0000-0000000000C2} - {39F60708-FF48-4C22-0000-0000000000E2} + {39F60708-FF48-4C22-0000-0000000000D2} - {39F60708-FF48-4C22-0000-000000000004} + {39F60708-FF48-4C22-0000-000000000003} - {39F60708-FF48-4C22-0000-0000000000F2} + {39F60708-FF48-4C22-0000-0000000000E2} - {39F60708-FF48-4C22-0000-000000000003} + {39F60708-FF48-4C22-0000-0000000000F2} - {39F60708-FF48-4C22-0000-000000000004} + {39F60708-FF48-4C22-0000-000000000003} - {39F60708-FF48-4C22-0000-000000000005} + {39F60708-FF48-4C22-0000-000000000004} - {39F60708-FF48-4C22-0000-0000000000A3} + {39F60708-FF48-4C22-0000-000000000012} - {39F60708-FF48-4C22-0000-0000000000B3} + {39F60708-FF48-4C22-0000-0000000000A3} - {39F60708-FF48-4C22-0000-000000000006} + {39F60708-FF48-4C22-0000-000000000005} - {39F60708-FF48-4C22-0000-000000000007} + {39F60708-FF48-4C22-0000-000000000006} - {39F60708-FF48-4C22-0000-000000000008} + {39F60708-FF48-4C22-0000-000000000007} - {39F60708-FF48-4C22-0000-000000000009} + {39F60708-FF48-4C22-0000-000000000008} - {39F60708-FF48-4C22-0000-000000000010} + {39F60708-FF48-4C22-0000-000000000009} - {39F60708-FF48-4C22-0000-0000000000C3} + {39F60708-FF48-4C22-0000-0000000000B3} - {39F60708-FF48-4C22-0000-0000000000D3} + {39F60708-FF48-4C22-0000-0000000000C3} - {39F60708-FF48-4C22-0000-000000000011} + {39F60708-FF48-4C22-0000-000000000010} - {39F60708-FF48-4C22-0000-0000000000E3} + {39F60708-FF48-4C22-0000-0000000000D3} - {39F60708-FF48-4C22-0000-000000000012} + {39F60708-FF48-4C22-0000-000000000011} - {39F60708-FF48-4C22-0000-000000000005} + {39F60708-FF48-4C22-0000-000000000004} - {39F60708-FF48-4C22-0000-0000000000F3} + {39F60708-FF48-4C22-0000-0000000000E3} - {39F60708-FF48-4C22-0000-000000000004} + {39F60708-FF48-4C22-0000-0000000000F3} - {39F60708-FF48-4C22-0000-000000000006} + {39F60708-FF48-4C22-0000-000000000005} - {39F60708-FF48-4C22-0000-000000000007} + {39F60708-FF48-4C22-0000-000000000006} - {39F60708-FF48-4C22-0000-000000000008} + {39F60708-FF48-4C22-0000-000000000007} - {39F60708-FF48-4C22-0000-000000000009} + {39F60708-FF48-4C22-0000-000000000008} - {39F60708-FF48-4C22-0000-000000000010} + {39F60708-FF48-4C22-0000-000000000009} - {39F60708-FF48-4C22-0000-000000000005} + {39F60708-FF48-4C22-0000-000000000004} - {39F60708-FF48-4C22-0000-000000000006} + {39F60708-FF48-4C22-0000-000000000005} - {39F60708-FF48-4C22-0000-000000000007} + {39F60708-FF48-4C22-0000-000000000006} - {39F60708-FF48-4C22-0000-000000000008} + {39F60708-FF48-4C22-0000-000000000007} - {39F60708-FF48-4C22-0000-000000000011} + {39F60708-FF48-4C22-0000-000000000010} - {39F60708-FF48-4C22-0000-000000000009} + {39F60708-FF48-4C22-0000-000000000008} - {39F60708-FF48-4C22-0000-0000000000A2} + {39F60708-FF48-4C22-0000-000000000011} - {39F60708-FF48-4C22-0000-000000000010} + {39F60708-FF48-4C22-0000-000000000009} - {39F60708-FF48-4C22-0000-000000000011} + {39F60708-FF48-4C22-0000-000000000010} - {39F60708-FF48-4C22-0000-000000000012} + {39F60708-FF48-4C22-0000-000000000011} - {39F60708-FF48-4C22-0000-0000000000B2} + {39F60708-FF48-4C22-0000-0000000000A2} - {39F60708-FF48-4C22-0000-000000000013} + {39F60708-FF48-4C22-0000-000000000012} - {39F60708-FF48-4C22-0000-0000000000A4} + {39F60708-FF48-4C22-0000-000000000013} {39F60708-FF48-4C22-0000-000000000000} @@ -187,9 +187,6 @@ {39F60708-FF48-4C22-0000-000000000007} - - {39F60708-FF48-4C22-0000-000000000001} - {39F60708-FF48-4C22-0000-000000000008} @@ -206,31 +203,31 @@ {39F60708-FF48-4C22-0000-00000000000C} - {39F60708-FF48-4C22-0000-000000000002} + {39F60708-FF48-4C22-0000-000000000001} {39F60708-FF48-4C22-0000-00000000000D} - {39F60708-FF48-4C22-0000-000000000003} + {39F60708-FF48-4C22-0000-000000000002} - {39F60708-FF48-4C22-0000-000000000006} + {39F60708-FF48-4C22-0000-000000000005} - {39F60708-FF48-4C22-0000-000000000004} + {39F60708-FF48-4C22-0000-000000000003} - {39F60708-FF48-4C22-0000-000000000007} + {39F60708-FF48-4C22-0000-000000000006} - {39F60708-FF48-4C22-0000-000000000005} + {39F60708-FF48-4C22-0000-000000000004} {39F60708-FF48-4C22-0000-00000000000E} - {39F60708-FF48-4C22-0000-000000000008} + {39F60708-FF48-4C22-0000-000000000007} @@ -390,24 +387,6 @@ src\hash - - src\hash\vectorization - - - src\hash\vectorization - - - src\hash\vectorization - - - src\hash\vectorization - - - src\hash\vectorization - - - src\hash\vectorization - src\math @@ -1519,6 +1498,9 @@ include\bitcoin\system\impl\hash\sha + + include\bitcoin\system\impl\hash\sha + include\bitcoin\system\impl\hash\sha diff --git a/include/bitcoin/system/hash/sha/algorithm.hpp b/include/bitcoin/system/hash/sha/algorithm.hpp index 077beb84b2..48caa62dac 100644 --- a/include/bitcoin/system/hash/sha/algorithm.hpp +++ b/include/bitcoin/system/hash/sha/algorithm.hpp @@ -144,7 +144,8 @@ class algorithm /// Intrinsics types. /// ----------------------------------------------------------------------- - /// Extended integer capacity for uint32_t/uint64_t is 2/4/8/16 only. + /// Expand is multiple of buffer/state for Lane concurrent blocks. + /// Multiple blocks are "striped" across the expanded buffer in xWords. template > = true> using xblock_t = std_array; @@ -157,6 +158,17 @@ class algorithm template = true> using xchunk_t = std_array; + /// Wide is casting of buffer_t to xWord for single block concurrency. + /// This is not multi-block or block striping, just larger words. + template = true> + using wbuffer_t = std_array; + + template = true> + using wstate_t = std_array; + + /// Other types. + /// ----------------------------------------------------------------------- + using uint = unsigned int; using idigests_t = mutable_iterable; using pad_t = std_array INLINE static constexpr void prepare(auto& buffer) NOEXCEPT; - INLINE static constexpr void add_k(auto& buffer) NOEXCEPT; static constexpr void schedule_(auto& buffer) NOEXCEPT; static constexpr void schedule(buffer_t& buffer) NOEXCEPT; @@ -242,7 +253,7 @@ class algorithm static constexpr void reinput(auto& buffer, const auto& state) NOEXCEPT; - /// Iteration. + /// Iteration (message scheduling vectorized for multiple blocks). /// ----------------------------------------------------------------------- template @@ -280,7 +291,7 @@ class algorithm const ablocks_t& blocks) NOEXCEPT; INLINE static void iterate(state_t& state, iblocks_t& blocks) NOEXCEPT; - /// Merkle hashing. + /// Merkle hashing (fully vectorized for multiple blocks). /// ----------------------------------------------------------------------- template @@ -311,7 +322,7 @@ class algorithm VCONSTEXPR static void merkle_hash_(digests_t& digests, size_t offset=zero) NOEXCEPT; - /// sigma0 vectorization. + /// sigma0 vectorization (single blocks). /// ----------------------------------------------------------------------- template = true> @@ -328,22 +339,45 @@ class algorithm INLINE static void schedule_sigma(xbuffer_t& xbuffer) NOEXCEPT; INLINE static void schedule_sigma(buffer_t& buffer) NOEXCEPT; - /// Native. + /// [K]onstant vectorization (single and multiple blocks). + /// ----------------------------------------------------------------------- + + template + INLINE static constexpr void konstant(auto& buffer) NOEXCEPT; + + template + INLINE static void vector_konstant(wbuffer_t& wbuffer) NOEXCEPT; + INLINE static void vector_konstant(buffer_t& buffer) NOEXCEPT; + + template + static constexpr void konstant(xbuffer_t& xbuffer) NOEXCEPT; + static constexpr void konstant(buffer_t& buffer) NOEXCEPT; + static constexpr void konstant_(auto& buffer) NOEXCEPT; + + /// Native SHA optimizations (single blocks). /// ----------------------------------------------------------------------- - static constexpr auto native_lanes = capacity; - static constexpr auto native_rounds = SHA::rounds / native_lanes; - using cbuffer_t = std_array; - using cstate_t = std_array; template - INLINE static void prepare(cbuffer_t& buffer) NOEXCEPT; - INLINE static void add_k(cbuffer_t& buffer) NOEXCEPT; - static void schedule(cbuffer_t& buffer) NOEXCEPT; + INLINE static void prepare_native(wbuffer_t& wbuffer) NOEXCEPT; + static void schedule(wbuffer_t& wbuffer) NOEXCEPT; template INLINE static void schedule_native(xbuffer_t& xbuffer) NOEXCEPT; INLINE static void schedule_native(buffer_t& buffer) NOEXCEPT; + template + INLINE static void round_native(wstate_t& state, + const wbuffer_t& wk) NOEXCEPT; + + INLINE static void shuffle(wstate_t& wstate) NOEXCEPT; + INLINE static void unshuffle(wstate_t& wstate) NOEXCEPT; + INLINE static void summarize_native(wstate_t& out, + const wstate_t& in) NOEXCEPT; + + template + INLINE static void compress_native(wstate_t& state, + const wbuffer_t& wbuffer) NOEXCEPT; + template INLINE static void compress_native(xstate_t& xstate, const xbuffer_t& xbuffer) NOEXCEPT; @@ -381,6 +415,7 @@ BC_PUSH_WARNING(NO_POINTER_ARITHMETIC) BC_PUSH_WARNING(NO_ARRAY_INDEXING) #include +#include #include #include #include diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp index ada7bae8f4..f9da36fd0f 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp @@ -173,7 +173,6 @@ template constexpr void CLASS:: compress_(auto& state, const auto& buffer) NOEXCEPT { - // SHA-NI/256: 64/4 = 16 quad rounds, 8/4 = 2 state elements. // This is a copy (state type varies due to vectorization). const auto start = state; diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp new file mode 100644 index 0000000000..32afdfaec4 --- /dev/null +++ b/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp @@ -0,0 +1,279 @@ +/** + * Copyright (c) 2011-2024 libbitcoin developers (see AUTHORS) + * + * This file is part of libbitcoin. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ +#ifndef LIBBITCOIN_SYSTEM_HASH_SHA_ALGORITHM_KONSTANT_IPP +#define LIBBITCOIN_SYSTEM_HASH_SHA_ALGORITHM_KONSTANT_IPP + +#include + +// [K]onstant adding. +// ============================================================================ + +namespace libbitcoin { +namespace system { +namespace sha { + +// single or expanded (vectorized) buffer +// ---------------------------------------------------------------------------- +// protected + +TEMPLATE +template +INLINE constexpr void CLASS:: +konstant(auto& buffer) NOEXCEPT +{ + // K is broadcast across blocks. + buffer[Round] = f::addc(buffer[Round]); +} + +// wide (vectorized) buffer +// ---------------------------------------------------------------------------- +// protected + +TEMPLATE +template +INLINE void CLASS:: +vector_konstant(wbuffer_t& wbuffer) NOEXCEPT +{ + constexpr auto r = Round; + constexpr auto s = SHA::word_bits; + constexpr auto lanes = capacity; + + if constexpr (lanes == 2) + { + wbuffer[Round] = f::add(wbuffer[Round], set( + K::get[r + 1], K::get[r + 0])); + } + else if constexpr (lanes == 4) + { + wbuffer[Round] = f::add(wbuffer[Round], set( + K::get[r + 3], K::get[r + 2], K::get[r + 1], K::get[r + 0])); + } + else if constexpr (lanes == 8) + { + wbuffer[Round] = f::add(wbuffer[Round], set( + K::get[r + 7], K::get[r + 6], K::get[r + 5], K::get[r + 4], + K::get[r + 3], K::get[r + 2], K::get[r + 1], K::get[r + 0])); + } + else if constexpr (lanes == 16) + { + wbuffer[Round] = f::add(wbuffer[Round], set( + K::get[r + 15], K::get[r + 14], K::get[r + 13], K::get[r + 12], + K::get[r + 11], K::get[r + 10], K::get[r + 9], K::get[r + 8], + K::get[r + 7], K::get[r + 6], K::get[r + 5], K::get[r + 4], + K::get[r + 3], K::get[r + 2], K::get[r + 1], K::get[r + 0])); + } +} + +TEMPLATE +void CLASS:: +vector_konstant(buffer_t& buffer) NOEXCEPT +{ + if constexpr (use_x512) + { + auto& wbuffer = array_cast(buffer); + vector_konstant<0>(wbuffer); + vector_konstant<1>(wbuffer); + vector_konstant<2>(wbuffer); + vector_konstant<3>(wbuffer); + + if constexpr (SHA::rounds == 80) + { + vector_konstant<4>(wbuffer); + } + } + else if constexpr (use_x256) + { + auto& wbuffer = array_cast(buffer); + vector_konstant<0>(wbuffer); + vector_konstant<1>(wbuffer); + vector_konstant<2>(wbuffer); + vector_konstant<3>(wbuffer); + vector_konstant<4>(wbuffer); + vector_konstant<5>(wbuffer); + vector_konstant<6>(wbuffer); + vector_konstant<7>(wbuffer); + + if constexpr (SHA::rounds == 80) + { + vector_konstant<8>(wbuffer); + vector_konstant<9>(wbuffer); + } + } + else if constexpr (use_x128) + { + auto& wbuffer = array_cast(buffer); + vector_konstant<0>(wbuffer); + vector_konstant<1>(wbuffer); + vector_konstant<2>(wbuffer); + vector_konstant<3>(wbuffer); + vector_konstant<4>(wbuffer); + vector_konstant<5>(wbuffer); + vector_konstant<6>(wbuffer); + vector_konstant<7>(wbuffer); + vector_konstant<8>(wbuffer); + vector_konstant<9>(wbuffer); + vector_konstant<10>(wbuffer); + vector_konstant<11>(wbuffer); + vector_konstant<12>(wbuffer); + vector_konstant<13>(wbuffer); + vector_konstant<14>(wbuffer); + vector_konstant<15>(wbuffer); + + if constexpr (SHA::rounds == 80) + { + vector_konstant<16>(wbuffer); + vector_konstant<17>(wbuffer); + vector_konstant<18>(wbuffer); + vector_konstant<19>(wbuffer); + } + } + else + { + konstant_(buffer); + } +} + +// dispatch +// ---------------------------------------------------------------------------- +// protected + +TEMPLATE +constexpr void CLASS:: +konstant_(auto& buffer) NOEXCEPT +{ + konstant<0>(buffer); + konstant<1>(buffer); + konstant<2>(buffer); + konstant<3>(buffer); + konstant<4>(buffer); + konstant<5>(buffer); + konstant<6>(buffer); + konstant<7>(buffer); + konstant<8>(buffer); + konstant<9>(buffer); + konstant<10>(buffer); + konstant<11>(buffer); + konstant<12>(buffer); + konstant<13>(buffer); + konstant<14>(buffer); + konstant<15>(buffer); + + konstant<16>(buffer); + konstant<17>(buffer); + konstant<18>(buffer); + konstant<19>(buffer); + konstant<20>(buffer); + konstant<21>(buffer); + konstant<22>(buffer); + konstant<23>(buffer); + konstant<24>(buffer); + konstant<25>(buffer); + konstant<26>(buffer); + konstant<27>(buffer); + konstant<28>(buffer); + konstant<29>(buffer); + konstant<30>(buffer); + konstant<31>(buffer); + + konstant<32>(buffer); + konstant<33>(buffer); + konstant<34>(buffer); + konstant<35>(buffer); + konstant<36>(buffer); + konstant<37>(buffer); + konstant<38>(buffer); + konstant<39>(buffer); + konstant<40>(buffer); + konstant<41>(buffer); + konstant<42>(buffer); + konstant<43>(buffer); + konstant<44>(buffer); + konstant<45>(buffer); + konstant<46>(buffer); + konstant<47>(buffer); + + konstant<48>(buffer); + konstant<49>(buffer); + konstant<50>(buffer); + konstant<51>(buffer); + konstant<52>(buffer); + konstant<53>(buffer); + konstant<54>(buffer); + konstant<55>(buffer); + konstant<56>(buffer); + konstant<57>(buffer); + konstant<58>(buffer); + konstant<59>(buffer); + konstant<60>(buffer); + konstant<61>(buffer); + konstant<62>(buffer); + konstant<63>(buffer); + + if constexpr (SHA::rounds == 80) + { + konstant<64>(buffer); + konstant<65>(buffer); + konstant<66>(buffer); + konstant<67>(buffer); + konstant<68>(buffer); + konstant<69>(buffer); + konstant<70>(buffer); + konstant<71>(buffer); + konstant<72>(buffer); + konstant<73>(buffer); + konstant<74>(buffer); + konstant<75>(buffer); + konstant<76>(buffer); + konstant<77>(buffer); + konstant<78>(buffer); + konstant<79>(buffer); + } +} + +TEMPLATE +template +INLINE constexpr void CLASS:: +konstant(xbuffer_t& xbuffer) NOEXCEPT +{ + konstant_(xbuffer); +} + +TEMPLATE +INLINE constexpr void CLASS:: +konstant(buffer_t& buffer) NOEXCEPT +{ + if (std::is_constant_evaluated()) + { + konstant_(buffer); + } + ////else if constexpr (vector) + ////{ + //// vector_konstant(buffer); + ////} + else + { + konstant_(buffer); + } +} + +} // namespace sha +} // namespace system +} // namespace libbitcoin + +#endif diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp index 1255d925bb..06553308d5 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp @@ -35,147 +35,91 @@ namespace libbitcoin { namespace system { namespace sha { + +// schedule +// ---------------------------------------------------------------------------- +// protected TEMPLATE template INLINE void CLASS:: -prepare(cbuffer_t& buffer) NOEXCEPT +prepare_native(wbuffer_t& wbuffer) NOEXCEPT { - // K-adding is shifted 16 words, with last 16 added after scheduling. - if constexpr (SHA::strength == 160) { - ////static_assert(false, "sha160 not implemented"); - } - else if constexpr (use_neon) - { - ////static_assert(false, "neon not implemented"); + if constexpr (use_neon) + { + } + else if constexpr (use_shani) + { + } } - else + else if constexpr (SHA::strength == 256) { - static_assert(SHA::strength == 256); - - constexpr auto r1 = Round - 1; - constexpr auto r2 = sub1(r1); - constexpr auto r3 = sub1(r2); - constexpr auto r4 = sub1(r3); - constexpr auto k0 = Round * 4 - 16; - constexpr auto k1 = add1(k0); - constexpr auto k2 = add1(k1); - constexpr auto k3 = add1(k2); - - buffer[Round] = mm_sha256msg2_epu32 + if constexpr (use_neon) + { + } + else if constexpr (use_shani) + { + wbuffer[Round] = mm_sha256msg2_epu32 ( mm_add_epi32 ( mm_alignr_epi8 ( - buffer[r1], buffer[r2], SHA::word_bytes + wbuffer[Round - 1], wbuffer[Round - 2], SHA::word_bytes ), mm_sha256msg1_epu32 ( - buffer[r4], buffer[r3] + wbuffer[Round - 4], wbuffer[Round - 3] ) ), - buffer[r1] - ); - - buffer[r4] = mm_add_epi32 - ( - buffer[r4], - mm_set_epi32(K::get[k3], K::get[k2], K::get[k1], K::get[k0]) + wbuffer[Round - 1] ); + } } } TEMPLATE INLINE void CLASS:: -add_k(cbuffer_t& buffer) NOEXCEPT +schedule(wbuffer_t& wbuffer) NOEXCEPT { - // Add K to last 16 words. - // TODO: Consolidated K-adding can be performed in 4/8/16 lanes. - constexpr auto k = SHA::rounds - SHA::block_words; - constexpr auto r = k / native_lanes; - - buffer[r + 0] = mm_add_epi32 - ( - buffer[r + 0], - mm_set_epi32( - K::get[k + 3], K::get[k + 2], - K::get[k + 1], K::get[k + 0]) - ); - - buffer[r + 1] = mm_add_epi32 - ( - buffer[r + 1], - mm_set_epi32( - K::get[k + 7], K::get[k + 6], - K::get[k + 5], K::get[k + 4]) - ); - - buffer[r + 2] = mm_add_epi32 - ( - buffer[r + 2], - mm_set_epi32( - K::get[k + 11], K::get[k + 10], - K::get[k + 9], K::get[k + 8]) - ); - - buffer[r + 3] = mm_add_epi32 - ( - buffer[r + 3], - mm_set_epi32( - K::get[k + 15], K::get[k + 14], - K::get[k + 13], K::get[k + 12]) - ); -} + prepare_native<4>(wbuffer); + prepare_native<5>(wbuffer); + prepare_native<6>(wbuffer); + prepare_native<7>(wbuffer); + prepare_native<8>(wbuffer); + prepare_native<9>(wbuffer); + prepare_native<10>(wbuffer); + prepare_native<11>(wbuffer); + prepare_native<12>(wbuffer); + prepare_native<13>(wbuffer); + prepare_native<14>(wbuffer); + prepare_native<15>(wbuffer); -TEMPLATE -INLINE void CLASS:: -schedule(cbuffer_t& buffer) NOEXCEPT -{ - auto& cbuffer = array_cast(buffer); - - prepare<4>(cbuffer); - prepare<5>(cbuffer); - prepare<6>(cbuffer); - prepare<7>(cbuffer); - prepare<8>(cbuffer); - prepare<9>(cbuffer); - prepare<10>(cbuffer); - prepare<11>(cbuffer); - prepare<12>(cbuffer); - prepare<13>(cbuffer); - prepare<14>(cbuffer); - prepare<15>(cbuffer); - - ////if constexpr (SHA::rounds == 80) - ////{ - //// prepare<16>(buffer); - //// prepare<17>(buffer); - //// prepare<18>(buffer); - //// prepare<19>(buffer); - ////} + if constexpr (SHA::rounds == 80) + { + prepare_native<16>(wbuffer); + prepare_native<17>(wbuffer); + prepare_native<18>(wbuffer); + prepare_native<19>(wbuffer); + } - add_k(buffer); + konstant(array_cast(wbuffer)); } -// schedule -// ---------------------------------------------------------------------------- -// protected - TEMPLATE INLINE void CLASS:: schedule_native(buffer_t& buffer) NOEXCEPT { // neon and sha160 not yet implemented, sha512 is not native. - if constexpr (SHA::strength == 160 || SHA::strength == 512 || use_neon) + if constexpr (SHA::strength == 256 && !use_neon) { - schedule_(buffer); + schedule(array_cast(buffer)); } else { - schedule(array_cast(buffer)); + schedule_(buffer); } } @@ -192,6 +136,134 @@ schedule_native(xbuffer_t& xbuffer) NOEXCEPT // ---------------------------------------------------------------------------- // protected +TEMPLATE +template +INLINE void CLASS:: +round_native(wstate_t& state, + const wbuffer_t& wk) NOEXCEPT +{ + if constexpr (SHA::strength == 160) + { + if constexpr (use_neon) + { + } + else if constexpr (use_shani) + { + } + } + else if constexpr (SHA::strength == 256) + { + if constexpr (use_neon) + { + } + else if constexpr (use_shani) + { + // Process wk[Round][0..1], [HGDC][FEBA] (initial state) + state[1] = mm_sha256rnds2_epu32(state[1], state[0], wk[Round]); + + // Process wk[Round][2..3] (shifted down) + state[0] = mm_sha256rnds2_epu32(state[0], state[1], + mm_shuffle_epi32(wk[Round], 0x0e)); + } + } +} + +TEMPLATE +INLINE void CLASS:: +summarize_native(wstate_t& out, + const wstate_t& in) NOEXCEPT +{ + if constexpr (SHA::strength == 160) + { + if constexpr (use_neon) + { + } + else if constexpr (use_shani) + { + } + } + else if constexpr (SHA::strength == 256) + { + if constexpr (use_neon) + { + } + else if constexpr (use_shani) + { + out[0] = mm_add_epi32(out[0], in[0]); + out[1] = mm_add_epi32(out[1], in[1]); + } + } +} + +TEMPLATE +INLINE void CLASS:: +shuffle(wstate_t& wstate) NOEXCEPT +{ + // Change wstate to mm_sha256rnds2_epu32 expected form: + // [ABCD][EFGH] -> [FEBA][HGDC] (ordered low to high). + const auto t1 = mm_shuffle_epi32(wstate[0], 0xb1); + const auto t2 = mm_shuffle_epi32(wstate[1], 0x1b); + wstate[0] = mm_alignr_epi8(t1, t2, 8); + wstate[1] = mm_blend_epi16(t2, t1, 15); +} + +TEMPLATE +INLINE void CLASS:: +unshuffle(wstate_t& wstate) NOEXCEPT +{ + // Restore wstate to normal form: + // [FEBA][HGDC] -> [ABCD][EFGH] (ordered low to high). + const auto t1 = mm_shuffle_epi32(wstate[0], 0x1b); + const auto t2 = mm_shuffle_epi32(wstate[1], 0xb1); + wstate[0] = mm_blend_epi16(t1, t2, 15); + wstate[1] = mm_alignr_epi8(t2, t1, 8); +} + +TEMPLATE +template +INLINE void CLASS:: +compress_native(wstate_t& wstate, + const wbuffer_t& wbuffer) NOEXCEPT +{ + // Shuffle and unshuffle can be done outside of all blocks, but this would + // leave state in a non-normal form, so presently absorbing that cost. + shuffle(wstate); + + // This is a copy. + const auto start = wstate; + + round_native< 0, Lane>(wstate, wbuffer); + round_native< 1, Lane>(wstate, wbuffer); + round_native< 2, Lane>(wstate, wbuffer); + round_native< 3, Lane>(wstate, wbuffer); + round_native< 4, Lane>(wstate, wbuffer); + round_native< 5, Lane>(wstate, wbuffer); + round_native< 6, Lane>(wstate, wbuffer); + round_native< 7, Lane>(wstate, wbuffer); + round_native< 8, Lane>(wstate, wbuffer); + round_native< 9, Lane>(wstate, wbuffer); + round_native<10, Lane>(wstate, wbuffer); + round_native<11, Lane>(wstate, wbuffer); + round_native<12, Lane>(wstate, wbuffer); + round_native<13, Lane>(wstate, wbuffer); + round_native<14, Lane>(wstate, wbuffer); + round_native<15, Lane>(wstate, wbuffer); + + if constexpr (SHA::rounds == 80) + { + round_native<16, Lane>(wstate, wbuffer); + round_native<17, Lane>(wstate, wbuffer); + round_native<18, Lane>(wstate, wbuffer); + round_native<19, Lane>(wstate, wbuffer); + } + + // This is just a vectorized version of summarize(). + summarize_native(wstate, start); + + // See above comments on shuffle(). + unshuffle(wstate); +} + TEMPLATE template INLINE void CLASS:: @@ -216,8 +288,18 @@ template INLINE void CLASS:: compress_native(state_t& state, const buffer_t& buffer) NOEXCEPT { - // TODO: Single block compression. - compress_(state, buffer); + // TODO: debug. + // TODO: sha160 state is too small to array cast into two xwords. + // neon and sha160 not yet implemented, sha512 is not native. + ////if constexpr (SHA::strength == 256 && !use_neon) + ////{ + //// compress_native(array_cast(state), + //// array_cast(buffer)); + ////} + ////else + { + compress_(state, buffer); + } } } // namespace sha diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp index cf4b99ae8c..f2796d4b5c 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp @@ -36,63 +36,22 @@ template INLINE constexpr void CLASS:: prepare(auto& buffer) NOEXCEPT { - // K-adding is shifted 16 words, with last 16 added after scheduling. constexpr auto s = SHA::word_bits; if constexpr (SHA::strength == 160) { - constexpr auto r03 = Round - 3; - constexpr auto r08 = Round - 8; - constexpr auto r14 = Round - 14; - constexpr auto r16 = Round - 16; - buffer[Round] = f::rol<1, s>(f::xor_( - f::xor_(buffer[r16], buffer[r14]), - f::xor_(buffer[r08], buffer[r03]))); - - buffer[r16] = f::addc(buffer[r16]); + f::xor_(buffer[Round - 16], buffer[Round - 14]), + f::xor_(buffer[Round - 8], buffer[Round - 3]))); } else { - constexpr auto r02 = Round - 2; - constexpr auto r07 = Round - 7; - constexpr auto r15 = Round - 15; - constexpr auto r16 = Round - 16; - buffer[Round] = f::add( - f::add(buffer[r16], sigma0(buffer[r15])), - f::add(buffer[r07], sigma1(buffer[r02]))); - - buffer[r16] = f::addc(buffer[r16]); + f::add(buffer[Round - 16], sigma0(buffer[Round - 15])), + f::add(buffer[Round - 7], sigma1(buffer[Round - 2]))); } } -TEMPLATE -INLINE constexpr void CLASS:: -add_k(auto& buffer) NOEXCEPT -{ - // Add K to last 16 words. - // TODO: Consolidated K-adding can be performed in 4/8/16 lanes. - constexpr auto s = SHA::word_bits; - constexpr auto r = SHA::rounds - SHA::block_words; - buffer[r + 0] = f::addc(buffer[r + 0]); - buffer[r + 1] = f::addc(buffer[r + 1]); - buffer[r + 2] = f::addc(buffer[r + 2]); - buffer[r + 3] = f::addc(buffer[r + 3]); - buffer[r + 4] = f::addc(buffer[r + 4]); - buffer[r + 5] = f::addc(buffer[r + 5]); - buffer[r + 6] = f::addc(buffer[r + 6]); - buffer[r + 7] = f::addc(buffer[r + 7]); - buffer[r + 8] = f::addc(buffer[r + 8]); - buffer[r + 9] = f::addc(buffer[r + 9]); - buffer[r + 10] = f::addc(buffer[r + 10]); - buffer[r + 11] = f::addc(buffer[r + 11]); - buffer[r + 12] = f::addc(buffer[r + 12]); - buffer[r + 13] = f::addc(buffer[r + 13]); - buffer[r + 14] = f::addc(buffer[r + 14]); - buffer[r + 15] = f::addc(buffer[r + 15]); -} - TEMPLATE constexpr void CLASS:: schedule_(auto& buffer) NOEXCEPT @@ -168,7 +127,7 @@ schedule_(auto& buffer) NOEXCEPT prepare<79>(buffer); } - add_k(buffer); + konstant(buffer); } TEMPLATE diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp index 5f82112743..e6f880dd64 100644 --- a/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp +++ b/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp @@ -49,12 +49,10 @@ prepare1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT constexpr auto r16 = Round - 16; constexpr auto s = SHA::word_bits; - // buffer[r07 + 7] is buffer[Round + 0] - // This is why sigma0 is limited to 8 lanes (vs 16). + // buffer[r07 + 7] is buffer[Round + 0], so sigma0 is limited to 8 lanes. buffer[Round + Offset] = f::add( f::add(buffer[r16 + Offset], get(xsigma0)), f::add(buffer[r07 + Offset], sigma1(buffer[r02 + Offset]))); - buffer[r16 + Offset] = f::addc(buffer[r16 + Offset]); } TEMPLATE @@ -95,7 +93,7 @@ schedule_sigma(xbuffer_t& xbuffer) NOEXCEPT } TEMPLATE -INLINE void CLASS:: +void CLASS:: schedule_sigma(buffer_t& buffer) NOEXCEPT { if constexpr (SHA::strength != 160 && have_lanes()) @@ -113,7 +111,7 @@ schedule_sigma(buffer_t& buffer) NOEXCEPT prepare8<72>(buffer); } - add_k(buffer); + konstant(buffer); } else { diff --git a/include/bitcoin/system/intrinsics/xcpu/defines.hpp b/include/bitcoin/system/intrinsics/xcpu/defines.hpp index 79e5b06334..7594327053 100644 --- a/include/bitcoin/system/intrinsics/xcpu/defines.hpp +++ b/include/bitcoin/system/intrinsics/xcpu/defines.hpp @@ -133,6 +133,8 @@ BC_POP_WARNING() #define mm_extract_epi32(a, Lane) {} #define mm_extract_epi64(a, Lane) {} #define mm_shuffle_epi8(a, mask) (a) + #define mm_shuffle_epi32(a, mask) (a) + #define mm_blend_epi16(a, b, mask) (a) #define mm_load_si128(a) {} #define mm_loadu_si128(a) {} #define mm_store_si128(memory, a) @@ -167,6 +169,8 @@ BC_POP_WARNING() #define mm_extract_epi32(a, Lane) _mm_extract_epi32(a, Lane) #define mm_extract_epi64(a, Lane) _mm_extract_epi64(a, Lane) // undefined for X32 #define mm_shuffle_epi8(a, mask) _mm_shuffle_epi8(a, mask) + #define mm_shuffle_epi32(a, mask) _mm_shuffle_epi32(a, mask) + #define mm_blend_epi16(a, b, mask) _mm_blend_epi16(a, b, mask) #define mm_load_si128(a) _mm_load_si128(a) #define mm_loadu_si128(a) _mm_loadu_si128(a) #define mm_store_si128(memory, a) _mm_store_si128(memory, a) diff --git a/src/hash/vectorization/sha256_1_native.cpp b/src/hash/vectorization/sha256_1_native.cpp deleted file mode 100644 index bcb0b7b96e..0000000000 --- a/src/hash/vectorization/sha256_1_native.cpp +++ /dev/null @@ -1,302 +0,0 @@ -/** - * Copyright (c) 2011-2023 libbitcoin developers (see AUTHORS) - * - * This file is part of libbitcoin. - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . - */ -#include -#include -#include -#include -#include -#include - -// sha256: movable-type.co.uk/scripts/sha256.html -// Use inline vs. constexpr to obtain intrinsic std::rotr. - -namespace libbitcoin { -namespace system { -namespace sha256 { - -#if defined (DISABLED) - -constexpr auto choice(auto x, auto y, auto z) NOEXCEPT -{ - return (x & (y ^ z)) ^ z; -} - -constexpr auto majority(auto x, auto y, auto z) NOEXCEPT -{ - return (x & (y | z)) | (y & z); -} - -inline auto SIGMA0(auto a) NOEXCEPT -{ - return std::rotr(a, 2) ^ std::rotr(a, 13) ^ std::rotr(a, 22); -} - -inline auto SIGMA1(auto a) NOEXCEPT -{ - return std::rotr(a, 6) ^ std::rotr(a, 11) ^ std::rotr(a, 25); -} - -inline auto sigma0(auto a) NOEXCEPT -{ - return std::rotr(a, 7) ^ std::rotr(a, 18) ^ (a >> 3); -} - -inline auto sigma1(auto a) NOEXCEPT -{ - return std::rotr(a, 17) ^ std::rotr(a, 19) ^ (a >> 10); -} - -inline void round(auto a, auto b, auto c, auto& out_d, auto e, auto f, auto g, - auto& out_h, auto k) NOEXCEPT -{ - const auto t0 = SIGMA1(e) + choice(e, f, g) + out_h + k; - const auto t1 = SIGMA0(a) + majority(a, b, c); - out_d += t0; - out_h = t0 + t1; -} - -template -constexpr void round(state& out, const buffer& in) NOEXCEPT -{ - BC_PUSH_WARNING(NO_ARRAY_INDEXING) - round( - out[(block_size + 0 - Round) % state_size], - out[(block_size + 1 - Round) % state_size], - out[(block_size + 2 - Round) % state_size], - out[(block_size + 3 - Round) % state_size], // in/out - out[(block_size + 4 - Round) % state_size], - out[(block_size + 5 - Round) % state_size], - out[(block_size + 6 - Round) % state_size], - out[(block_size + 7 - Round) % state_size], // in/out - in[Round] + K); - BC_POP_WARNING() -} - -template -inline void set(buffer& out) NOEXCEPT -{ - BC_PUSH_WARNING(NO_ARRAY_INDEXING) - out[Offset] = - sigma1(out[Offset - 2]) + out[Offset - 7] + - sigma0(out[Offset - 15]) + out[Offset - 16]; - BC_POP_WARNING() -} - -inline void expand48(buffer& out) NOEXCEPT -{ - set<16>(out); - set<17>(out); - set<18>(out); - set<19>(out); - set<20>(out); - set<21>(out); - set<22>(out); - set<23>(out); - set<24>(out); - set<25>(out); - set<26>(out); - set<27>(out); - set<28>(out); - set<29>(out); - set<30>(out); - set<31>(out); - set<32>(out); - set<33>(out); - set<34>(out); - set<35>(out); - set<36>(out); - set<37>(out); - set<38>(out); - set<39>(out); - set<40>(out); - set<41>(out); - set<42>(out); - set<43>(out); - set<44>(out); - set<45>(out); - set<46>(out); - set<47>(out); - set<48>(out); - set<49>(out); - set<50>(out); - set<51>(out); - set<52>(out); - set<53>(out); - set<54>(out); - set<55>(out); - set<56>(out); - set<57>(out); - set<58>(out); - set<59>(out); - set<60>(out); - set<61>(out); - set<62>(out); - set<63>(out); -} - -inline void rounds64(state& out, const buffer& in) NOEXCEPT -{ - round< 0, 0x428a2f98>(out, in); - round< 1, 0x71374491>(out, in); - round< 2, 0xb5c0fbcf>(out, in); - round< 3, 0xe9b5dba5>(out, in); - round< 4, 0x3956c25b>(out, in); - round< 5, 0x59f111f1>(out, in); - round< 6, 0x923f82a4>(out, in); - round< 7, 0xab1c5ed5>(out, in); - round< 8, 0xd807aa98>(out, in); - round< 9, 0x12835b01>(out, in); - round<10, 0x243185be>(out, in); - round<11, 0x550c7dc3>(out, in); - round<12, 0x72be5d74>(out, in); - round<13, 0x80deb1fe>(out, in); - round<14, 0x9bdc06a7>(out, in); - round<15, 0xc19bf174>(out, in); - round<16, 0xe49b69c1>(out, in); - round<17, 0xefbe4786>(out, in); - round<18, 0x0fc19dc6>(out, in); - round<19, 0x240ca1cc>(out, in); - round<20, 0x2de92c6f>(out, in); - round<21, 0x4a7484aa>(out, in); - round<22, 0x5cb0a9dc>(out, in); - round<23, 0x76f988da>(out, in); - round<24, 0x983e5152>(out, in); - round<25, 0xa831c66d>(out, in); - round<26, 0xb00327c8>(out, in); - round<27, 0xbf597fc7>(out, in); - round<28, 0xc6e00bf3>(out, in); - round<29, 0xd5a79147>(out, in); - round<30, 0x06ca6351>(out, in); - round<31, 0x14292967>(out, in); - round<32, 0x27b70a85>(out, in); - round<33, 0x2e1b2138>(out, in); - round<34, 0x4d2c6dfc>(out, in); - round<35, 0x53380d13>(out, in); - round<36, 0x650a7354>(out, in); - round<37, 0x766a0abb>(out, in); - round<38, 0x81c2c92e>(out, in); - round<39, 0x92722c85>(out, in); - round<40, 0xa2bfe8a1>(out, in); - round<41, 0xa81a664b>(out, in); - round<42, 0xc24b8b70>(out, in); - round<43, 0xc76c51a3>(out, in); - round<44, 0xd192e819>(out, in); - round<45, 0xd6990624>(out, in); - round<46, 0xf40e3585>(out, in); - round<47, 0x106aa070>(out, in); - round<48, 0x19a4c116>(out, in); - round<49, 0x1e376c08>(out, in); - round<50, 0x2748774c>(out, in); - round<51, 0x34b0bcb5>(out, in); - round<52, 0x391c0cb3>(out, in); - round<53, 0x4ed8aa4a>(out, in); - round<54, 0x5b9cca4f>(out, in); - round<55, 0x682e6ff3>(out, in); - round<56, 0x748f82ee>(out, in); - round<57, 0x78a5636f>(out, in); - round<58, 0x84c87814>(out, in); - round<59, 0x8cc70208>(out, in); - round<60, 0x90befffa>(out, in); - round<61, 0xa4506ceb>(out, in); - round<62, 0xbef9a3f7>(out, in); - round<63, 0xc67178f2>(out, in); -} - -inline void summary8(state& out, const state& in) NOEXCEPT -{ - BC_PUSH_WARNING(NO_ARRAY_INDEXING) - out[0] += in[0]; - out[1] += in[1]; - out[2] += in[2]; - out[3] += in[3]; - out[4] += in[4]; - out[5] += in[5]; - out[6] += in[6]; - out[7] += in[7]; - BC_POP_WARNING() -} - -inline void copying8(buffer& out, const state& in) NOEXCEPT -{ - auto& to = array_cast(out); - to = in; -} - -inline void copyin64(buffer& out, const buffer& in) NOEXCEPT -{ - out = in; -} - -inline void bigend16(buffer& out, const block& in) NOEXCEPT -{ - constexpr auto size = block_size / sizeof(uint32_t); - auto& from = array_cast(in); - auto& to = array_cast(out); - from_big_endians(to, from); -} - -inline void bigend08(digest& out, const state& in) NOEXCEPT -{ - auto& to = array_cast(out); - to_big_endians(to, in); -} - -inline void padding8(buffer& out) NOEXCEPT -{ - auto& to = array_cast(out); - to = pad32; -} - -// This requires 32 more words of memory than a circular variables buffer. -// According to FIPS180 this is more performant given no memory constraint. -void hash_native(state& state, const block& block) NOEXCEPT -{ - buffer words; - const sha256::state start{ state }; - bigend16(words, block); - expand48(words); - rounds64(state, words); - summary8(state, start); -} - -// TODO: template with sized array of blocks. -void hash_native(state& state, const block1& blocks) NOEXCEPT -{ - buffer words; - - for (auto& block: blocks) - { - const sha256::state start{ state }; - bigend16(words, block); - expand48(words); - rounds64(state, words); - summary8(state, start); - } -} - -void hash_finalize(digest& digest, const state& state) NOEXCEPT -{ - bigend08(digest, state); -} - -#endif - -} // namespace sha256 -} // namespace system -} // namespace libbitcoin diff --git a/src/hash/vectorization/sha256_2_shani.cpp b/src/hash/vectorization/sha256_2_shani.cpp deleted file mode 100644 index 54a789e358..0000000000 --- a/src/hash/vectorization/sha256_2_shani.cpp +++ /dev/null @@ -1,252 +0,0 @@ -// Based on: -// sha256-x86.c - Intel SHA extensions using C intrinsics -// Written and place in public domain by Jeffrey Walton -// Based on code from Intel, and by Sean Gulley for the miTLS project. - -#include -#include -#include -#include -#include - -namespace libbitcoin { -namespace system { -namespace sha256 { - -#if defined (DISABLED) - -#if !defined(HAVE_XCPU) - -void hash_shani(state&, const block1&) NOEXCEPT -{ - BC_ASSERT_MSG(false, "hash_shani undefined"); -} - -#else - -// See sse41 for defines. -using namespace i128; - -#ifndef VISUAL - -alignas(xint128_t) constexpr uint8_t mask[sizeof(xint128_t)] -{ - 0x03, 0x02, 0x01, 0x00, // 0x00010203ul - 0x07, 0x06, 0x05, 0x04, // 0x04050607ul - 0x0b, 0x0a, 0x09, 0x08, // 0x08090a0bul - 0x0f, 0x0e, 0x0d, 0x0c // 0x0c0d0e0ful -}; - -// Half of little endian IV. -alignas(xint128_t) constexpr uint8_t initial0[sizeof(xint128_t)] -{ - 0x8c, 0x68, 0x05, 0x9b, // 0x9b05688cul [5] - 0x7f, 0x52, 0x0e, 0x51, // 0x510e527ful [4] - 0x85, 0xae, 0x67, 0xbb, // 0xbb67ae85ul [1] - 0x67, 0xe6, 0x09, 0x6a // 0x6a09e667ul [0] -}; - -// Half of little endian IV. -alignas(xint128_t) constexpr uint8_t initial1[sizeof(xint128_t)] -{ - 0x19, 0xcd, 0xe0, 0x5b, // 0x5be0cd19ul [7] - 0xab, 0xd9, 0x83, 0x1f, // 0x1f83d9abul [6] - 0x3a, 0xf5, 0x4f, 0xa5, // 0xa54ff53aul [3] - 0x72, 0xf3, 0x6e, 0x3c // 0x3c6ef372ul [2] -}; - -// load/store i128 -// ---------------------------------------------------------------------------- - -// Loading is just an array_cast into the buffer. - -// Aligned only, do not use with unaligned values. -xint128_t load32x4a(const uint8_t& bytes) NOEXCEPT -{ - return _mm_load_si128(pointer_cast(&bytes)); -} - -xint128_t load32x4u(const uint32_t& bytes) NOEXCEPT -{ - return _mm_loadu_si128(pointer_cast(&bytes)); -} -void store32x4u(uint8_t& bytes, xint128_t value) NOEXCEPT -{ - _mm_storeu_si128(pointer_cast(&bytes), value); -} - -// Aligned but for public data? -xint128_t load(const uint8_t& data) NOEXCEPT -{ - static const auto flipper = load32x4a(mask[0]); - return i128::shuffle(load32x4a(data), flipper); -} - -// Aligned but for public data? -void store(uint8_t& out, xint128_t value) NOEXCEPT -{ - static const auto flipper = load32x4a(mask[0]); - store32x4u(out, i128::shuffle(value, flipper)); -} - -// sha256 -// ---------------------------------------------------------------------------- -// intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html -// intel.com/content/dam/develop/external/us/en/documents/intel-sha-extensions-white-paper.pdf - -// _mm_sha256rnds2_epu32 is power of sha-ni, round reduction to 4 lane native. -// But this needs to be applied to preparation as well, to retain that model. -// Otherwise the round dispath must be modified to use the circular var queue. -// And this changes the size of buffer_t (to words_t). -// _mm_sha1rnds4_epu32 is provided for sha160. This would optimize only script -// evaluation of the uncommon opcode, but will be almost free to implement. - -// _mm_sha256rnds2_epu32 performs two rounds, so this is four. -void round(xint128_t& s0, xint128_t& s1, uint64_t k1, uint64_t k0) NOEXCEPT -{ - // This is actually m + k precomputed for fixed single block padding. - const auto value = set(k1, k0); - - s1 = _mm_sha256rnds2_epu32(s1, s0, value); - s0 = _mm_sha256rnds2_epu32(s0, s1, i128::shuffle<0x0e>(value)); -} - -void round(xint128_t& s0, xint128_t& s1, xint128_t m, uint64_t k1, uint64_t k0) NOEXCEPT -{ - // The sum m + k is computed in the message schedule. - const auto value = sum(m, set(k1, k0)); - - s1 = _mm_sha256rnds2_epu32(s1, s0, value); - s0 = _mm_sha256rnds2_epu32(s0, s1, i128::shuffle<0x0e>(value)); -} - -void shift_message(xint128_t& out, xint128_t m) NOEXCEPT -{ - out = _mm_sha256msg1_epu32(out, m); -} - -void shift_message(xint128_t m0, xint128_t m1, xint128_t& out) NOEXCEPT -{ - constexpr auto shift = sizeof(uint32_t); - out = _mm_sha256msg2_epu32(sum(out, align_right(m1, m0)), m1); -} - -void shift_messages(xint128_t& out0, xint128_t m, - xint128_t& out1) NOEXCEPT -{ - shift_message(out0, m, out1); - shift_message(out0, m); -} - -// endianness -// ---------------------------------------------------------------------------- - -// Endianness of the buffer/digest should be computed outside of hash function. -// Given the full mutable buffer, can be parallallized and vectorized in place. - -void shuffle(xint128_t& s0, xint128_t& s1) NOEXCEPT -{ - const auto t1 = i128::shuffle<0xb1>(s0); - const auto t2 = i128::shuffle<0x1b>(s1); - s0 = align_right<8>(t1, t2); - s1 = blend<15>(t2, t1); -} - -void unshuffle(xint128_t& s0, xint128_t& s1) NOEXCEPT -{ - const xint128_t t1 = i128::shuffle<0x1b>(s0); - const xint128_t t2 = i128::shuffle<0xb1>(s1); - s0 = blend<15>(t1, t2); - s1 = align_right<8>(t2, t1); -} - -#endif - -////void hash_shani(state& state, const blocks& blocks) NOEXCEPT; -void hash_shani(state& state, const block1& blocks) NOEXCEPT -{ - BC_PUSH_WARNING(NO_ARRAY_INDEXING) - - xint128_t m0, m1, m2, m3, so0, so1; - - // From unaligned (public). - auto s0 = load32x4u(state[0]); - auto s1 = load32x4u(state[4]); - - // state/SHA is LE, so why bswap? - // must be treating state as digest. - shuffle(s0, s1); - - // Each round is four sha rounds. - // One block in four lanes. - for (auto& block: blocks) - { - // Remember old state. - so0 = s0; - so1 = s1; - - // One block loaded 16 bytes (1 uint128) per each of 4 messages. - // load data and transform. - m0 = load(block[0]); - - // shift message computes next 4 messages from prevous 4. - // K: 0xe9b5dba5[3] 0xb5c0fbcfull[2] 0x71374491[1] 0x428a2f98ull[0] - round(s0, s1, m0, 0xe9b5dba5b5c0fbcfull, 0x71374491428a2f98ull); - m1 = load(block[16]); - round(s0, s1, m1, 0xab1c5ed5923f82a4ull, 0x59f111f13956c25bull); - shift_message(m0, m1); // new m0 from m1 - m2 = load(block[32]); - round(s0, s1, m2, 0x550c7dc3243185beull, 0x12835b01d807aa98ull); - shift_message(m1, m2); - m3 = load(block[48]); - - // shift messages computes next 4 messages from prevous 8. - round(s0, s1, m3, 0xc19bf1749bdc06a7ull, 0x80deb1fe72be5d74ull); - shift_messages(m2, m3, m0); - round(s0, s1, m0, 0x240ca1cc0fc19dc6ull, 0xefbe4786E49b69c1ull); - shift_messages(m3, m0, m1); - round(s0, s1, m1, 0x76f988da5cb0a9dcull, 0x4a7484aa2de92c6full); - shift_messages(m0, m1, m2); - round(s0, s1, m2, 0xbf597fc7b00327c8ull, 0xa831c66d983e5152ull); - shift_messages(m1, m2, m3); - round(s0, s1, m3, 0x1429296706ca6351ull, 0xd5a79147c6e00bf3ull); - shift_messages(m2, m3, m0); - round(s0, s1, m0, 0x53380d134d2c6dfcull, 0x2e1b213827b70a85ull); - shift_messages(m3, m0, m1); - round(s0, s1, m1, 0x92722c8581c2c92eull, 0x766a0abb650a7354ull); - shift_messages(m0, m1, m2); - round(s0, s1, m2, 0xc76c51A3c24b8b70ull, 0xa81a664ba2bfe8a1ull); - shift_messages(m1, m2, m3); - round(s0, s1, m3, 0x106aa070f40e3585ull, 0xd6990624d192e819ull); - shift_messages(m2, m3, m0); - round(s0, s1, m0, 0x34b0bcb52748774cull, 0x1e376c0819a4c116ull); - shift_messages(m3, m0, m1); - round(s0, s1, m1, 0x682e6ff35b9cca4full, 0x4ed8aa4a391c0cb3ull); - shift_message(m0, m1, m2); - round(s0, s1, m2, 0x8cc7020884c87814ull, 0x78a5636f748f82eeull); - shift_message(m1, m2, m3); - round(s0, s1, m3, 0xc67178f2bef9A3f7ull, 0xa4506ceb90befffaull); - - // Combine with old state. - s0 = sum(s0, so0); - s1 = sum(s1, so1); - } - - // state/SHA is LE, so why bswap? - // must be treating state as digest. - unshuffle(s0, s1); - - // To not aligned. - store32x4u(state[0], s0); - store32x4u(state[4], s1); - - BC_POP_WARNING() -} - -#endif // HAVE_XCPU - -#endif // DISABLED - -} // namespace sha256 -} // namespace system -} // namespace libbitcoin diff --git a/src/hash/vectorization/sha256_4_neon.cpp b/src/hash/vectorization/sha256_4_neon.cpp deleted file mode 100644 index 9e3dfa6c65..0000000000 --- a/src/hash/vectorization/sha256_4_neon.cpp +++ /dev/null @@ -1,224 +0,0 @@ -// Based on: -// sha256-arm.c - ARMv8 SHA extensions using C intrinsics -// Written and placed in public domain by Jeffrey Walton -// Based on code from ARM, and by Johannes Schneiders, Skip -// Hovsmith and Barry O'Rourke for the mbedTLS project. - -#include -#include -#include -#include -#include - -namespace libbitcoin { -namespace system { -namespace sha256 { - -#if defined (DISABLED) - -#if !defined(HAVE_ARM) - -void hash_neon(state&, const block1&) NOEXCEPT -{ - BC_ASSERT_MSG(false, "hash_neon undefined"); -} - -void merkle_neon(digest1&, const block1&) NOEXCEPT -{ - BC_ASSERT_MSG(false, "merkle_neon undefined"); -} - -#else - -void hash_neon(state& state, const block1& blocks) NOEXCEPT -{ - BC_PUSH_WARNING(NO_ARRAY_INDEXING) - - constexpr uint32_t k[] - { - 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5, - 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5, - 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3, - 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174, - 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC, - 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA, - 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7, - 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967, - 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13, - 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85, - 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3, - 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070, - 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5, - 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3, - 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208, - 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2, - }; - - uint32x4_t temp0, temp1, temp2; - uint32x4_t state0, state1, abef_save, cdgh_save; - uint32x4_t message0, message1, message2, message3; - - // Load state. - state0 = vld1q_u32(&state[0]); - state1 = vld1q_u32(&state[4]); - - // Each round is four sha rounds. - // One block in four lanes. - for (auto& block: blocks) - { - // Save state. - abef_save = state0; - cdgh_save = state1; - - // Load message. - message0 = vld1q_u32(pointer_cast(&block[0])); - message1 = vld1q_u32(pointer_cast(&block[16])); - message2 = vld1q_u32(pointer_cast(&block[32])); - message3 = vld1q_u32(pointer_cast(&block[48])); - - // Reverse for little endian. - message0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(message0))); - message1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(message1))); - message2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(message2))); - message3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(message3))); - - temp0 = vaddq_u32(message0, vld1q_u32(&k[0x00])); - - // Rounds 0-3. - message0 = vsha256su0q_u32(message0, message1); - temp2 = state0; - temp1 = vaddq_u32(message1, vld1q_u32(&k[0x04])); - state0 = vsha256hq_u32(state0, state1, temp0); - state1 = vsha256h2q_u32(state1, temp2, temp0); - message0 = vsha256su1q_u32(message0, message2, message3); - - // Rounds 4-7. - message1 = vsha256su0q_u32(message1, message2); - temp2 = state0; - temp0 = vaddq_u32(message2, vld1q_u32(&k[0x08])); - state0 = vsha256hq_u32(state0, state1, temp1); - state1 = vsha256h2q_u32(state1, temp2, temp1); - message1 = vsha256su1q_u32(message1, message3, message0); - - // Rounds 8-11. - message2 = vsha256su0q_u32(message2, message3); - temp2 = state0; - temp1 = vaddq_u32(message3, vld1q_u32(&k[0x0c])); - state0 = vsha256hq_u32(state0, state1, temp0); - state1 = vsha256h2q_u32(state1, temp2, temp0); - message2 = vsha256su1q_u32(message2, message0, message1); - - // Rounds 12-15. - message3 = vsha256su0q_u32(message3, message0); - temp2 = state0; - temp0 = vaddq_u32(message0, vld1q_u32(&k[0x10])); - state0 = vsha256hq_u32(state0, state1, temp1); - state1 = vsha256h2q_u32(state1, temp2, temp1); - message3 = vsha256su1q_u32(message3, message1, message2); - - // Rounds 16-19. - message0 = vsha256su0q_u32(message0, message1); - temp2 = state0; - temp1 = vaddq_u32(message1, vld1q_u32(&k[0x14])); - state0 = vsha256hq_u32(state0, state1, temp0); - state1 = vsha256h2q_u32(state1, temp2, temp0); - message0 = vsha256su1q_u32(message0, message2, message3); - - // Rounds 20-23. - message1 = vsha256su0q_u32(message1, message2); - temp2 = state0; - temp0 = vaddq_u32(message2, vld1q_u32(&k[0x18])); - state0 = vsha256hq_u32(state0, state1, temp1); - state1 = vsha256h2q_u32(state1, temp2, temp1); - message1 = vsha256su1q_u32(message1, message3, message0); - - // Rounds 24-27. - message2 = vsha256su0q_u32(message2, message3); - temp2 = state0; - temp1 = vaddq_u32(message3, vld1q_u32(&k[0x1c])); - state0 = vsha256hq_u32(state0, state1, temp0); - state1 = vsha256h2q_u32(state1, temp2, temp0); - message2 = vsha256su1q_u32(message2, message0, message1); - - // Rounds 28-31. - message3 = vsha256su0q_u32(message3, message0); - temp2 = state0; - temp0 = vaddq_u32(message0, vld1q_u32(&k[0x20])); - state0 = vsha256hq_u32(state0, state1, temp1); - state1 = vsha256h2q_u32(state1, temp2, temp1); - message3 = vsha256su1q_u32(message3, message1, message2); - - // Rounds 32-35. - message0 = vsha256su0q_u32(message0, message1); - temp2 = state0; - temp1 = vaddq_u32(message1, vld1q_u32(&k[0x24])); - state0 = vsha256hq_u32(state0, state1, temp0); - state1 = vsha256h2q_u32(state1, temp2, temp0); - message0 = vsha256su1q_u32(message0, message2, message3); - - // Rounds 36-39. - message1 = vsha256su0q_u32(message1, message2); - temp2 = state0; - temp0 = vaddq_u32(message2, vld1q_u32(&k[0x28])); - state0 = vsha256hq_u32(state0, state1, temp1); - state1 = vsha256h2q_u32(state1, temp2, temp1); - message1 = vsha256su1q_u32(message1, message3, message0); - - // Rounds 40-43. - message2 = vsha256su0q_u32(message2, message3); - temp2 = state0; - temp1 = vaddq_u32(message3, vld1q_u32(&k[0x2c])); - state0 = vsha256hq_u32(state0, state1, temp0); - state1 = vsha256h2q_u32(state1, temp2, temp0); - message2 = vsha256su1q_u32(message2, message0, message1); - - // Rounds 44-47. - message3 = vsha256su0q_u32(message3, message0); - temp2 = state0; - temp0 = vaddq_u32(message0, vld1q_u32(&k[0x30])); - state0 = vsha256hq_u32(state0, state1, temp1); - state1 = vsha256h2q_u32(state1, temp2, temp1); - message3 = vsha256su1q_u32(message3, message1, message2); - - // Rounds 48-51. - temp2 = state0; - temp1 = vaddq_u32(message1, vld1q_u32(&k[0x34])); - state0 = vsha256hq_u32(state0, state1, temp0); - state1 = vsha256h2q_u32(state1, temp2, temp0); - - // Rounds 52-55. - temp2 = state0; - temp0 = vaddq_u32(message2, vld1q_u32(&k[0x38])); - state0 = vsha256hq_u32(state0, state1, temp1); - state1 = vsha256h2q_u32(state1, temp2, temp1); - - // Rounds 56-59. - temp2 = state0; - temp1 = vaddq_u32(message3, vld1q_u32(&k[0x3c])); - state0 = vsha256hq_u32(state0, state1, temp0); - state1 = vsha256h2q_u32(state1, temp2, temp0); - - // Rounds 60-63. - temp2 = state0; - state0 = vsha256hq_u32(state0, state1, temp1); - state1 = vsha256h2q_u32(state1, temp2, temp1); - - // Combine state. - state0 = vaddq_u32(state0, abef_save); - state1 = vaddq_u32(state1, cdgh_save); - } - - // Save state. - vst1q_u32(&state[0], state0); - vst1q_u32(&state[4], state1); - - BC_POP_WARNING() -} - -#endif // HAVE_ARM - -#endif // DISABLED - -} // namespace sha256 -} // namespace system -} // namespace libbitcoin diff --git a/src/hash/vectorization/sha256_4_sse4.cpp b/src/hash/vectorization/sha256_4_sse4.cpp deleted file mode 100644 index 76760e6894..0000000000 --- a/src/hash/vectorization/sha256_4_sse4.cpp +++ /dev/null @@ -1,1033 +0,0 @@ -//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -//; Copyright (c) 2012, Intel Corporation -//; -//; All rights reserved. -//; -//; Redistribution and use in source and binary forms, with or without -//; modification, are permitted provided that the following conditions are -//; met: -//; -//; * Redistributions of source code must retain the above copyright -//; notice, this list of conditions and the following disclaimer. -//; -//; * Redistributions in binary form must reproduce the above copyright -//; notice, this list of conditions and the following disclaimer in the -//; documentation and/or other materials provided with the -//; distribution. -//; -//; * Neither the name of the Intel Corporation nor the names of its -//; contributors may be used to endorse or promote products derived from -//; this software without specific prior written permission. -//; -//; -//; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY -//; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -//; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -//; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR -//; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -//; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -//; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -//; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -//; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -//; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -//; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -//; -//; This code is described in an Intel White-Paper: -//; "Fast SHA-256 Implementations on Intel Architecture Processors" -//; -//; To find it, surf to https://www.intel.com/p/en_US/embedded -//; and search for that title. -//; The paper is expected to be released roughly at the end of April, 2012 -//; -//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -//; This code schedules 1 blocks at a time, with 4 lanes per block -//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; -// -// Port to inline assembly provided by: -// Copyright (c) 2017-2019 The Bitcoin Core developers -// Distributed under the MIT software license, see the accompanying -// file COPYING or http://www.opensource.org/licenses/mit-license.php. - -#include -#include -#include - -namespace libbitcoin { -namespace system { -namespace sha256 { - -#if defined (DISABLED) - -#if defined(HAVE_XASSEMBLY) - -void hash_sse41a(state& state, const block1& blocks) NOEXCEPT -{ - alignas(16) constexpr uint32_t k256[] - { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, - }; - - alignas(16) constexpr uint32_t flip_mask[] - { - 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f - }; - - alignas(16) constexpr uint32_t shuffle_00ba[] - { - 0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff - }; - - alignas(16) constexpr uint32_t shuffle_dc00[] - { - 0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908 - }; - - uint32_t a, b, c, d, f, g, h, y0, y1, y2; - uint64_t table; - uint64_t input_end, input; - alignas(16) uint32_t transfer[4]; - auto state_integers = pointer_cast(&state); - const auto data_bytes = pointer_cast(blocks.data()); - -#ifndef VISUAL - __asm__ __volatile__( - "shl $0x6,%2;" - "je Ldone_hash_%=;" - "add %1,%2;" - "mov %2,%14;" - "mov (%0),%3;" - "mov 0x4(%0),%4;" - "mov 0x8(%0),%5;" - "mov 0xc(%0),%6;" - "mov 0x10(%0),%k2;" - "mov 0x14(%0),%7;" - "mov 0x18(%0),%8;" - "mov 0x1c(%0),%9;" - "movdqa %18,%%xmm12;" - "movdqa %19,%%xmm10;" - "movdqa %20,%%xmm11;" - - "Lloop0_%=:" - "lea %17,%13;" - "movdqu (%1),%%xmm4;" - "pshufb %%xmm12,%%xmm4;" - "movdqu 0x10(%1),%%xmm5;" - "pshufb %%xmm12,%%xmm5;" - "movdqu 0x20(%1),%%xmm6;" - "pshufb %%xmm12,%%xmm6;" - "movdqu 0x30(%1),%%xmm7;" - "pshufb %%xmm12,%%xmm7;" - "mov %1,%15;" - "mov $3,%1;" - - "Lloop1_%=:" - "movdqa 0x0(%13),%%xmm9;" - "paddd %%xmm4,%%xmm9;" - "movdqa %%xmm9,%16;" - "movdqa %%xmm7,%%xmm0;" - "mov %k2,%10;" - "ror $0xe,%10;" - "mov %3,%11;" - "palignr $0x4,%%xmm6,%%xmm0;" - "ror $0x9,%11;" - "xor %k2,%10;" - "mov %7,%12;" - "ror $0x5,%10;" - "movdqa %%xmm5,%%xmm1;" - "xor %3,%11;" - "xor %8,%12;" - "paddd %%xmm4,%%xmm0;" - "xor %k2,%10;" - "and %k2,%12;" - "ror $0xb,%11;" - "palignr $0x4,%%xmm4,%%xmm1;" - "xor %3,%11;" - "ror $0x6,%10;" - "xor %8,%12;" - "movdqa %%xmm1,%%xmm2;" - "ror $0x2,%11;" - "add %10,%12;" - "add %16,%12;" - "movdqa %%xmm1,%%xmm3;" - "mov %3,%10;" - "add %12,%9;" - "mov %3,%12;" - "pslld $0x19,%%xmm1;" - "or %5,%10;" - "add %9,%6;" - "and %5,%12;" - "psrld $0x7,%%xmm2;" - "and %4,%10;" - "add %11,%9;" - "por %%xmm2,%%xmm1;" - "or %12,%10;" - "add %10,%9;" - "movdqa %%xmm3,%%xmm2;" - "mov %6,%10;" - "mov %9,%11;" - "movdqa %%xmm3,%%xmm8;" - "ror $0xe,%10;" - "xor %6,%10;" - "mov %k2,%12;" - "ror $0x9,%11;" - "pslld $0xe,%%xmm3;" - "xor %9,%11;" - "ror $0x5,%10;" - "xor %7,%12;" - "psrld $0x12,%%xmm2;" - "ror $0xb,%11;" - "xor %6,%10;" - "and %6,%12;" - "ror $0x6,%10;" - "pxor %%xmm3,%%xmm1;" - "xor %9,%11;" - "xor %7,%12;" - "psrld $0x3,%%xmm8;" - "add %10,%12;" - "add 4+%16,%12;" - "ror $0x2,%11;" - "pxor %%xmm2,%%xmm1;" - "mov %9,%10;" - "add %12,%8;" - "mov %9,%12;" - "pxor %%xmm8,%%xmm1;" - "or %4,%10;" - "add %8,%5;" - "and %4,%12;" - "pshufd $0xfa,%%xmm7,%%xmm2;" - "and %3,%10;" - "add %11,%8;" - "paddd %%xmm1,%%xmm0;" - "or %12,%10;" - "add %10,%8;" - "movdqa %%xmm2,%%xmm3;" - "mov %5,%10;" - "mov %8,%11;" - "ror $0xe,%10;" - "movdqa %%xmm2,%%xmm8;" - "xor %5,%10;" - "ror $0x9,%11;" - "mov %6,%12;" - "xor %8,%11;" - "ror $0x5,%10;" - "psrlq $0x11,%%xmm2;" - "xor %k2,%12;" - "psrlq $0x13,%%xmm3;" - "xor %5,%10;" - "and %5,%12;" - "psrld $0xa,%%xmm8;" - "ror $0xb,%11;" - "xor %8,%11;" - "xor %k2,%12;" - "ror $0x6,%10;" - "pxor %%xmm3,%%xmm2;" - "add %10,%12;" - "ror $0x2,%11;" - "add 8+%16,%12;" - "pxor %%xmm2,%%xmm8;" - "mov %8,%10;" - "add %12,%7;" - "mov %8,%12;" - "pshufb %%xmm10,%%xmm8;" - "or %3,%10;" - "add %7,%4;" - "and %3,%12;" - "paddd %%xmm8,%%xmm0;" - "and %9,%10;" - "add %11,%7;" - "pshufd $0x50,%%xmm0,%%xmm2;" - "or %12,%10;" - "add %10,%7;" - "movdqa %%xmm2,%%xmm3;" - "mov %4,%10;" - "ror $0xe,%10;" - "mov %7,%11;" - "movdqa %%xmm2,%%xmm4;" - "ror $0x9,%11;" - "xor %4,%10;" - "mov %5,%12;" - "ror $0x5,%10;" - "psrlq $0x11,%%xmm2;" - "xor %7,%11;" - "xor %6,%12;" - "psrlq $0x13,%%xmm3;" - "xor %4,%10;" - "and %4,%12;" - "ror $0xb,%11;" - "psrld $0xa,%%xmm4;" - "xor %7,%11;" - "ror $0x6,%10;" - "xor %6,%12;" - "pxor %%xmm3,%%xmm2;" - "ror $0x2,%11;" - "add %10,%12;" - "add 12+%16,%12;" - "pxor %%xmm2,%%xmm4;" - "mov %7,%10;" - "add %12,%k2;" - "mov %7,%12;" - "pshufb %%xmm11,%%xmm4;" - "or %9,%10;" - "add %k2,%3;" - "and %9,%12;" - "paddd %%xmm0,%%xmm4;" - "and %8,%10;" - "add %11,%k2;" - "or %12,%10;" - "add %10,%k2;" - "movdqa 0x10(%13),%%xmm9;" - "paddd %%xmm5,%%xmm9;" - "movdqa %%xmm9,%16;" - "movdqa %%xmm4,%%xmm0;" - "mov %3,%10;" - "ror $0xe,%10;" - "mov %k2,%11;" - "palignr $0x4,%%xmm7,%%xmm0;" - "ror $0x9,%11;" - "xor %3,%10;" - "mov %4,%12;" - "ror $0x5,%10;" - "movdqa %%xmm6,%%xmm1;" - "xor %k2,%11;" - "xor %5,%12;" - "paddd %%xmm5,%%xmm0;" - "xor %3,%10;" - "and %3,%12;" - "ror $0xb,%11;" - "palignr $0x4,%%xmm5,%%xmm1;" - "xor %k2,%11;" - "ror $0x6,%10;" - "xor %5,%12;" - "movdqa %%xmm1,%%xmm2;" - "ror $0x2,%11;" - "add %10,%12;" - "add %16,%12;" - "movdqa %%xmm1,%%xmm3;" - "mov %k2,%10;" - "add %12,%6;" - "mov %k2,%12;" - "pslld $0x19,%%xmm1;" - "or %8,%10;" - "add %6,%9;" - "and %8,%12;" - "psrld $0x7,%%xmm2;" - "and %7,%10;" - "add %11,%6;" - "por %%xmm2,%%xmm1;" - "or %12,%10;" - "add %10,%6;" - "movdqa %%xmm3,%%xmm2;" - "mov %9,%10;" - "mov %6,%11;" - "movdqa %%xmm3,%%xmm8;" - "ror $0xe,%10;" - "xor %9,%10;" - "mov %3,%12;" - "ror $0x9,%11;" - "pslld $0xe,%%xmm3;" - "xor %6,%11;" - "ror $0x5,%10;" - "xor %4,%12;" - "psrld $0x12,%%xmm2;" - "ror $0xb,%11;" - "xor %9,%10;" - "and %9,%12;" - "ror $0x6,%10;" - "pxor %%xmm3,%%xmm1;" - "xor %6,%11;" - "xor %4,%12;" - "psrld $0x3,%%xmm8;" - "add %10,%12;" - "add 4+%16,%12;" - "ror $0x2,%11;" - "pxor %%xmm2,%%xmm1;" - "mov %6,%10;" - "add %12,%5;" - "mov %6,%12;" - "pxor %%xmm8,%%xmm1;" - "or %7,%10;" - "add %5,%8;" - "and %7,%12;" - "pshufd $0xfa,%%xmm4,%%xmm2;" - "and %k2,%10;" - "add %11,%5;" - "paddd %%xmm1,%%xmm0;" - "or %12,%10;" - "add %10,%5;" - "movdqa %%xmm2,%%xmm3;" - "mov %8,%10;" - "mov %5,%11;" - "ror $0xe,%10;" - "movdqa %%xmm2,%%xmm8;" - "xor %8,%10;" - "ror $0x9,%11;" - "mov %9,%12;" - "xor %5,%11;" - "ror $0x5,%10;" - "psrlq $0x11,%%xmm2;" - "xor %3,%12;" - "psrlq $0x13,%%xmm3;" - "xor %8,%10;" - "and %8,%12;" - "psrld $0xa,%%xmm8;" - "ror $0xb,%11;" - "xor %5,%11;" - "xor %3,%12;" - "ror $0x6,%10;" - "pxor %%xmm3,%%xmm2;" - "add %10,%12;" - "ror $0x2,%11;" - "add 8+%16,%12;" - "pxor %%xmm2,%%xmm8;" - "mov %5,%10;" - "add %12,%4;" - "mov %5,%12;" - "pshufb %%xmm10,%%xmm8;" - "or %k2,%10;" - "add %4,%7;" - "and %k2,%12;" - "paddd %%xmm8,%%xmm0;" - "and %6,%10;" - "add %11,%4;" - "pshufd $0x50,%%xmm0,%%xmm2;" - "or %12,%10;" - "add %10,%4;" - "movdqa %%xmm2,%%xmm3;" - "mov %7,%10;" - "ror $0xe,%10;" - "mov %4,%11;" - "movdqa %%xmm2,%%xmm5;" - "ror $0x9,%11;" - "xor %7,%10;" - "mov %8,%12;" - "ror $0x5,%10;" - "psrlq $0x11,%%xmm2;" - "xor %4,%11;" - "xor %9,%12;" - "psrlq $0x13,%%xmm3;" - "xor %7,%10;" - "and %7,%12;" - "ror $0xb,%11;" - "psrld $0xa,%%xmm5;" - "xor %4,%11;" - "ror $0x6,%10;" - "xor %9,%12;" - "pxor %%xmm3,%%xmm2;" - "ror $0x2,%11;" - "add %10,%12;" - "add 12+%16,%12;" - "pxor %%xmm2,%%xmm5;" - "mov %4,%10;" - "add %12,%3;" - "mov %4,%12;" - "pshufb %%xmm11,%%xmm5;" - "or %6,%10;" - "add %3,%k2;" - "and %6,%12;" - "paddd %%xmm0,%%xmm5;" - "and %5,%10;" - "add %11,%3;" - "or %12,%10;" - "add %10,%3;" - "movdqa 0x20(%13),%%xmm9;" - "paddd %%xmm6,%%xmm9;" - "movdqa %%xmm9,%16;" - "movdqa %%xmm5,%%xmm0;" - "mov %k2,%10;" - "ror $0xe,%10;" - "mov %3,%11;" - "palignr $0x4,%%xmm4,%%xmm0;" - "ror $0x9,%11;" - "xor %k2,%10;" - "mov %7,%12;" - "ror $0x5,%10;" - "movdqa %%xmm7,%%xmm1;" - "xor %3,%11;" - "xor %8,%12;" - "paddd %%xmm6,%%xmm0;" - "xor %k2,%10;" - "and %k2,%12;" - "ror $0xb,%11;" - "palignr $0x4,%%xmm6,%%xmm1;" - "xor %3,%11;" - "ror $0x6,%10;" - "xor %8,%12;" - "movdqa %%xmm1,%%xmm2;" - "ror $0x2,%11;" - "add %10,%12;" - "add %16,%12;" - "movdqa %%xmm1,%%xmm3;" - "mov %3,%10;" - "add %12,%9;" - "mov %3,%12;" - "pslld $0x19,%%xmm1;" - "or %5,%10;" - "add %9,%6;" - "and %5,%12;" - "psrld $0x7,%%xmm2;" - "and %4,%10;" - "add %11,%9;" - "por %%xmm2,%%xmm1;" - "or %12,%10;" - "add %10,%9;" - "movdqa %%xmm3,%%xmm2;" - "mov %6,%10;" - "mov %9,%11;" - "movdqa %%xmm3,%%xmm8;" - "ror $0xe,%10;" - "xor %6,%10;" - "mov %k2,%12;" - "ror $0x9,%11;" - "pslld $0xe,%%xmm3;" - "xor %9,%11;" - "ror $0x5,%10;" - "xor %7,%12;" - "psrld $0x12,%%xmm2;" - "ror $0xb,%11;" - "xor %6,%10;" - "and %6,%12;" - "ror $0x6,%10;" - "pxor %%xmm3,%%xmm1;" - "xor %9,%11;" - "xor %7,%12;" - "psrld $0x3,%%xmm8;" - "add %10,%12;" - "add 4+%16,%12;" - "ror $0x2,%11;" - "pxor %%xmm2,%%xmm1;" - "mov %9,%10;" - "add %12,%8;" - "mov %9,%12;" - "pxor %%xmm8,%%xmm1;" - "or %4,%10;" - "add %8,%5;" - "and %4,%12;" - "pshufd $0xfa,%%xmm5,%%xmm2;" - "and %3,%10;" - "add %11,%8;" - "paddd %%xmm1,%%xmm0;" - "or %12,%10;" - "add %10,%8;" - "movdqa %%xmm2,%%xmm3;" - "mov %5,%10;" - "mov %8,%11;" - "ror $0xe,%10;" - "movdqa %%xmm2,%%xmm8;" - "xor %5,%10;" - "ror $0x9,%11;" - "mov %6,%12;" - "xor %8,%11;" - "ror $0x5,%10;" - "psrlq $0x11,%%xmm2;" - "xor %k2,%12;" - "psrlq $0x13,%%xmm3;" - "xor %5,%10;" - "and %5,%12;" - "psrld $0xa,%%xmm8;" - "ror $0xb,%11;" - "xor %8,%11;" - "xor %k2,%12;" - "ror $0x6,%10;" - "pxor %%xmm3,%%xmm2;" - "add %10,%12;" - "ror $0x2,%11;" - "add 8+%16,%12;" - "pxor %%xmm2,%%xmm8;" - "mov %8,%10;" - "add %12,%7;" - "mov %8,%12;" - "pshufb %%xmm10,%%xmm8;" - "or %3,%10;" - "add %7,%4;" - "and %3,%12;" - "paddd %%xmm8,%%xmm0;" - "and %9,%10;" - "add %11,%7;" - "pshufd $0x50,%%xmm0,%%xmm2;" - "or %12,%10;" - "add %10,%7;" - "movdqa %%xmm2,%%xmm3;" - "mov %4,%10;" - "ror $0xe,%10;" - "mov %7,%11;" - "movdqa %%xmm2,%%xmm6;" - "ror $0x9,%11;" - "xor %4,%10;" - "mov %5,%12;" - "ror $0x5,%10;" - "psrlq $0x11,%%xmm2;" - "xor %7,%11;" - "xor %6,%12;" - "psrlq $0x13,%%xmm3;" - "xor %4,%10;" - "and %4,%12;" - "ror $0xb,%11;" - "psrld $0xa,%%xmm6;" - "xor %7,%11;" - "ror $0x6,%10;" - "xor %6,%12;" - "pxor %%xmm3,%%xmm2;" - "ror $0x2,%11;" - "add %10,%12;" - "add 12+%16,%12;" - "pxor %%xmm2,%%xmm6;" - "mov %7,%10;" - "add %12,%k2;" - "mov %7,%12;" - "pshufb %%xmm11,%%xmm6;" - "or %9,%10;" - "add %k2,%3;" - "and %9,%12;" - "paddd %%xmm0,%%xmm6;" - "and %8,%10;" - "add %11,%k2;" - "or %12,%10;" - "add %10,%k2;" - "movdqa 0x30(%13),%%xmm9;" - "paddd %%xmm7,%%xmm9;" - "movdqa %%xmm9,%16;" - "add $0x40,%13;" - "movdqa %%xmm6,%%xmm0;" - "mov %3,%10;" - "ror $0xe,%10;" - "mov %k2,%11;" - "palignr $0x4,%%xmm5,%%xmm0;" - "ror $0x9,%11;" - "xor %3,%10;" - "mov %4,%12;" - "ror $0x5,%10;" - "movdqa %%xmm4,%%xmm1;" - "xor %k2,%11;" - "xor %5,%12;" - "paddd %%xmm7,%%xmm0;" - "xor %3,%10;" - "and %3,%12;" - "ror $0xb,%11;" - "palignr $0x4,%%xmm7,%%xmm1;" - "xor %k2,%11;" - "ror $0x6,%10;" - "xor %5,%12;" - "movdqa %%xmm1,%%xmm2;" - "ror $0x2,%11;" - "add %10,%12;" - "add %16,%12;" - "movdqa %%xmm1,%%xmm3;" - "mov %k2,%10;" - "add %12,%6;" - "mov %k2,%12;" - "pslld $0x19,%%xmm1;" - "or %8,%10;" - "add %6,%9;" - "and %8,%12;" - "psrld $0x7,%%xmm2;" - "and %7,%10;" - "add %11,%6;" - "por %%xmm2,%%xmm1;" - "or %12,%10;" - "add %10,%6;" - "movdqa %%xmm3,%%xmm2;" - "mov %9,%10;" - "mov %6,%11;" - "movdqa %%xmm3,%%xmm8;" - "ror $0xe,%10;" - "xor %9,%10;" - "mov %3,%12;" - "ror $0x9,%11;" - "pslld $0xe,%%xmm3;" - "xor %6,%11;" - "ror $0x5,%10;" - "xor %4,%12;" - "psrld $0x12,%%xmm2;" - "ror $0xb,%11;" - "xor %9,%10;" - "and %9,%12;" - "ror $0x6,%10;" - "pxor %%xmm3,%%xmm1;" - "xor %6,%11;" - "xor %4,%12;" - "psrld $0x3,%%xmm8;" - "add %10,%12;" - "add 4+%16,%12;" - "ror $0x2,%11;" - "pxor %%xmm2,%%xmm1;" - "mov %6,%10;" - "add %12,%5;" - "mov %6,%12;" - "pxor %%xmm8,%%xmm1;" - "or %7,%10;" - "add %5,%8;" - "and %7,%12;" - "pshufd $0xfa,%%xmm6,%%xmm2;" - "and %k2,%10;" - "add %11,%5;" - "paddd %%xmm1,%%xmm0;" - "or %12,%10;" - "add %10,%5;" - "movdqa %%xmm2,%%xmm3;" - "mov %8,%10;" - "mov %5,%11;" - "ror $0xe,%10;" - "movdqa %%xmm2,%%xmm8;" - "xor %8,%10;" - "ror $0x9,%11;" - "mov %9,%12;" - "xor %5,%11;" - "ror $0x5,%10;" - "psrlq $0x11,%%xmm2;" - "xor %3,%12;" - "psrlq $0x13,%%xmm3;" - "xor %8,%10;" - "and %8,%12;" - "psrld $0xa,%%xmm8;" - "ror $0xb,%11;" - "xor %5,%11;" - "xor %3,%12;" - "ror $0x6,%10;" - "pxor %%xmm3,%%xmm2;" - "add %10,%12;" - "ror $0x2,%11;" - "add 8+%16,%12;" - "pxor %%xmm2,%%xmm8;" - "mov %5,%10;" - "add %12,%4;" - "mov %5,%12;" - "pshufb %%xmm10,%%xmm8;" - "or %k2,%10;" - "add %4,%7;" - "and %k2,%12;" - "paddd %%xmm8,%%xmm0;" - "and %6,%10;" - "add %11,%4;" - "pshufd $0x50,%%xmm0,%%xmm2;" - "or %12,%10;" - "add %10,%4;" - "movdqa %%xmm2,%%xmm3;" - "mov %7,%10;" - "ror $0xe,%10;" - "mov %4,%11;" - "movdqa %%xmm2,%%xmm7;" - "ror $0x9,%11;" - "xor %7,%10;" - "mov %8,%12;" - "ror $0x5,%10;" - "psrlq $0x11,%%xmm2;" - "xor %4,%11;" - "xor %9,%12;" - "psrlq $0x13,%%xmm3;" - "xor %7,%10;" - "and %7,%12;" - "ror $0xb,%11;" - "psrld $0xa,%%xmm7;" - "xor %4,%11;" - "ror $0x6,%10;" - "xor %9,%12;" - "pxor %%xmm3,%%xmm2;" - "ror $0x2,%11;" - "add %10,%12;" - "add 12+%16,%12;" - "pxor %%xmm2,%%xmm7;" - "mov %4,%10;" - "add %12,%3;" - "mov %4,%12;" - "pshufb %%xmm11,%%xmm7;" - "or %6,%10;" - "add %3,%k2;" - "and %6,%12;" - "paddd %%xmm0,%%xmm7;" - "and %5,%10;" - "add %11,%3;" - "or %12,%10;" - "add %10,%3;" - "sub $0x1,%1;" - "jne Lloop1_%=;" - "mov $0x2,%1;" - - "Lloop2_%=:" - "paddd 0x0(%13),%%xmm4;" - "movdqa %%xmm4,%16;" - "mov %k2,%10;" - "ror $0xe,%10;" - "mov %3,%11;" - "xor %k2,%10;" - "ror $0x9,%11;" - "mov %7,%12;" - "xor %3,%11;" - "ror $0x5,%10;" - "xor %8,%12;" - "xor %k2,%10;" - "ror $0xb,%11;" - "and %k2,%12;" - "xor %3,%11;" - "ror $0x6,%10;" - "xor %8,%12;" - "add %10,%12;" - "ror $0x2,%11;" - "add %16,%12;" - "mov %3,%10;" - "add %12,%9;" - "mov %3,%12;" - "or %5,%10;" - "add %9,%6;" - "and %5,%12;" - "and %4,%10;" - "add %11,%9;" - "or %12,%10;" - "add %10,%9;" - "mov %6,%10;" - "ror $0xe,%10;" - "mov %9,%11;" - "xor %6,%10;" - "ror $0x9,%11;" - "mov %k2,%12;" - "xor %9,%11;" - "ror $0x5,%10;" - "xor %7,%12;" - "xor %6,%10;" - "ror $0xb,%11;" - "and %6,%12;" - "xor %9,%11;" - "ror $0x6,%10;" - "xor %7,%12;" - "add %10,%12;" - "ror $0x2,%11;" - "add 4+%16,%12;" - "mov %9,%10;" - "add %12,%8;" - "mov %9,%12;" - "or %4,%10;" - "add %8,%5;" - "and %4,%12;" - "and %3,%10;" - "add %11,%8;" - "or %12,%10;" - "add %10,%8;" - "mov %5,%10;" - "ror $0xe,%10;" - "mov %8,%11;" - "xor %5,%10;" - "ror $0x9,%11;" - "mov %6,%12;" - "xor %8,%11;" - "ror $0x5,%10;" - "xor %k2,%12;" - "xor %5,%10;" - "ror $0xb,%11;" - "and %5,%12;" - "xor %8,%11;" - "ror $0x6,%10;" - "xor %k2,%12;" - "add %10,%12;" - "ror $0x2,%11;" - "add 8+%16,%12;" - "mov %8,%10;" - "add %12,%7;" - "mov %8,%12;" - "or %3,%10;" - "add %7,%4;" - "and %3,%12;" - "and %9,%10;" - "add %11,%7;" - "or %12,%10;" - "add %10,%7;" - "mov %4,%10;" - "ror $0xe,%10;" - "mov %7,%11;" - "xor %4,%10;" - "ror $0x9,%11;" - "mov %5,%12;" - "xor %7,%11;" - "ror $0x5,%10;" - "xor %6,%12;" - "xor %4,%10;" - "ror $0xb,%11;" - "and %4,%12;" - "xor %7,%11;" - "ror $0x6,%10;" - "xor %6,%12;" - "add %10,%12;" - "ror $0x2,%11;" - "add 12+%16,%12;" - "mov %7,%10;" - "add %12,%k2;" - "mov %7,%12;" - "or %9,%10;" - "add %k2,%3;" - "and %9,%12;" - "and %8,%10;" - "add %11,%k2;" - "or %12,%10;" - "add %10,%k2;" - "paddd 0x10(%13),%%xmm5;" - "movdqa %%xmm5,%16;" - "add $0x20,%13;" - "mov %3,%10;" - "ror $0xe,%10;" - "mov %k2,%11;" - "xor %3,%10;" - "ror $0x9,%11;" - "mov %4,%12;" - "xor %k2,%11;" - "ror $0x5,%10;" - "xor %5,%12;" - "xor %3,%10;" - "ror $0xb,%11;" - "and %3,%12;" - "xor %k2,%11;" - "ror $0x6,%10;" - "xor %5,%12;" - "add %10,%12;" - "ror $0x2,%11;" - "add %16,%12;" - "mov %k2,%10;" - "add %12,%6;" - "mov %k2,%12;" - "or %8,%10;" - "add %6,%9;" - "and %8,%12;" - "and %7,%10;" - "add %11,%6;" - "or %12,%10;" - "add %10,%6;" - "mov %9,%10;" - "ror $0xe,%10;" - "mov %6,%11;" - "xor %9,%10;" - "ror $0x9,%11;" - "mov %3,%12;" - "xor %6,%11;" - "ror $0x5,%10;" - "xor %4,%12;" - "xor %9,%10;" - "ror $0xb,%11;" - "and %9,%12;" - "xor %6,%11;" - "ror $0x6,%10;" - "xor %4,%12;" - "add %10,%12;" - "ror $0x2,%11;" - "add 4+%16,%12;" - "mov %6,%10;" - "add %12,%5;" - "mov %6,%12;" - "or %7,%10;" - "add %5,%8;" - "and %7,%12;" - "and %k2,%10;" - "add %11,%5;" - "or %12,%10;" - "add %10,%5;" - "mov %8,%10;" - "ror $0xe,%10;" - "mov %5,%11;" - "xor %8,%10;" - "ror $0x9,%11;" - "mov %9,%12;" - "xor %5,%11;" - "ror $0x5,%10;" - "xor %3,%12;" - "xor %8,%10;" - "ror $0xb,%11;" - "and %8,%12;" - "xor %5,%11;" - "ror $0x6,%10;" - "xor %3,%12;" - "add %10,%12;" - "ror $0x2,%11;" - "add 8+%16,%12;" - "mov %5,%10;" - "add %12,%4;" - "mov %5,%12;" - "or %k2,%10;" - "add %4,%7;" - "and %k2,%12;" - "and %6,%10;" - "add %11,%4;" - "or %12,%10;" - "add %10,%4;" - "mov %7,%10;" - "ror $0xe,%10;" - "mov %4,%11;" - "xor %7,%10;" - "ror $0x9,%11;" - "mov %8,%12;" - "xor %4,%11;" - "ror $0x5,%10;" - "xor %9,%12;" - "xor %7,%10;" - "ror $0xb,%11;" - "and %7,%12;" - "xor %4,%11;" - "ror $0x6,%10;" - "xor %9,%12;" - "add %10,%12;" - "ror $0x2,%11;" - "add 12+%16,%12;" - "mov %4,%10;" - "add %12,%3;" - "mov %4,%12;" - "or %6,%10;" - "add %3,%k2;" - "and %6,%12;" - "and %5,%10;" - "add %11,%3;" - "or %12,%10;" - "add %10,%3;" - "movdqa %%xmm6,%%xmm4;" - "movdqa %%xmm7,%%xmm5;" - "sub $0x1,%1;" - "jne Lloop2_%=;" - - "add (%0),%3;" - "mov %3,(%0);" - "add 0x4(%0),%4;" - "mov %4,0x4(%0);" - "add 0x8(%0),%5;" - "mov %5,0x8(%0);" - "add 0xc(%0),%6;" - "mov %6,0xc(%0);" - "add 0x10(%0),%k2;" - "mov %k2,0x10(%0);" - "add 0x14(%0),%7;" - "mov %7,0x14(%0);" - "add 0x18(%0),%8;" - "mov %8,0x18(%0);" - "add 0x1c(%0),%9;" - "mov %9,0x1c(%0);" - "mov %15,%1;" - "add $0x40,%1;" - "cmp %14,%1;" - "jne Lloop0_%=;" - -#endif // VISUAL - - "Ldone_hash_%=:" - - : "+r"(state_integers), "+r"(data_bytes), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c), "=r"(d), /* e = chunk */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0), "=r"(y1), "=r"(y2), "=r"(table), "+m"(input_end), "+m"(input), "+m"(transfer) - : "m"(k256), "m"(flip_mask), "m"(shuffle_00ba), "m"(shuffle_dc00) - : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12" - ); -} - -#endif // HAVE_XASSEMBLY - -#endif // DISABLED - -} // namespace sha256 -} // namespace system -} // namespace libbitcoin diff --git a/src/hash/vectorization/sha256_4_sse41.cpp b/src/hash/vectorization/sha256_4_sse41.cpp deleted file mode 100644 index 64eff5a706..0000000000 --- a/src/hash/vectorization/sha256_4_sse41.cpp +++ /dev/null @@ -1,461 +0,0 @@ -// Based on: -// sha256-x86.c - Intel SHA extensions using C intrinsics -// Written and place in public domain by Jeffrey Walton -// Based on code from Intel, and by Sean Gulley for the miTLS project. - -#include -#include -#include - -namespace libbitcoin { -namespace system { -namespace sha256 { - -#if defined (DISABLED) - -#if !defined(HAVE_XCPU) - -void merkle_sse41(digest4& out, const block4& blocks) NOEXCEPT -{ - BC_ASSERT_MSG(false, "merkle_sse41 undefined"); -} - -#else - -namespace i128 { - -using xint128_t = __m128i; - -template -uint32_t get(xint128_t a) noexcept -{ - return _mm_extract_epi32(a, Offset); -} - -// Broadcast 32-bit integer a to all elements of dst. -xint128_t set(uint32_t a) noexcept -{ - return _mm_set1_epi32(a); -} - -xint128_t set(uint64_t a, uint64_t b) noexcept -{ - return _mm_set_epi64x(a, b); -} - -xint128_t set(uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept -{ - return _mm_set_epi32(a, b, c, d); -} - -xint128_t sum(xint128_t a, xint128_t b) noexcept -{ - return _mm_add_epi32(a, b); -} - -xint128_t sum(xint128_t a, xint128_t b, xint128_t c) noexcept -{ - - return sum(sum(a, b), c); -} - -xint128_t sum(xint128_t a, xint128_t b, xint128_t c, - xint128_t d) noexcept -{ - return sum(sum(a, b), sum(c, d)); -} - -xint128_t sum(xint128_t a, xint128_t b, xint128_t c, xint128_t d, - xint128_t e) noexcept -{ - return sum(sum(a, b, c), sum(d, e)); -} - -xint128_t inc(xint128_t& outa, xint128_t b) noexcept -{ - return ((outa = sum(outa, b))); -} - -xint128_t inc(xint128_t& outa, xint128_t b, xint128_t c) noexcept -{ - return ((outa = sum(outa, b, c))); -} - -xint128_t inc(xint128_t& outa, xint128_t b, xint128_t c, - xint128_t d) noexcept -{ - return ((outa = sum(outa, b, c, d))); -} - -xint128_t exc(xint128_t a, xint128_t b) noexcept -{ - return _mm_xor_si128(a, b); -} - -xint128_t exc(xint128_t a, xint128_t b, xint128_t c) noexcept -{ - return exc(exc(a, b), c); -} - -xint128_t dis(xint128_t a, xint128_t b) noexcept -{ - return _mm_or_si128(a, b); -} - -xint128_t con(xint128_t a, xint128_t b) noexcept -{ - return _mm_and_si128(a, b); -} - -xint128_t shr(xint128_t a, uint32_t bits) noexcept -{ - return _mm_srli_epi32(a, bits); -} - -xint128_t shl(xint128_t a, uint32_t bits) noexcept -{ - return _mm_slli_epi32(a, bits); -} - -/// Concatenate two 16-byte blocks into a 32-byte temporary result, shift the -/// result right by Shift bytes, and return the low 16 bytes. -template -xint128_t align_right(xint128_t a, xint128_t b) noexcept -{ - return _mm_alignr_epi8(a, b, Shift); -} - -/// Blend two packed 16-bit integers using Mask. -template -xint128_t blend(xint128_t a, xint128_t b) noexcept -{ - return _mm_blend_epi16(a, b, Mask); -} - -/// Shuffle 32-bit integers using Control. -template -xint128_t shuffle(xint128_t a) noexcept -{ - return _mm_shuffle_epi32(a, Control); -} - -/// Shuffle packed 8-bit integers in a according to shuffle control mask in the -/// corresponding 8-bit element of b. -xint128_t shuffle(xint128_t a, xint128_t b) noexcept -{ - return _mm_shuffle_epi8(a, b); -} - -} // namespace i128 - -using namespace i128; - -xint128_t inline SIGMA0(xint128_t x) NOEXCEPT { return exc(dis(shr(x, 2), shl(x, 30)), dis(shr(x, 13), shl(x, 19)), dis(shr(x, 22), shl(x, 10))); } -xint128_t inline SIGMA1(xint128_t x) NOEXCEPT { return exc(dis(shr(x, 6), shl(x, 26)), dis(shr(x, 11), shl(x, 21)), dis(shr(x, 25), shl(x, 7))); } -xint128_t inline sigma0(xint128_t x) NOEXCEPT { return exc(dis(shr(x, 7), shl(x, 25)), dis(shr(x, 18), shl(x, 14)), shr(x, 3)); } -xint128_t inline sigma1(xint128_t x) NOEXCEPT { return exc(dis(shr(x, 17), shl(x, 15)), dis(shr(x, 19), shl(x, 13)), shr(x, 10)); } -xint128_t inline choice( xint128_t x, xint128_t y, xint128_t z) NOEXCEPT { return exc(z, con(x, exc(y, z))); } -xint128_t inline majority(xint128_t x, xint128_t y, xint128_t z) NOEXCEPT { return dis(con(x, y), con(z, dis(x, y))); } - -void inline round(xint128_t a, xint128_t b, xint128_t c, xint128_t& d, - xint128_t e, xint128_t f, xint128_t g, xint128_t& h, xint128_t k) NOEXCEPT -{ - const auto t1 = sum(h, SIGMA1(e), choice(e, f, g), k); - const auto t2 = sum( SIGMA0(a), majority(a, b, c)); - d = sum(d, t1); - h = sum(t1, t2); -} - -template -xint128_t inline read4(const block4& blocks) NOEXCEPT -{ - constexpr auto four = sizeof(uint32_t); - BC_PUSH_WARNING(NO_ARRAY_INDEXING) - const auto value = set( - from_little_endian(array_cast(blocks[0])), - from_little_endian(array_cast(blocks[1])), - from_little_endian(array_cast(blocks[2])), - from_little_endian(array_cast(blocks[3]))); - BC_POP_WARNING() - - // bswap_mask - return shuffle(value, set( - 0x0c0d0e0ful, 0x08090a0bul, 0x04050607ul, 0x00010203ul)); -} - -template -void inline write4(digest4& hashes, xint128_t value) NOEXCEPT -{ - // bswap_mask - value = shuffle(value, set( - 0x0c0d0e0ful, 0x08090a0bul, 0x04050607ul, 0x00010203ul)); - - constexpr auto four = sizeof(uint32_t); - BC_PUSH_WARNING(NO_ARRAY_INDEXING) - array_cast(hashes[0]) = to_little_endian(get<3>(value)); - array_cast(hashes[1]) = to_little_endian(get<2>(value)); - array_cast(hashes[2]) = to_little_endian(get<1>(value)); - array_cast(hashes[3]) = to_little_endian(get<0>(value)); - BC_POP_WARNING() -} - -// Four blocks in four lanes, doubled. -void merkle_sse41(digest4& out, const block4& blocks) NOEXCEPT -{ - // Transform 1. - auto a = set(0x6a09e667ul); - auto b = set(0xbb67ae85ul); - auto c = set(0x3c6ef372ul); - auto d = set(0xa54ff53aul); - auto e = set(0x510e527ful); - auto f = set(0x9b05688cul); - auto g = set(0x1f83d9abul); - auto h = set(0x5be0cd19ul); - - xint128_t w00, w01, w02, w03, w04, w05, w06, w07; - xint128_t w08, w09, w10, w11, w12, w13, w14, w15; - - round(a, b, c, d, e, f, g, h, sum(set(0x428a2f98ul), w00 = read4< 0>(blocks))); - round(h, a, b, c, d, e, f, g, sum(set(0x71374491ul), w01 = read4< 4>(blocks))); - round(g, h, a, b, c, d, e, f, sum(set(0xb5c0fbcful), w02 = read4< 8>(blocks))); - round(f, g, h, a, b, c, d, e, sum(set(0xe9b5dba5ul), w03 = read4<12>(blocks))); - round(e, f, g, h, a, b, c, d, sum(set(0x3956c25bul), w04 = read4<16>(blocks))); - round(d, e, f, g, h, a, b, c, sum(set(0x59f111f1ul), w05 = read4<20>(blocks))); - round(c, d, e, f, g, h, a, b, sum(set(0x923f82a4ul), w06 = read4<24>(blocks))); - round(b, c, d, e, f, g, h, a, sum(set(0xab1c5ed5ul), w07 = read4<28>(blocks))); - round(a, b, c, d, e, f, g, h, sum(set(0xd807aa98ul), w08 = read4<32>(blocks))); - round(h, a, b, c, d, e, f, g, sum(set(0x12835b01ul), w09 = read4<36>(blocks))); - round(g, h, a, b, c, d, e, f, sum(set(0x243185beul), w10 = read4<40>(blocks))); - round(f, g, h, a, b, c, d, e, sum(set(0x550c7dc3ul), w11 = read4<44>(blocks))); - round(e, f, g, h, a, b, c, d, sum(set(0x72be5d74ul), w12 = read4<48>(blocks))); - round(d, e, f, g, h, a, b, c, sum(set(0x80deb1feul), w13 = read4<52>(blocks))); - round(c, d, e, f, g, h, a, b, sum(set(0x9bdc06a7ul), w14 = read4<56>(blocks))); - round(b, c, d, e, f, g, h, a, sum(set(0xc19bf174ul), w15 = read4<60>(blocks))); - round(a, b, c, d, e, f, g, h, sum(set(0xe49b69c1ul), inc(w00, sigma1(w14), w09, sigma0(w01)))); - round(h, a, b, c, d, e, f, g, sum(set(0xefbe4786ul), inc(w01, sigma1(w15), w10, sigma0(w02)))); - round(g, h, a, b, c, d, e, f, sum(set(0x0fc19dc6ul), inc(w02, sigma1(w00), w11, sigma0(w03)))); - round(f, g, h, a, b, c, d, e, sum(set(0x240ca1ccul), inc(w03, sigma1(w01), w12, sigma0(w04)))); - round(e, f, g, h, a, b, c, d, sum(set(0x2de92c6ful), inc(w04, sigma1(w02), w13, sigma0(w05)))); - round(d, e, f, g, h, a, b, c, sum(set(0x4a7484aaul), inc(w05, sigma1(w03), w14, sigma0(w06)))); - round(c, d, e, f, g, h, a, b, sum(set(0x5cb0a9dcul), inc(w06, sigma1(w04), w15, sigma0(w07)))); - round(b, c, d, e, f, g, h, a, sum(set(0x76f988daul), inc(w07, sigma1(w05), w00, sigma0(w08)))); - round(a, b, c, d, e, f, g, h, sum(set(0x983e5152ul), inc(w08, sigma1(w06), w01, sigma0(w09)))); - round(h, a, b, c, d, e, f, g, sum(set(0xa831c66dul), inc(w09, sigma1(w07), w02, sigma0(w10)))); - round(g, h, a, b, c, d, e, f, sum(set(0xb00327c8ul), inc(w10, sigma1(w08), w03, sigma0(w11)))); - round(f, g, h, a, b, c, d, e, sum(set(0xbf597fc7ul), inc(w11, sigma1(w09), w04, sigma0(w12)))); - round(e, f, g, h, a, b, c, d, sum(set(0xc6e00bf3ul), inc(w12, sigma1(w10), w05, sigma0(w13)))); - round(d, e, f, g, h, a, b, c, sum(set(0xd5a79147ul), inc(w13, sigma1(w11), w06, sigma0(w14)))); - round(c, d, e, f, g, h, a, b, sum(set(0x06ca6351ul), inc(w14, sigma1(w12), w07, sigma0(w15)))); - round(b, c, d, e, f, g, h, a, sum(set(0x14292967ul), inc(w15, sigma1(w13), w08, sigma0(w00)))); - round(a, b, c, d, e, f, g, h, sum(set(0x27b70a85ul), inc(w00, sigma1(w14), w09, sigma0(w01)))); - round(h, a, b, c, d, e, f, g, sum(set(0x2e1b2138ul), inc(w01, sigma1(w15), w10, sigma0(w02)))); - round(g, h, a, b, c, d, e, f, sum(set(0x4d2c6dfcul), inc(w02, sigma1(w00), w11, sigma0(w03)))); - round(f, g, h, a, b, c, d, e, sum(set(0x53380d13ul), inc(w03, sigma1(w01), w12, sigma0(w04)))); - round(e, f, g, h, a, b, c, d, sum(set(0x650a7354ul), inc(w04, sigma1(w02), w13, sigma0(w05)))); - round(d, e, f, g, h, a, b, c, sum(set(0x766a0abbul), inc(w05, sigma1(w03), w14, sigma0(w06)))); - round(c, d, e, f, g, h, a, b, sum(set(0x81c2c92eul), inc(w06, sigma1(w04), w15, sigma0(w07)))); - round(b, c, d, e, f, g, h, a, sum(set(0x92722c85ul), inc(w07, sigma1(w05), w00, sigma0(w08)))); - round(a, b, c, d, e, f, g, h, sum(set(0xa2bfe8a1ul), inc(w08, sigma1(w06), w01, sigma0(w09)))); - round(h, a, b, c, d, e, f, g, sum(set(0xa81a664bul), inc(w09, sigma1(w07), w02, sigma0(w10)))); - round(g, h, a, b, c, d, e, f, sum(set(0xc24b8b70ul), inc(w10, sigma1(w08), w03, sigma0(w11)))); - round(f, g, h, a, b, c, d, e, sum(set(0xc76c51a3ul), inc(w11, sigma1(w09), w04, sigma0(w12)))); - round(e, f, g, h, a, b, c, d, sum(set(0xd192e819ul), inc(w12, sigma1(w10), w05, sigma0(w13)))); - round(d, e, f, g, h, a, b, c, sum(set(0xd6990624ul), inc(w13, sigma1(w11), w06, sigma0(w14)))); - round(c, d, e, f, g, h, a, b, sum(set(0xf40e3585ul), inc(w14, sigma1(w12), w07, sigma0(w15)))); - round(b, c, d, e, f, g, h, a, sum(set(0x106aa070ul), inc(w15, sigma1(w13), w08, sigma0(w00)))); - round(a, b, c, d, e, f, g, h, sum(set(0x19a4c116ul), inc(w00, sigma1(w14), w09, sigma0(w01)))); - round(h, a, b, c, d, e, f, g, sum(set(0x1e376c08ul), inc(w01, sigma1(w15), w10, sigma0(w02)))); - round(g, h, a, b, c, d, e, f, sum(set(0x2748774cul), inc(w02, sigma1(w00), w11, sigma0(w03)))); - round(f, g, h, a, b, c, d, e, sum(set(0x34b0bcb5ul), inc(w03, sigma1(w01), w12, sigma0(w04)))); - round(e, f, g, h, a, b, c, d, sum(set(0x391c0cb3ul), inc(w04, sigma1(w02), w13, sigma0(w05)))); - round(d, e, f, g, h, a, b, c, sum(set(0x4ed8aa4aul), inc(w05, sigma1(w03), w14, sigma0(w06)))); - round(c, d, e, f, g, h, a, b, sum(set(0x5b9cca4ful), inc(w06, sigma1(w04), w15, sigma0(w07)))); - round(b, c, d, e, f, g, h, a, sum(set(0x682e6ff3ul), inc(w07, sigma1(w05), w00, sigma0(w08)))); - round(a, b, c, d, e, f, g, h, sum(set(0x748f82eeul), inc(w08, sigma1(w06), w01, sigma0(w09)))); - round(h, a, b, c, d, e, f, g, sum(set(0x78a5636ful), inc(w09, sigma1(w07), w02, sigma0(w10)))); - round(g, h, a, b, c, d, e, f, sum(set(0x84c87814ul), inc(w10, sigma1(w08), w03, sigma0(w11)))); - round(f, g, h, a, b, c, d, e, sum(set(0x8cc70208ul), inc(w11, sigma1(w09), w04, sigma0(w12)))); - round(e, f, g, h, a, b, c, d, sum(set(0x90befffaul), inc(w12, sigma1(w10), w05, sigma0(w13)))); - round(d, e, f, g, h, a, b, c, sum(set(0xa4506cebul), inc(w13, sigma1(w11), w06, sigma0(w14)))); - round(c, d, e, f, g, h, a, b, sum(set(0xbef9a3f7ul), inc(w14, sigma1(w12), w07, sigma0(w15)))); - round(b, c, d, e, f, g, h, a, sum(set(0xc67178f2ul), inc(w15, sigma1(w13), w08, sigma0(w00)))); - - a = sum(a, set(0x6a09e667ul)); - b = sum(b, set(0xbb67ae85ul)); - c = sum(c, set(0x3c6ef372ul)); - d = sum(d, set(0xa54ff53aul)); - e = sum(e, set(0x510e527ful)); - f = sum(f, set(0x9b05688cul)); - g = sum(g, set(0x1f83d9abul)); - h = sum(h, set(0x5be0cd19ul)); - - const xint128_t t0 = a, t1 = b, t2 = c, t3 = d, t4 = e, t5 = f, t6 = g, t7 = h; - - // Transform 2. - round(a, b, c, d, e, f, g, h, set(0xc28a2f98ul)); - round(h, a, b, c, d, e, f, g, set(0x71374491ul)); - round(g, h, a, b, c, d, e, f, set(0xb5c0fbcful)); - round(f, g, h, a, b, c, d, e, set(0xe9b5dba5ul)); - round(e, f, g, h, a, b, c, d, set(0x3956c25bul)); - round(d, e, f, g, h, a, b, c, set(0x59f111f1ul)); - round(c, d, e, f, g, h, a, b, set(0x923f82a4ul)); - round(b, c, d, e, f, g, h, a, set(0xab1c5ed5ul)); - round(a, b, c, d, e, f, g, h, set(0xd807aa98ul)); - round(h, a, b, c, d, e, f, g, set(0x12835b01ul)); - round(g, h, a, b, c, d, e, f, set(0x243185beul)); - round(f, g, h, a, b, c, d, e, set(0x550c7dc3ul)); - round(e, f, g, h, a, b, c, d, set(0x72be5d74ul)); - round(d, e, f, g, h, a, b, c, set(0x80deb1feul)); - round(c, d, e, f, g, h, a, b, set(0x9bdc06a7ul)); - round(b, c, d, e, f, g, h, a, set(0xc19bf374ul)); - round(a, b, c, d, e, f, g, h, set(0x649b69c1ul)); - round(h, a, b, c, d, e, f, g, set(0xf0fe4786ul)); - round(g, h, a, b, c, d, e, f, set(0x0fe1edc6ul)); - round(f, g, h, a, b, c, d, e, set(0x240cf254ul)); - round(e, f, g, h, a, b, c, d, set(0x4fe9346ful)); - round(d, e, f, g, h, a, b, c, set(0x6cc984beul)); - round(c, d, e, f, g, h, a, b, set(0x61b9411eul)); - round(b, c, d, e, f, g, h, a, set(0x16f988faul)); - round(a, b, c, d, e, f, g, h, set(0xf2c65152ul)); - round(h, a, b, c, d, e, f, g, set(0xa88e5a6dul)); - round(g, h, a, b, c, d, e, f, set(0xb019fc65ul)); - round(f, g, h, a, b, c, d, e, set(0xb9d99ec7ul)); - round(e, f, g, h, a, b, c, d, set(0x9a1231c3ul)); - round(d, e, f, g, h, a, b, c, set(0xe70eeaa0ul)); - round(c, d, e, f, g, h, a, b, set(0xfdb1232bul)); - round(b, c, d, e, f, g, h, a, set(0xc7353eb0ul)); - round(a, b, c, d, e, f, g, h, set(0x3069bad5ul)); - round(h, a, b, c, d, e, f, g, set(0xcb976d5ful)); - round(g, h, a, b, c, d, e, f, set(0x5a0f118ful)); - round(f, g, h, a, b, c, d, e, set(0xdc1eeefdul)); - round(e, f, g, h, a, b, c, d, set(0x0a35b689ul)); - round(d, e, f, g, h, a, b, c, set(0xde0b7a04ul)); - round(c, d, e, f, g, h, a, b, set(0x58f4ca9dul)); - round(b, c, d, e, f, g, h, a, set(0xe15d5b16ul)); - round(a, b, c, d, e, f, g, h, set(0x007f3e86ul)); - round(h, a, b, c, d, e, f, g, set(0x37088980ul)); - round(g, h, a, b, c, d, e, f, set(0xa507ea32ul)); - round(f, g, h, a, b, c, d, e, set(0x6fab9537ul)); - round(e, f, g, h, a, b, c, d, set(0x17406110ul)); - round(d, e, f, g, h, a, b, c, set(0x0d8cd6f1ul)); - round(c, d, e, f, g, h, a, b, set(0xcdaa3b6dul)); - round(b, c, d, e, f, g, h, a, set(0xc0bbbe37ul)); - round(a, b, c, d, e, f, g, h, set(0x83613bdaul)); - round(h, a, b, c, d, e, f, g, set(0xdb48a363ul)); - round(g, h, a, b, c, d, e, f, set(0x0b02e931ul)); - round(f, g, h, a, b, c, d, e, set(0x6fd15ca7ul)); - round(e, f, g, h, a, b, c, d, set(0x521afacaul)); - round(d, e, f, g, h, a, b, c, set(0x31338431ul)); - round(c, d, e, f, g, h, a, b, set(0x6ed41a95ul)); - round(b, c, d, e, f, g, h, a, set(0x6d437890ul)); - round(a, b, c, d, e, f, g, h, set(0xc39c91f2ul)); - round(h, a, b, c, d, e, f, g, set(0x9eccabbdul)); - round(g, h, a, b, c, d, e, f, set(0xb5c9a0e6ul)); - round(f, g, h, a, b, c, d, e, set(0x532fb63cul)); - round(e, f, g, h, a, b, c, d, set(0xd2c741c6ul)); - round(d, e, f, g, h, a, b, c, set(0x07237ea3ul)); - round(c, d, e, f, g, h, a, b, set(0xa4954b68ul)); - round(b, c, d, e, f, g, h, a, set(0x4c191d76ul)); - - w00 = sum(t0, a); - w01 = sum(t1, b); - w02 = sum(t2, c); - w03 = sum(t3, d); - w04 = sum(t4, e); - w05 = sum(t5, f); - w06 = sum(t6, g); - w07 = sum(t7, h); - - // Transform 3. - a = set(0x6a09e667ul); - b = set(0xbb67ae85ul); - c = set(0x3c6ef372ul); - d = set(0xa54ff53aul); - e = set(0x510e527ful); - f = set(0x9b05688cul); - g = set(0x1f83d9abul); - h = set(0x5be0cd19ul); - - round(a, b, c, d, e, f, g, h, sum(set(0x428a2f98ul), w00)); - round(h, a, b, c, d, e, f, g, sum(set(0x71374491ul), w01)); - round(g, h, a, b, c, d, e, f, sum(set(0xb5c0fbcful), w02)); - round(f, g, h, a, b, c, d, e, sum(set(0xe9b5dba5ul), w03)); - round(e, f, g, h, a, b, c, d, sum(set(0x3956c25bul), w04)); - round(d, e, f, g, h, a, b, c, sum(set(0x59f111f1ul), w05)); - round(c, d, e, f, g, h, a, b, sum(set(0x923f82a4ul), w06)); - round(b, c, d, e, f, g, h, a, sum(set(0xab1c5ed5ul), w07)); - round(a, b, c, d, e, f, g, h, set(0x5807aa98ul)); - round(h, a, b, c, d, e, f, g, set(0x12835b01ul)); - round(g, h, a, b, c, d, e, f, set(0x243185beul)); - round(f, g, h, a, b, c, d, e, set(0x550c7dc3ul)); - round(e, f, g, h, a, b, c, d, set(0x72be5d74ul)); - round(d, e, f, g, h, a, b, c, set(0x80deb1feul)); - round(c, d, e, f, g, h, a, b, set(0x9bdc06a7ul)); - round(b, c, d, e, f, g, h, a, set(0xc19bf274ul)); - round(a, b, c, d, e, f, g, h, sum(set(0xe49b69c1ul), inc(w00, sigma0(w01)))); - round(h, a, b, c, d, e, f, g, sum(set(0xefbe4786ul), inc(w01, set(0xa00000ul), sigma0(w02)))); - round(g, h, a, b, c, d, e, f, sum(set(0x0fc19dc6ul), inc(w02, sigma1(w00), sigma0(w03)))); - round(f, g, h, a, b, c, d, e, sum(set(0x240ca1ccul), inc(w03, sigma1(w01), sigma0(w04)))); - round(e, f, g, h, a, b, c, d, sum(set(0x2de92c6ful), inc(w04, sigma1(w02), sigma0(w05)))); - round(d, e, f, g, h, a, b, c, sum(set(0x4a7484aaul), inc(w05, sigma1(w03), sigma0(w06)))); - round(c, d, e, f, g, h, a, b, sum(set(0x5cb0a9dcul), inc(w06, sigma1(w04), set(0x100ul), sigma0(w07)))); - round(b, c, d, e, f, g, h, a, sum(set(0x76f988daul), inc(w07, sigma1(w05), w00, set(0x11002000ul)))); - round(a, b, c, d, e, f, g, h, sum(set(0x983e5152ul), w08 = sum(set(0x80000000ul), sigma1(w06), w01))); - round(h, a, b, c, d, e, f, g, sum(set(0xa831c66dul), w09 = sum(sigma1(w07), w02))); - round(g, h, a, b, c, d, e, f, sum(set(0xb00327c8ul), w10 = sum(sigma1(w08), w03))); - round(f, g, h, a, b, c, d, e, sum(set(0xbf597fc7ul), w11 = sum(sigma1(w09), w04))); - round(e, f, g, h, a, b, c, d, sum(set(0xc6e00bf3ul), w12 = sum(sigma1(w10), w05))); - round(d, e, f, g, h, a, b, c, sum(set(0xd5a79147ul), w13 = sum(sigma1(w11), w06))); - round(c, d, e, f, g, h, a, b, sum(set(0x06ca6351ul), w14 = sum(sigma1(w12), w07, set(0x400022ul)))); - round(b, c, d, e, f, g, h, a, sum(set(0x14292967ul), w15 = sum(set(0x100ul), sigma1(w13), w08, sigma0(w00)))); - round(a, b, c, d, e, f, g, h, sum(set(0x27b70a85ul), inc(w00, sigma1(w14), w09, sigma0(w01)))); - round(h, a, b, c, d, e, f, g, sum(set(0x2e1b2138ul), inc(w01, sigma1(w15), w10, sigma0(w02)))); - round(g, h, a, b, c, d, e, f, sum(set(0x4d2c6dfcul), inc(w02, sigma1(w00), w11, sigma0(w03)))); - round(f, g, h, a, b, c, d, e, sum(set(0x53380d13ul), inc(w03, sigma1(w01), w12, sigma0(w04)))); - round(e, f, g, h, a, b, c, d, sum(set(0x650a7354ul), inc(w04, sigma1(w02), w13, sigma0(w05)))); - round(d, e, f, g, h, a, b, c, sum(set(0x766a0abbul), inc(w05, sigma1(w03), w14, sigma0(w06)))); - round(c, d, e, f, g, h, a, b, sum(set(0x81c2c92eul), inc(w06, sigma1(w04), w15, sigma0(w07)))); - round(b, c, d, e, f, g, h, a, sum(set(0x92722c85ul), inc(w07, sigma1(w05), w00, sigma0(w08)))); - round(a, b, c, d, e, f, g, h, sum(set(0xa2bfe8a1ul), inc(w08, sigma1(w06), w01, sigma0(w09)))); - round(h, a, b, c, d, e, f, g, sum(set(0xa81a664bul), inc(w09, sigma1(w07), w02, sigma0(w10)))); - round(g, h, a, b, c, d, e, f, sum(set(0xc24b8b70ul), inc(w10, sigma1(w08), w03, sigma0(w11)))); - round(f, g, h, a, b, c, d, e, sum(set(0xc76c51a3ul), inc(w11, sigma1(w09), w04, sigma0(w12)))); - round(e, f, g, h, a, b, c, d, sum(set(0xd192e819ul), inc(w12, sigma1(w10), w05, sigma0(w13)))); - round(d, e, f, g, h, a, b, c, sum(set(0xd6990624ul), inc(w13, sigma1(w11), w06, sigma0(w14)))); - round(c, d, e, f, g, h, a, b, sum(set(0xf40e3585ul), inc(w14, sigma1(w12), w07, sigma0(w15)))); - round(b, c, d, e, f, g, h, a, sum(set(0x106aa070ul), inc(w15, sigma1(w13), w08, sigma0(w00)))); - round(a, b, c, d, e, f, g, h, sum(set(0x19a4c116ul), inc(w00, sigma1(w14), w09, sigma0(w01)))); - round(h, a, b, c, d, e, f, g, sum(set(0x1e376c08ul), inc(w01, sigma1(w15), w10, sigma0(w02)))); - round(g, h, a, b, c, d, e, f, sum(set(0x2748774cul), inc(w02, sigma1(w00), w11, sigma0(w03)))); - round(f, g, h, a, b, c, d, e, sum(set(0x34b0bcb5ul), inc(w03, sigma1(w01), w12, sigma0(w04)))); - round(e, f, g, h, a, b, c, d, sum(set(0x391c0cb3ul), inc(w04, sigma1(w02), w13, sigma0(w05)))); - round(d, e, f, g, h, a, b, c, sum(set(0x4ed8aa4aul), inc(w05, sigma1(w03), w14, sigma0(w06)))); - round(c, d, e, f, g, h, a, b, sum(set(0x5b9cca4ful), inc(w06, sigma1(w04), w15, sigma0(w07)))); - round(b, c, d, e, f, g, h, a, sum(set(0x682e6ff3ul), inc(w07, sigma1(w05), w00, sigma0(w08)))); - round(a, b, c, d, e, f, g, h, sum(set(0x748f82eeul), inc(w08, sigma1(w06), w01, sigma0(w09)))); - round(h, a, b, c, d, e, f, g, sum(set(0x78a5636ful), inc(w09, sigma1(w07), w02, sigma0(w10)))); - round(g, h, a, b, c, d, e, f, sum(set(0x84c87814ul), inc(w10, sigma1(w08), w03, sigma0(w11)))); - round(f, g, h, a, b, c, d, e, sum(set(0x8cc70208ul), inc(w11, sigma1(w09), w04, sigma0(w12)))); - round(e, f, g, h, a, b, c, d, sum(set(0x90befffaul), inc(w12, sigma1(w10), w05, sigma0(w13)))); - round(d, e, f, g, h, a, b, c, sum(set(0xa4506cebul), inc(w13, sigma1(w11), w06, sigma0(w14)))); - round(c, d, e, f, g, h, a, b, sum(set(0xbef9a3f7ul), w14, sigma1(w12), w07, sigma0(w15))); - round(b, c, d, e, f, g, h, a, sum(set(0xc67178f2ul), w15, sigma1(w13), w08, sigma0(w00))); - - // Output. - write4< 0>(out, sum(a, set(0x6a09e667ul))); - write4< 4>(out, sum(b, set(0xbb67ae85ul))); - write4< 8>(out, sum(c, set(0x3c6ef372ul))); - write4<12>(out, sum(d, set(0xa54ff53aul))); - write4<16>(out, sum(e, set(0x510e527ful))); - write4<20>(out, sum(f, set(0x9b05688cul))); - write4<24>(out, sum(g, set(0x1f83d9abul))); - write4<28>(out, sum(h, set(0x5be0cd19ul))); -} - -#endif // HAVE_XCPU - -#endif // DISABLED - -} // namespace sha256 -} // namespace system -} // namespace libbitcoin diff --git a/src/hash/vectorization/sha256_8_avx2.cpp b/src/hash/vectorization/sha256_8_avx2.cpp deleted file mode 100644 index 6c701981b9..0000000000 --- a/src/hash/vectorization/sha256_8_avx2.cpp +++ /dev/null @@ -1,439 +0,0 @@ -// Based on: -// sha256-x86.c - Intel SHA extensions using C intrinsics -// Written and place in public domain by Jeffrey Walton -// Based on code from Intel, and by Sean Gulley for the miTLS project. - -#include -#include -#include - -namespace libbitcoin { -namespace system { -namespace sha256 { - -#if defined (DISABLED) - -#if !defined(HAVE_XCPU) - -void merkle_avx2(digest8& out, const block8& blocks) NOEXCEPT -{ - BC_ASSERT_MSG(false, "merkle_avx2 undefined"); -} - -#else - -namespace i256 { - -using xint256_t = __m256i; - -template -uint32_t get(xint256_t a) noexcept -{ - return _mm256_extract_epi32(a, Offset); -} - -xint256_t set(uint32_t a) noexcept -{ - return _mm256_set1_epi32(a); -} - -xint256_t set(uint32_t a, uint32_t b, uint32_t c, uint32_t d, - uint32_t e, uint32_t f, uint32_t g, uint32_t h) noexcept -{ - return _mm256_set_epi32(a, b, c, d, e, f, g, h); -} - -xint256_t shuffle(xint256_t a, xint256_t b) noexcept -{ - return _mm256_shuffle_epi8(a, b); -} - -xint256_t sum(xint256_t a, xint256_t b) noexcept -{ - return _mm256_add_epi32(a, b); -} - -xint256_t sum(xint256_t a, xint256_t b, xint256_t c) noexcept -{ - return sum(sum(a, b), c); -} - -xint256_t sum(xint256_t a, xint256_t b, xint256_t c, - xint256_t d) noexcept -{ - return sum(sum(a, b), sum(c, d)); -} - -xint256_t sum(xint256_t a, xint256_t b, xint256_t c, xint256_t d, - xint256_t e) noexcept -{ - return sum(sum(a, b, c), sum(d, e)); -} - -xint256_t inc(xint256_t& outa, xint256_t b) noexcept -{ - return ((outa = sum(outa, b))); -} - -xint256_t inc(xint256_t& outa, xint256_t b, xint256_t c) noexcept -{ - return ((outa = sum(outa, b, c))); -} - -xint256_t inc(xint256_t& outa, xint256_t b, xint256_t c, - xint256_t d) noexcept -{ - return ((outa = sum(outa, b, c, d))); -} - -xint256_t exc(xint256_t a, xint256_t b) noexcept -{ - return _mm256_xor_si256(a, b); -} - -xint256_t exc(xint256_t a, xint256_t b, xint256_t c) noexcept -{ - return exc(exc(a, b), c); -} - -xint256_t dis(xint256_t a, xint256_t b) noexcept -{ - return _mm256_or_si256(a, b); -} - -xint256_t con(xint256_t a, xint256_t b) noexcept -{ - return _mm256_and_si256(a, b); -} - -xint256_t shr(xint256_t a, uint32_t bits) noexcept -{ - return _mm256_srli_epi32(a, bits); -} - -xint256_t shl(xint256_t a, uint32_t bits) noexcept -{ - return _mm256_slli_epi32(a, bits); -} - -} // namespace i256 - -using namespace i256; - -xint256_t inline SIGMA0(xint256_t x) NOEXCEPT { return exc(dis(shr(x, 2), shl(x, 30)), dis(shr(x, 13), shl(x, 19)), dis(shr(x, 22), shl(x, 10))); } -xint256_t inline SIGMA1(xint256_t x) NOEXCEPT { return exc(dis(shr(x, 6), shl(x, 26)), dis(shr(x, 11), shl(x, 21)), dis(shr(x, 25), shl(x, 7))); } -xint256_t inline sigma0(xint256_t x) NOEXCEPT { return exc(dis(shr(x, 7), shl(x, 25)), dis(shr(x, 18), shl(x, 14)), shr(x, 3)); } -xint256_t inline sigma1(xint256_t x) NOEXCEPT { return exc(dis(shr(x, 17), shl(x, 15)), dis(shr(x, 19), shl(x, 13)), shr(x, 10)); } -xint256_t inline choice( xint256_t x, xint256_t y, xint256_t z) NOEXCEPT { return exc(z, con(x, exc(y, z))); } -xint256_t inline majority(xint256_t x, xint256_t y, xint256_t z) NOEXCEPT { return dis(con(x, y), con(z, dis(x, y))); } - -void inline round(xint256_t a, xint256_t b, xint256_t c, xint256_t& d, - xint256_t e, xint256_t f, xint256_t g, xint256_t& h, xint256_t k) NOEXCEPT -{ - const auto t1 = sum(h, SIGMA1(e), choice(e, f, g), k); - const auto t2 = sum( SIGMA0(a), majority(a, b, c)); - d = sum(d, t1); - h = sum(t1, t2); -} - -template -xint256_t inline read8(const block8& blocks) NOEXCEPT -{ - constexpr auto four = sizeof(uint32_t); - BC_PUSH_WARNING(NO_ARRAY_INDEXING) - const auto value = set( - from_little_endian(array_cast(blocks[0])), - from_little_endian(array_cast(blocks[1])), - from_little_endian(array_cast(blocks[2])), - from_little_endian(array_cast(blocks[3])), - from_little_endian(array_cast(blocks[4])), - from_little_endian(array_cast(blocks[5])), - from_little_endian(array_cast(blocks[6])), - from_little_endian(array_cast(blocks[7]))); - BC_POP_WARNING() - - return shuffle(value, set( - 0x0c0d0e0ful, 0x08090a0bul, 0x04050607ul, 0x00010203ul, - 0x0c0d0e0ful, 0x08090a0bul, 0x04050607ul, 0x00010203ul)); -} - -template -void inline write8(digest8& hashes, xint256_t value) NOEXCEPT -{ - value = shuffle(value, set( - 0x0c0d0e0ful, 0x08090a0bul, 0x04050607ul, 0x00010203ul, - 0x0c0d0e0ful, 0x08090a0bul, 0x04050607ul, 0x00010203ul)); - - constexpr auto four = sizeof(uint32_t); - BC_PUSH_WARNING(NO_ARRAY_INDEXING) - array_cast(hashes[0]) = to_little_endian(get<7>(value)); - array_cast(hashes[1]) = to_little_endian(get<6>(value)); - array_cast(hashes[2]) = to_little_endian(get<5>(value)); - array_cast(hashes[3]) = to_little_endian(get<4>(value)); - array_cast(hashes[4]) = to_little_endian(get<3>(value)); - array_cast(hashes[5]) = to_little_endian(get<2>(value)); - array_cast(hashes[6]) = to_little_endian(get<1>(value)); - array_cast(hashes[7]) = to_little_endian(get<0>(value)); - BC_POP_WARNING() -} - -// Eight blocks in eight lanes, doubled. -void merkle_avx2(digest8& out, const block8& blocks) NOEXCEPT -{ - // Transform 1. - auto a = set(0x6a09e667ul); - auto b = set(0xbb67ae85ul); - auto c = set(0x3c6ef372ul); - auto d = set(0xa54ff53aul); - auto e = set(0x510e527ful); - auto f = set(0x9b05688cul); - auto g = set(0x1f83d9abul); - auto h = set(0x5be0cd19ul); - - xint256_t w00, w01, w02, w03, w04, w05, w06, w07; - xint256_t w08, w09, w10, w11, w12, w13, w14, w15; - - round(a, b, c, d, e, f, g, h, sum(set(0x428a2f98ul), w00 = read8< 0>(blocks))); - round(h, a, b, c, d, e, f, g, sum(set(0x71374491ul), w01 = read8< 4>(blocks))); - round(g, h, a, b, c, d, e, f, sum(set(0xb5c0fbcful), w02 = read8< 8>(blocks))); - round(f, g, h, a, b, c, d, e, sum(set(0xe9b5dba5ul), w03 = read8<12>(blocks))); - round(e, f, g, h, a, b, c, d, sum(set(0x3956c25bul), w04 = read8<16>(blocks))); - round(d, e, f, g, h, a, b, c, sum(set(0x59f111f1ul), w05 = read8<20>(blocks))); - round(c, d, e, f, g, h, a, b, sum(set(0x923f82a4ul), w06 = read8<24>(blocks))); - round(b, c, d, e, f, g, h, a, sum(set(0xab1c5ed5ul), w07 = read8<28>(blocks))); - round(a, b, c, d, e, f, g, h, sum(set(0xd807aa98ul), w08 = read8<32>(blocks))); - round(h, a, b, c, d, e, f, g, sum(set(0x12835b01ul), w09 = read8<36>(blocks))); - round(g, h, a, b, c, d, e, f, sum(set(0x243185beul), w10 = read8<40>(blocks))); - round(f, g, h, a, b, c, d, e, sum(set(0x550c7dc3ul), w11 = read8<44>(blocks))); - round(e, f, g, h, a, b, c, d, sum(set(0x72be5d74ul), w12 = read8<48>(blocks))); - round(d, e, f, g, h, a, b, c, sum(set(0x80deb1feul), w13 = read8<52>(blocks))); - round(c, d, e, f, g, h, a, b, sum(set(0x9bdc06a7ul), w14 = read8<56>(blocks))); - round(b, c, d, e, f, g, h, a, sum(set(0xc19bf174ul), w15 = read8<60>(blocks))); - round(a, b, c, d, e, f, g, h, sum(set(0xe49b69c1ul), inc(w00, sigma1(w14), w09, sigma0(w01)))); - round(h, a, b, c, d, e, f, g, sum(set(0xefbe4786ul), inc(w01, sigma1(w15), w10, sigma0(w02)))); - round(g, h, a, b, c, d, e, f, sum(set(0x0fc19dc6ul), inc(w02, sigma1(w00), w11, sigma0(w03)))); - round(f, g, h, a, b, c, d, e, sum(set(0x240ca1ccul), inc(w03, sigma1(w01), w12, sigma0(w04)))); - round(e, f, g, h, a, b, c, d, sum(set(0x2de92c6ful), inc(w04, sigma1(w02), w13, sigma0(w05)))); - round(d, e, f, g, h, a, b, c, sum(set(0x4a7484aaul), inc(w05, sigma1(w03), w14, sigma0(w06)))); - round(c, d, e, f, g, h, a, b, sum(set(0x5cb0a9dcul), inc(w06, sigma1(w04), w15, sigma0(w07)))); - round(b, c, d, e, f, g, h, a, sum(set(0x76f988daul), inc(w07, sigma1(w05), w00, sigma0(w08)))); - round(a, b, c, d, e, f, g, h, sum(set(0x983e5152ul), inc(w08, sigma1(w06), w01, sigma0(w09)))); - round(h, a, b, c, d, e, f, g, sum(set(0xa831c66dul), inc(w09, sigma1(w07), w02, sigma0(w10)))); - round(g, h, a, b, c, d, e, f, sum(set(0xb00327c8ul), inc(w10, sigma1(w08), w03, sigma0(w11)))); - round(f, g, h, a, b, c, d, e, sum(set(0xbf597fc7ul), inc(w11, sigma1(w09), w04, sigma0(w12)))); - round(e, f, g, h, a, b, c, d, sum(set(0xc6e00bf3ul), inc(w12, sigma1(w10), w05, sigma0(w13)))); - round(d, e, f, g, h, a, b, c, sum(set(0xd5a79147ul), inc(w13, sigma1(w11), w06, sigma0(w14)))); - round(c, d, e, f, g, h, a, b, sum(set(0x06ca6351ul), inc(w14, sigma1(w12), w07, sigma0(w15)))); - round(b, c, d, e, f, g, h, a, sum(set(0x14292967ul), inc(w15, sigma1(w13), w08, sigma0(w00)))); - round(a, b, c, d, e, f, g, h, sum(set(0x27b70a85ul), inc(w00, sigma1(w14), w09, sigma0(w01)))); - round(h, a, b, c, d, e, f, g, sum(set(0x2e1b2138ul), inc(w01, sigma1(w15), w10, sigma0(w02)))); - round(g, h, a, b, c, d, e, f, sum(set(0x4d2c6dfcul), inc(w02, sigma1(w00), w11, sigma0(w03)))); - round(f, g, h, a, b, c, d, e, sum(set(0x53380d13ul), inc(w03, sigma1(w01), w12, sigma0(w04)))); - round(e, f, g, h, a, b, c, d, sum(set(0x650a7354ul), inc(w04, sigma1(w02), w13, sigma0(w05)))); - round(d, e, f, g, h, a, b, c, sum(set(0x766a0abbul), inc(w05, sigma1(w03), w14, sigma0(w06)))); - round(c, d, e, f, g, h, a, b, sum(set(0x81c2c92eul), inc(w06, sigma1(w04), w15, sigma0(w07)))); - round(b, c, d, e, f, g, h, a, sum(set(0x92722c85ul), inc(w07, sigma1(w05), w00, sigma0(w08)))); - round(a, b, c, d, e, f, g, h, sum(set(0xa2bfe8a1ul), inc(w08, sigma1(w06), w01, sigma0(w09)))); - round(h, a, b, c, d, e, f, g, sum(set(0xa81a664bul), inc(w09, sigma1(w07), w02, sigma0(w10)))); - round(g, h, a, b, c, d, e, f, sum(set(0xc24b8b70ul), inc(w10, sigma1(w08), w03, sigma0(w11)))); - round(f, g, h, a, b, c, d, e, sum(set(0xc76c51a3ul), inc(w11, sigma1(w09), w04, sigma0(w12)))); - round(e, f, g, h, a, b, c, d, sum(set(0xd192e819ul), inc(w12, sigma1(w10), w05, sigma0(w13)))); - round(d, e, f, g, h, a, b, c, sum(set(0xd6990624ul), inc(w13, sigma1(w11), w06, sigma0(w14)))); - round(c, d, e, f, g, h, a, b, sum(set(0xf40e3585ul), inc(w14, sigma1(w12), w07, sigma0(w15)))); - round(b, c, d, e, f, g, h, a, sum(set(0x106aa070ul), inc(w15, sigma1(w13), w08, sigma0(w00)))); - round(a, b, c, d, e, f, g, h, sum(set(0x19a4c116ul), inc(w00, sigma1(w14), w09, sigma0(w01)))); - round(h, a, b, c, d, e, f, g, sum(set(0x1e376c08ul), inc(w01, sigma1(w15), w10, sigma0(w02)))); - round(g, h, a, b, c, d, e, f, sum(set(0x2748774cul), inc(w02, sigma1(w00), w11, sigma0(w03)))); - round(f, g, h, a, b, c, d, e, sum(set(0x34b0bcb5ul), inc(w03, sigma1(w01), w12, sigma0(w04)))); - round(e, f, g, h, a, b, c, d, sum(set(0x391c0cb3ul), inc(w04, sigma1(w02), w13, sigma0(w05)))); - round(d, e, f, g, h, a, b, c, sum(set(0x4ed8aa4aul), inc(w05, sigma1(w03), w14, sigma0(w06)))); - round(c, d, e, f, g, h, a, b, sum(set(0x5b9cca4ful), inc(w06, sigma1(w04), w15, sigma0(w07)))); - round(b, c, d, e, f, g, h, a, sum(set(0x682e6ff3ul), inc(w07, sigma1(w05), w00, sigma0(w08)))); - round(a, b, c, d, e, f, g, h, sum(set(0x748f82eeul), inc(w08, sigma1(w06), w01, sigma0(w09)))); - round(h, a, b, c, d, e, f, g, sum(set(0x78a5636ful), inc(w09, sigma1(w07), w02, sigma0(w10)))); - round(g, h, a, b, c, d, e, f, sum(set(0x84c87814ul), inc(w10, sigma1(w08), w03, sigma0(w11)))); - round(f, g, h, a, b, c, d, e, sum(set(0x8cc70208ul), inc(w11, sigma1(w09), w04, sigma0(w12)))); - round(e, f, g, h, a, b, c, d, sum(set(0x90befffaul), inc(w12, sigma1(w10), w05, sigma0(w13)))); - round(d, e, f, g, h, a, b, c, sum(set(0xa4506cebul), inc(w13, sigma1(w11), w06, sigma0(w14)))); - round(c, d, e, f, g, h, a, b, sum(set(0xbef9a3f7ul), inc(w14, sigma1(w12), w07, sigma0(w15)))); - round(b, c, d, e, f, g, h, a, sum(set(0xc67178f2ul), inc(w15, sigma1(w13), w08, sigma0(w00)))); - - a = sum(a, set(0x6a09e667ul)); - b = sum(b, set(0xbb67ae85ul)); - c = sum(c, set(0x3c6ef372ul)); - d = sum(d, set(0xa54ff53aul)); - e = sum(e, set(0x510e527ful)); - f = sum(f, set(0x9b05688cul)); - g = sum(g, set(0x1f83d9abul)); - h = sum(h, set(0x5be0cd19ul)); - - const xint256_t t0 = a, t1 = b, t2 = c, t3 = d, t4 = e, t5 = f, t6 = g, t7 = h; - - // Transform 2. - round(a, b, c, d, e, f, g, h, set(0xc28a2f98ul)); - round(h, a, b, c, d, e, f, g, set(0x71374491ul)); - round(g, h, a, b, c, d, e, f, set(0xb5c0fbcful)); - round(f, g, h, a, b, c, d, e, set(0xe9b5dba5ul)); - round(e, f, g, h, a, b, c, d, set(0x3956c25bul)); - round(d, e, f, g, h, a, b, c, set(0x59f111f1ul)); - round(c, d, e, f, g, h, a, b, set(0x923f82a4ul)); - round(b, c, d, e, f, g, h, a, set(0xab1c5ed5ul)); - round(a, b, c, d, e, f, g, h, set(0xd807aa98ul)); - round(h, a, b, c, d, e, f, g, set(0x12835b01ul)); - round(g, h, a, b, c, d, e, f, set(0x243185beul)); - round(f, g, h, a, b, c, d, e, set(0x550c7dc3ul)); - round(e, f, g, h, a, b, c, d, set(0x72be5d74ul)); - round(d, e, f, g, h, a, b, c, set(0x80deb1feul)); - round(c, d, e, f, g, h, a, b, set(0x9bdc06a7ul)); - round(b, c, d, e, f, g, h, a, set(0xc19bf374ul)); - round(a, b, c, d, e, f, g, h, set(0x649b69c1ul)); - round(h, a, b, c, d, e, f, g, set(0xf0fe4786ul)); - round(g, h, a, b, c, d, e, f, set(0x0fe1edc6ul)); - round(f, g, h, a, b, c, d, e, set(0x240cf254ul)); - round(e, f, g, h, a, b, c, d, set(0x4fe9346ful)); - round(d, e, f, g, h, a, b, c, set(0x6cc984beul)); - round(c, d, e, f, g, h, a, b, set(0x61b9411eul)); - round(b, c, d, e, f, g, h, a, set(0x16f988faul)); - round(a, b, c, d, e, f, g, h, set(0xf2c65152ul)); - round(h, a, b, c, d, e, f, g, set(0xa88e5a6dul)); - round(g, h, a, b, c, d, e, f, set(0xb019fc65ul)); - round(f, g, h, a, b, c, d, e, set(0xb9d99ec7ul)); - round(e, f, g, h, a, b, c, d, set(0x9a1231c3ul)); - round(d, e, f, g, h, a, b, c, set(0xe70eeaa0ul)); - round(c, d, e, f, g, h, a, b, set(0xfdb1232bul)); - round(b, c, d, e, f, g, h, a, set(0xc7353eb0ul)); - round(a, b, c, d, e, f, g, h, set(0x3069bad5ul)); - round(h, a, b, c, d, e, f, g, set(0xcb976d5ful)); - round(g, h, a, b, c, d, e, f, set(0x5a0f118ful)); - round(f, g, h, a, b, c, d, e, set(0xdc1eeefdul)); - round(e, f, g, h, a, b, c, d, set(0x0a35b689ul)); - round(d, e, f, g, h, a, b, c, set(0xde0b7a04ul)); - round(c, d, e, f, g, h, a, b, set(0x58f4ca9dul)); - round(b, c, d, e, f, g, h, a, set(0xe15d5b16ul)); - round(a, b, c, d, e, f, g, h, set(0x007f3e86ul)); - round(h, a, b, c, d, e, f, g, set(0x37088980ul)); - round(g, h, a, b, c, d, e, f, set(0xa507ea32ul)); - round(f, g, h, a, b, c, d, e, set(0x6fab9537ul)); - round(e, f, g, h, a, b, c, d, set(0x17406110ul)); - round(d, e, f, g, h, a, b, c, set(0x0d8cd6f1ul)); - round(c, d, e, f, g, h, a, b, set(0xcdaa3b6dul)); - round(b, c, d, e, f, g, h, a, set(0xc0bbbe37ul)); - round(a, b, c, d, e, f, g, h, set(0x83613bdaul)); - round(h, a, b, c, d, e, f, g, set(0xdb48a363ul)); - round(g, h, a, b, c, d, e, f, set(0x0b02e931ul)); - round(f, g, h, a, b, c, d, e, set(0x6fd15ca7ul)); - round(e, f, g, h, a, b, c, d, set(0x521afacaul)); - round(d, e, f, g, h, a, b, c, set(0x31338431ul)); - round(c, d, e, f, g, h, a, b, set(0x6ed41a95ul)); - round(b, c, d, e, f, g, h, a, set(0x6d437890ul)); - round(a, b, c, d, e, f, g, h, set(0xc39c91f2ul)); - round(h, a, b, c, d, e, f, g, set(0x9eccabbdul)); - round(g, h, a, b, c, d, e, f, set(0xb5c9a0e6ul)); - round(f, g, h, a, b, c, d, e, set(0x532fb63cul)); - round(e, f, g, h, a, b, c, d, set(0xd2c741c6ul)); - round(d, e, f, g, h, a, b, c, set(0x07237ea3ul)); - round(c, d, e, f, g, h, a, b, set(0xa4954b68ul)); - round(b, c, d, e, f, g, h, a, set(0x4c191d76ul)); - - w00 = sum(t0, a); - w01 = sum(t1, b); - w02 = sum(t2, c); - w03 = sum(t3, d); - w04 = sum(t4, e); - w05 = sum(t5, f); - w06 = sum(t6, g); - w07 = sum(t7, h); - - // Transform 3. - a = set(0x6a09e667ul); - b = set(0xbb67ae85ul); - c = set(0x3c6ef372ul); - d = set(0xa54ff53aul); - e = set(0x510e527ful); - f = set(0x9b05688cul); - g = set(0x1f83d9abul); - h = set(0x5be0cd19ul); - - round(a, b, c, d, e, f, g, h, sum(set(0x428a2f98ul), w00)); - round(h, a, b, c, d, e, f, g, sum(set(0x71374491ul), w01)); - round(g, h, a, b, c, d, e, f, sum(set(0xb5c0fbcful), w02)); - round(f, g, h, a, b, c, d, e, sum(set(0xe9b5dba5ul), w03)); - round(e, f, g, h, a, b, c, d, sum(set(0x3956c25bul), w04)); - round(d, e, f, g, h, a, b, c, sum(set(0x59f111f1ul), w05)); - round(c, d, e, f, g, h, a, b, sum(set(0x923f82a4ul), w06)); - round(b, c, d, e, f, g, h, a, sum(set(0xab1c5ed5ul), w07)); - round(a, b, c, d, e, f, g, h, set(0x5807aa98ul)); - round(h, a, b, c, d, e, f, g, set(0x12835b01ul)); - round(g, h, a, b, c, d, e, f, set(0x243185beul)); - round(f, g, h, a, b, c, d, e, set(0x550c7dc3ul)); - round(e, f, g, h, a, b, c, d, set(0x72be5d74ul)); - round(d, e, f, g, h, a, b, c, set(0x80deb1feul)); - round(c, d, e, f, g, h, a, b, set(0x9bdc06a7ul)); - round(b, c, d, e, f, g, h, a, set(0xc19bf274ul)); - round(a, b, c, d, e, f, g, h, sum(set(0xe49b69c1ul), inc(w00, sigma0(w01)))); - round(h, a, b, c, d, e, f, g, sum(set(0xefbe4786ul), inc(w01, set(0xa00000ul), sigma0(w02)))); - round(g, h, a, b, c, d, e, f, sum(set(0x0fc19dc6ul), inc(w02, sigma1(w00), sigma0(w03)))); - round(f, g, h, a, b, c, d, e, sum(set(0x240ca1ccul), inc(w03, sigma1(w01), sigma0(w04)))); - round(e, f, g, h, a, b, c, d, sum(set(0x2de92c6ful), inc(w04, sigma1(w02), sigma0(w05)))); - round(d, e, f, g, h, a, b, c, sum(set(0x4a7484aaul), inc(w05, sigma1(w03), sigma0(w06)))); - round(c, d, e, f, g, h, a, b, sum(set(0x5cb0a9dcul), inc(w06, sigma1(w04), set(0x100ul), sigma0(w07)))); - round(b, c, d, e, f, g, h, a, sum(set(0x76f988daul), inc(w07, sigma1(w05), w00, set(0x11002000ul)))); - round(a, b, c, d, e, f, g, h, sum(set(0x983e5152ul), w08 = sum(set(0x80000000ul), sigma1(w06), w01))); - round(h, a, b, c, d, e, f, g, sum(set(0xa831c66dul), w09 = sum(sigma1(w07), w02))); - round(g, h, a, b, c, d, e, f, sum(set(0xb00327c8ul), w10 = sum(sigma1(w08), w03))); - round(f, g, h, a, b, c, d, e, sum(set(0xbf597fc7ul), w11 = sum(sigma1(w09), w04))); - round(e, f, g, h, a, b, c, d, sum(set(0xc6e00bf3ul), w12 = sum(sigma1(w10), w05))); - round(d, e, f, g, h, a, b, c, sum(set(0xd5a79147ul), w13 = sum(sigma1(w11), w06))); - round(c, d, e, f, g, h, a, b, sum(set(0x06ca6351ul), w14 = sum(sigma1(w12), w07, set(0x400022ul)))); - round(b, c, d, e, f, g, h, a, sum(set(0x14292967ul), w15 = sum(set(0x100ul), sigma1(w13), w08, sigma0(w00)))); - round(a, b, c, d, e, f, g, h, sum(set(0x27b70a85ul), inc(w00, sigma1(w14), w09, sigma0(w01)))); - round(h, a, b, c, d, e, f, g, sum(set(0x2e1b2138ul), inc(w01, sigma1(w15), w10, sigma0(w02)))); - round(g, h, a, b, c, d, e, f, sum(set(0x4d2c6dfcul), inc(w02, sigma1(w00), w11, sigma0(w03)))); - round(f, g, h, a, b, c, d, e, sum(set(0x53380d13ul), inc(w03, sigma1(w01), w12, sigma0(w04)))); - round(e, f, g, h, a, b, c, d, sum(set(0x650a7354ul), inc(w04, sigma1(w02), w13, sigma0(w05)))); - round(d, e, f, g, h, a, b, c, sum(set(0x766a0abbul), inc(w05, sigma1(w03), w14, sigma0(w06)))); - round(c, d, e, f, g, h, a, b, sum(set(0x81c2c92eul), inc(w06, sigma1(w04), w15, sigma0(w07)))); - round(b, c, d, e, f, g, h, a, sum(set(0x92722c85ul), inc(w07, sigma1(w05), w00, sigma0(w08)))); - round(a, b, c, d, e, f, g, h, sum(set(0xa2bfe8a1ul), inc(w08, sigma1(w06), w01, sigma0(w09)))); - round(h, a, b, c, d, e, f, g, sum(set(0xa81a664bul), inc(w09, sigma1(w07), w02, sigma0(w10)))); - round(g, h, a, b, c, d, e, f, sum(set(0xc24b8b70ul), inc(w10, sigma1(w08), w03, sigma0(w11)))); - round(f, g, h, a, b, c, d, e, sum(set(0xc76c51a3ul), inc(w11, sigma1(w09), w04, sigma0(w12)))); - round(e, f, g, h, a, b, c, d, sum(set(0xd192e819ul), inc(w12, sigma1(w10), w05, sigma0(w13)))); - round(d, e, f, g, h, a, b, c, sum(set(0xd6990624ul), inc(w13, sigma1(w11), w06, sigma0(w14)))); - round(c, d, e, f, g, h, a, b, sum(set(0xf40e3585ul), inc(w14, sigma1(w12), w07, sigma0(w15)))); - round(b, c, d, e, f, g, h, a, sum(set(0x106aa070ul), inc(w15, sigma1(w13), w08, sigma0(w00)))); - round(a, b, c, d, e, f, g, h, sum(set(0x19a4c116ul), inc(w00, sigma1(w14), w09, sigma0(w01)))); - round(h, a, b, c, d, e, f, g, sum(set(0x1e376c08ul), inc(w01, sigma1(w15), w10, sigma0(w02)))); - round(g, h, a, b, c, d, e, f, sum(set(0x2748774cul), inc(w02, sigma1(w00), w11, sigma0(w03)))); - round(f, g, h, a, b, c, d, e, sum(set(0x34b0bcb5ul), inc(w03, sigma1(w01), w12, sigma0(w04)))); - round(e, f, g, h, a, b, c, d, sum(set(0x391c0cb3ul), inc(w04, sigma1(w02), w13, sigma0(w05)))); - round(d, e, f, g, h, a, b, c, sum(set(0x4ed8aa4aul), inc(w05, sigma1(w03), w14, sigma0(w06)))); - round(c, d, e, f, g, h, a, b, sum(set(0x5b9cca4ful), inc(w06, sigma1(w04), w15, sigma0(w07)))); - round(b, c, d, e, f, g, h, a, sum(set(0x682e6ff3ul), inc(w07, sigma1(w05), w00, sigma0(w08)))); - round(a, b, c, d, e, f, g, h, sum(set(0x748f82eeul), inc(w08, sigma1(w06), w01, sigma0(w09)))); - round(h, a, b, c, d, e, f, g, sum(set(0x78a5636ful), inc(w09, sigma1(w07), w02, sigma0(w10)))); - round(g, h, a, b, c, d, e, f, sum(set(0x84c87814ul), inc(w10, sigma1(w08), w03, sigma0(w11)))); - round(f, g, h, a, b, c, d, e, sum(set(0x8cc70208ul), inc(w11, sigma1(w09), w04, sigma0(w12)))); - round(e, f, g, h, a, b, c, d, sum(set(0x90befffaul), inc(w12, sigma1(w10), w05, sigma0(w13)))); - round(d, e, f, g, h, a, b, c, sum(set(0xa4506cebul), inc(w13, sigma1(w11), w06, sigma0(w14)))); - round(c, d, e, f, g, h, a, b, sum(set(0xbef9a3f7ul), w14, sigma1(w12), w07, sigma0(w15))); - round(b, c, d, e, f, g, h, a, sum(set(0xc67178f2ul), w15, sigma1(w13), w08, sigma0(w00))); - - // Output. - write8< 0>(out, sum(a, set(0x6a09e667ul))); - write8< 4>(out, sum(b, set(0xbb67ae85ul))); - write8< 8>(out, sum(c, set(0x3c6ef372ul))); - write8<12>(out, sum(d, set(0xa54ff53aul))); - write8<16>(out, sum(e, set(0x510e527ful))); - write8<20>(out, sum(f, set(0x9b05688cul))); - write8<24>(out, sum(g, set(0x1f83d9abul))); - write8<28>(out, sum(h, set(0x5be0cd19ul))); -} - -#endif // HAVE_XCPU - -#endif // DISABLED - -} // namespace sha256 -} // namespace system -} // namespace libbitcoin diff --git a/test/hash/hash.hpp b/test/hash/hash.hpp index fc4ff29e74..f209cab12d 100644 --- a/test/hash/hash.hpp +++ b/test/hash/hash.hpp @@ -176,7 +176,7 @@ constexpr auto alpha2_count = 16'777'216_size; constexpr auto long_alpha_size = alpha2_size * alpha2_count; static const auto alpha2 = to_array("abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmno"); static const std_vector long_alpha(alpha2_count, alpha2); -static const auto long_alpha_data = pointer_cast(long_alpha.front().data()); +static const auto long_alpha_data = pointer_cast(long_alpha.front().data()); static const auto sha_6 = to_chunk(unsafe_array_cast(long_alpha_data)); constexpr auto sha_test_count = 6; #else diff --git a/test/hash/performance/performance.hpp b/test/hash/performance/performance.hpp index 1973f9818f..56bfb74653 100644 --- a/test/hash/performance/performance.hpp +++ b/test/hash/performance/performance.hpp @@ -329,7 +329,7 @@ bool test_merkle(std::ostream& out, float ghz = 3.0f, for (size_t seed = 0; seed < Count; ++seed) { constexpr auto size = array_count; - std_vector digests{}; + std::vector digests{}; digests.reserve(Size * two); for (size_t blocks = 0; blocks < Size; ++blocks)