diff --git a/Makefile.am b/Makefile.am
index ccb19304d4..f6d86b09d9 100755
--- a/Makefile.am
+++ b/Makefile.am
@@ -90,12 +90,6 @@ src_libbitcoin_system_la_SOURCES = \
src/hash/accumulator.cpp \
src/hash/checksum.cpp \
src/hash/siphash.cpp \
- src/hash/vectorization/sha256_1_native.cpp \
- src/hash/vectorization/sha256_2_shani.cpp \
- src/hash/vectorization/sha256_4_neon.cpp \
- src/hash/vectorization/sha256_4_sse4.cpp \
- src/hash/vectorization/sha256_4_sse41.cpp \
- src/hash/vectorization/sha256_8_avx2.cpp \
src/math/math.cpp \
src/radix/base_10.cpp \
src/radix/base_2048.cpp \
@@ -622,6 +616,7 @@ include_bitcoin_system_impl_hash_sha_HEADERS = \
include/bitcoin/system/impl/hash/sha/algorithm_double.ipp \
include/bitcoin/system/impl/hash/sha/algorithm_functions.ipp \
include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp \
+ include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp \
include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp \
include/bitcoin/system/impl/hash/sha/algorithm_native.ipp \
include/bitcoin/system/impl/hash/sha/algorithm_padding.ipp \
diff --git a/builds/cmake/CMakeLists.txt b/builds/cmake/CMakeLists.txt
index 7547906182..4ebdf5b59b 100644
--- a/builds/cmake/CMakeLists.txt
+++ b/builds/cmake/CMakeLists.txt
@@ -529,12 +529,6 @@ add_library( ${CANONICAL_LIB_NAME}
"../../src/hash/accumulator.cpp"
"../../src/hash/checksum.cpp"
"../../src/hash/siphash.cpp"
- "../../src/hash/vectorization/sha256_1_native.cpp"
- "../../src/hash/vectorization/sha256_2_shani.cpp"
- "../../src/hash/vectorization/sha256_4_neon.cpp"
- "../../src/hash/vectorization/sha256_4_sse4.cpp"
- "../../src/hash/vectorization/sha256_4_sse41.cpp"
- "../../src/hash/vectorization/sha256_8_avx2.cpp"
"../../src/math/math.cpp"
"../../src/radix/base_10.cpp"
"../../src/radix/base_2048.cpp"
diff --git a/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj b/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj
index a70cf70ca2..69f4210f14 100644
--- a/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj
+++ b/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj
@@ -155,12 +155,6 @@
-
-
-
-
-
-
@@ -548,6 +542,7 @@
+
diff --git a/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj.filters b/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj.filters
index 5596a34380..9f54b85f9c 100644
--- a/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj.filters
+++ b/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj.filters
@@ -8,157 +8,157 @@
- {39F60708-FF48-4C22-0000-000000000009}
+ {39F60708-FF48-4C22-0000-000000000008}
- {39F60708-FF48-4C22-0000-000000000010}
+ {39F60708-FF48-4C22-0000-000000000009}
- {39F60708-FF48-4C22-0000-0000000000A1}
+ {39F60708-FF48-4C22-0000-000000000010}
- {39F60708-FF48-4C22-0000-0000000000B1}
+ {39F60708-FF48-4C22-0000-0000000000A1}
- {39F60708-FF48-4C22-0000-0000000000C2}
+ {39F60708-FF48-4C22-0000-0000000000B2}
- {39F60708-FF48-4C22-0000-0000000000C1}
+ {39F60708-FF48-4C22-0000-0000000000B1}
- {39F60708-FF48-4C22-0000-0000000000D1}
+ {39F60708-FF48-4C22-0000-0000000000C1}
- {39F60708-FF48-4C22-0000-0000000000E1}
+ {39F60708-FF48-4C22-0000-0000000000D1}
- {39F60708-FF48-4C22-0000-0000000000F1}
+ {39F60708-FF48-4C22-0000-0000000000E1}
- {39F60708-FF48-4C22-0000-000000000002}
+ {39F60708-FF48-4C22-0000-0000000000F1}
- {39F60708-FF48-4C22-0000-000000000003}
+ {39F60708-FF48-4C22-0000-000000000002}
- {39F60708-FF48-4C22-0000-0000000000D2}
+ {39F60708-FF48-4C22-0000-0000000000C2}
- {39F60708-FF48-4C22-0000-0000000000E2}
+ {39F60708-FF48-4C22-0000-0000000000D2}
- {39F60708-FF48-4C22-0000-000000000004}
+ {39F60708-FF48-4C22-0000-000000000003}
- {39F60708-FF48-4C22-0000-0000000000F2}
+ {39F60708-FF48-4C22-0000-0000000000E2}
- {39F60708-FF48-4C22-0000-000000000003}
+ {39F60708-FF48-4C22-0000-0000000000F2}
- {39F60708-FF48-4C22-0000-000000000004}
+ {39F60708-FF48-4C22-0000-000000000003}
- {39F60708-FF48-4C22-0000-000000000005}
+ {39F60708-FF48-4C22-0000-000000000004}
- {39F60708-FF48-4C22-0000-0000000000A3}
+ {39F60708-FF48-4C22-0000-000000000012}
- {39F60708-FF48-4C22-0000-0000000000B3}
+ {39F60708-FF48-4C22-0000-0000000000A3}
- {39F60708-FF48-4C22-0000-000000000006}
+ {39F60708-FF48-4C22-0000-000000000005}
- {39F60708-FF48-4C22-0000-000000000007}
+ {39F60708-FF48-4C22-0000-000000000006}
- {39F60708-FF48-4C22-0000-000000000008}
+ {39F60708-FF48-4C22-0000-000000000007}
- {39F60708-FF48-4C22-0000-000000000009}
+ {39F60708-FF48-4C22-0000-000000000008}
- {39F60708-FF48-4C22-0000-000000000010}
+ {39F60708-FF48-4C22-0000-000000000009}
- {39F60708-FF48-4C22-0000-0000000000C3}
+ {39F60708-FF48-4C22-0000-0000000000B3}
- {39F60708-FF48-4C22-0000-0000000000D3}
+ {39F60708-FF48-4C22-0000-0000000000C3}
- {39F60708-FF48-4C22-0000-000000000011}
+ {39F60708-FF48-4C22-0000-000000000010}
- {39F60708-FF48-4C22-0000-0000000000E3}
+ {39F60708-FF48-4C22-0000-0000000000D3}
- {39F60708-FF48-4C22-0000-000000000012}
+ {39F60708-FF48-4C22-0000-000000000011}
- {39F60708-FF48-4C22-0000-000000000005}
+ {39F60708-FF48-4C22-0000-000000000004}
- {39F60708-FF48-4C22-0000-0000000000F3}
+ {39F60708-FF48-4C22-0000-0000000000E3}
- {39F60708-FF48-4C22-0000-000000000004}
+ {39F60708-FF48-4C22-0000-0000000000F3}
- {39F60708-FF48-4C22-0000-000000000006}
+ {39F60708-FF48-4C22-0000-000000000005}
- {39F60708-FF48-4C22-0000-000000000007}
+ {39F60708-FF48-4C22-0000-000000000006}
- {39F60708-FF48-4C22-0000-000000000008}
+ {39F60708-FF48-4C22-0000-000000000007}
- {39F60708-FF48-4C22-0000-000000000009}
+ {39F60708-FF48-4C22-0000-000000000008}
- {39F60708-FF48-4C22-0000-000000000010}
+ {39F60708-FF48-4C22-0000-000000000009}
- {39F60708-FF48-4C22-0000-000000000005}
+ {39F60708-FF48-4C22-0000-000000000004}
- {39F60708-FF48-4C22-0000-000000000006}
+ {39F60708-FF48-4C22-0000-000000000005}
- {39F60708-FF48-4C22-0000-000000000007}
+ {39F60708-FF48-4C22-0000-000000000006}
- {39F60708-FF48-4C22-0000-000000000008}
+ {39F60708-FF48-4C22-0000-000000000007}
- {39F60708-FF48-4C22-0000-000000000011}
+ {39F60708-FF48-4C22-0000-000000000010}
- {39F60708-FF48-4C22-0000-000000000009}
+ {39F60708-FF48-4C22-0000-000000000008}
- {39F60708-FF48-4C22-0000-0000000000A2}
+ {39F60708-FF48-4C22-0000-000000000011}
- {39F60708-FF48-4C22-0000-000000000010}
+ {39F60708-FF48-4C22-0000-000000000009}
- {39F60708-FF48-4C22-0000-000000000011}
+ {39F60708-FF48-4C22-0000-000000000010}
- {39F60708-FF48-4C22-0000-000000000012}
+ {39F60708-FF48-4C22-0000-000000000011}
- {39F60708-FF48-4C22-0000-0000000000B2}
+ {39F60708-FF48-4C22-0000-0000000000A2}
- {39F60708-FF48-4C22-0000-000000000013}
+ {39F60708-FF48-4C22-0000-000000000012}
- {39F60708-FF48-4C22-0000-0000000000A4}
+ {39F60708-FF48-4C22-0000-000000000013}
{39F60708-FF48-4C22-0000-000000000000}
@@ -187,9 +187,6 @@
{39F60708-FF48-4C22-0000-000000000007}
-
- {39F60708-FF48-4C22-0000-000000000001}
-
{39F60708-FF48-4C22-0000-000000000008}
@@ -206,31 +203,31 @@
{39F60708-FF48-4C22-0000-00000000000C}
- {39F60708-FF48-4C22-0000-000000000002}
+ {39F60708-FF48-4C22-0000-000000000001}
{39F60708-FF48-4C22-0000-00000000000D}
- {39F60708-FF48-4C22-0000-000000000003}
+ {39F60708-FF48-4C22-0000-000000000002}
- {39F60708-FF48-4C22-0000-000000000006}
+ {39F60708-FF48-4C22-0000-000000000005}
- {39F60708-FF48-4C22-0000-000000000004}
+ {39F60708-FF48-4C22-0000-000000000003}
- {39F60708-FF48-4C22-0000-000000000007}
+ {39F60708-FF48-4C22-0000-000000000006}
- {39F60708-FF48-4C22-0000-000000000005}
+ {39F60708-FF48-4C22-0000-000000000004}
{39F60708-FF48-4C22-0000-00000000000E}
- {39F60708-FF48-4C22-0000-000000000008}
+ {39F60708-FF48-4C22-0000-000000000007}
@@ -390,24 +387,6 @@
src\hash
-
- src\hash\vectorization
-
-
- src\hash\vectorization
-
-
- src\hash\vectorization
-
-
- src\hash\vectorization
-
-
- src\hash\vectorization
-
-
- src\hash\vectorization
-
src\math
@@ -1519,6 +1498,9 @@
include\bitcoin\system\impl\hash\sha
+
+ include\bitcoin\system\impl\hash\sha
+
include\bitcoin\system\impl\hash\sha
diff --git a/include/bitcoin/system/hash/sha/algorithm.hpp b/include/bitcoin/system/hash/sha/algorithm.hpp
index 077beb84b2..48caa62dac 100644
--- a/include/bitcoin/system/hash/sha/algorithm.hpp
+++ b/include/bitcoin/system/hash/sha/algorithm.hpp
@@ -144,7 +144,8 @@ class algorithm
/// Intrinsics types.
/// -----------------------------------------------------------------------
- /// Extended integer capacity for uint32_t/uint64_t is 2/4/8/16 only.
+ /// Expand is multiple of buffer/state for Lane concurrent blocks.
+ /// Multiple blocks are "striped" across the expanded buffer in xWords.
template > = true>
using xblock_t = std_array;
@@ -157,6 +158,17 @@ class algorithm
template = true>
using xchunk_t = std_array;
+ /// Wide is casting of buffer_t to xWord for single block concurrency.
+ /// This is not multi-block or block striping, just larger words.
+ template = true>
+ using wbuffer_t = std_array;
+
+ template = true>
+ using wstate_t = std_array;
+
+ /// Other types.
+ /// -----------------------------------------------------------------------
+
using uint = unsigned int;
using idigests_t = mutable_iterable;
using pad_t = std_array
INLINE static constexpr void prepare(auto& buffer) NOEXCEPT;
- INLINE static constexpr void add_k(auto& buffer) NOEXCEPT;
static constexpr void schedule_(auto& buffer) NOEXCEPT;
static constexpr void schedule(buffer_t& buffer) NOEXCEPT;
@@ -242,7 +253,7 @@ class algorithm
static constexpr void reinput(auto& buffer, const auto& state) NOEXCEPT;
- /// Iteration.
+ /// Iteration (message scheduling vectorized for multiple blocks).
/// -----------------------------------------------------------------------
template
@@ -280,7 +291,7 @@ class algorithm
const ablocks_t& blocks) NOEXCEPT;
INLINE static void iterate(state_t& state, iblocks_t& blocks) NOEXCEPT;
- /// Merkle hashing.
+ /// Merkle hashing (fully vectorized for multiple blocks).
/// -----------------------------------------------------------------------
template
@@ -311,7 +322,7 @@ class algorithm
VCONSTEXPR static void merkle_hash_(digests_t& digests,
size_t offset=zero) NOEXCEPT;
- /// sigma0 vectorization.
+ /// sigma0 vectorization (single blocks).
/// -----------------------------------------------------------------------
template = true>
@@ -328,22 +339,45 @@ class algorithm
INLINE static void schedule_sigma(xbuffer_t& xbuffer) NOEXCEPT;
INLINE static void schedule_sigma(buffer_t& buffer) NOEXCEPT;
- /// Native.
+ /// [K]onstant vectorization (single and multiple blocks).
+ /// -----------------------------------------------------------------------
+
+ template
+ INLINE static constexpr void konstant(auto& buffer) NOEXCEPT;
+
+ template
+ INLINE static void vector_konstant(wbuffer_t& wbuffer) NOEXCEPT;
+ INLINE static void vector_konstant(buffer_t& buffer) NOEXCEPT;
+
+ template
+ static constexpr void konstant(xbuffer_t& xbuffer) NOEXCEPT;
+ static constexpr void konstant(buffer_t& buffer) NOEXCEPT;
+ static constexpr void konstant_(auto& buffer) NOEXCEPT;
+
+ /// Native SHA optimizations (single blocks).
/// -----------------------------------------------------------------------
- static constexpr auto native_lanes = capacity;
- static constexpr auto native_rounds = SHA::rounds / native_lanes;
- using cbuffer_t = std_array;
- using cstate_t = std_array;
template
- INLINE static void prepare(cbuffer_t& buffer) NOEXCEPT;
- INLINE static void add_k(cbuffer_t& buffer) NOEXCEPT;
- static void schedule(cbuffer_t& buffer) NOEXCEPT;
+ INLINE static void prepare_native(wbuffer_t& wbuffer) NOEXCEPT;
+ static void schedule(wbuffer_t& wbuffer) NOEXCEPT;
template
INLINE static void schedule_native(xbuffer_t& xbuffer) NOEXCEPT;
INLINE static void schedule_native(buffer_t& buffer) NOEXCEPT;
+ template
+ INLINE static void round_native(wstate_t& state,
+ const wbuffer_t& wk) NOEXCEPT;
+
+ INLINE static void shuffle(wstate_t& wstate) NOEXCEPT;
+ INLINE static void unshuffle(wstate_t& wstate) NOEXCEPT;
+ INLINE static void summarize_native(wstate_t& out,
+ const wstate_t& in) NOEXCEPT;
+
+ template
+ INLINE static void compress_native(wstate_t& state,
+ const wbuffer_t& wbuffer) NOEXCEPT;
+
template
INLINE static void compress_native(xstate_t& xstate,
const xbuffer_t& xbuffer) NOEXCEPT;
@@ -381,6 +415,7 @@ BC_PUSH_WARNING(NO_POINTER_ARITHMETIC)
BC_PUSH_WARNING(NO_ARRAY_INDEXING)
#include
+#include
#include
#include
#include
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
index ada7bae8f4..f9da36fd0f 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
@@ -173,7 +173,6 @@ template
constexpr void CLASS::
compress_(auto& state, const auto& buffer) NOEXCEPT
{
- // SHA-NI/256: 64/4 = 16 quad rounds, 8/4 = 2 state elements.
// This is a copy (state type varies due to vectorization).
const auto start = state;
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp
new file mode 100644
index 0000000000..32afdfaec4
--- /dev/null
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp
@@ -0,0 +1,279 @@
+/**
+ * Copyright (c) 2011-2024 libbitcoin developers (see AUTHORS)
+ *
+ * This file is part of libbitcoin.
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see .
+ */
+#ifndef LIBBITCOIN_SYSTEM_HASH_SHA_ALGORITHM_KONSTANT_IPP
+#define LIBBITCOIN_SYSTEM_HASH_SHA_ALGORITHM_KONSTANT_IPP
+
+#include
+
+// [K]onstant adding.
+// ============================================================================
+
+namespace libbitcoin {
+namespace system {
+namespace sha {
+
+// single or expanded (vectorized) buffer
+// ----------------------------------------------------------------------------
+// protected
+
+TEMPLATE
+template
+INLINE constexpr void CLASS::
+konstant(auto& buffer) NOEXCEPT
+{
+ // K is broadcast across blocks.
+ buffer[Round] = f::addc(buffer[Round]);
+}
+
+// wide (vectorized) buffer
+// ----------------------------------------------------------------------------
+// protected
+
+TEMPLATE
+template
+INLINE void CLASS::
+vector_konstant(wbuffer_t& wbuffer) NOEXCEPT
+{
+ constexpr auto r = Round;
+ constexpr auto s = SHA::word_bits;
+ constexpr auto lanes = capacity;
+
+ if constexpr (lanes == 2)
+ {
+ wbuffer[Round] = f::add(wbuffer[Round], set(
+ K::get[r + 1], K::get[r + 0]));
+ }
+ else if constexpr (lanes == 4)
+ {
+ wbuffer[Round] = f::add(wbuffer[Round], set(
+ K::get[r + 3], K::get[r + 2], K::get[r + 1], K::get[r + 0]));
+ }
+ else if constexpr (lanes == 8)
+ {
+ wbuffer[Round] = f::add(wbuffer[Round], set(
+ K::get[r + 7], K::get[r + 6], K::get[r + 5], K::get[r + 4],
+ K::get[r + 3], K::get[r + 2], K::get[r + 1], K::get[r + 0]));
+ }
+ else if constexpr (lanes == 16)
+ {
+ wbuffer[Round] = f::add(wbuffer[Round], set(
+ K::get[r + 15], K::get[r + 14], K::get[r + 13], K::get[r + 12],
+ K::get[r + 11], K::get[r + 10], K::get[r + 9], K::get[r + 8],
+ K::get[r + 7], K::get[r + 6], K::get[r + 5], K::get[r + 4],
+ K::get[r + 3], K::get[r + 2], K::get[r + 1], K::get[r + 0]));
+ }
+}
+
+TEMPLATE
+void CLASS::
+vector_konstant(buffer_t& buffer) NOEXCEPT
+{
+ if constexpr (use_x512)
+ {
+ auto& wbuffer = array_cast(buffer);
+ vector_konstant<0>(wbuffer);
+ vector_konstant<1>(wbuffer);
+ vector_konstant<2>(wbuffer);
+ vector_konstant<3>(wbuffer);
+
+ if constexpr (SHA::rounds == 80)
+ {
+ vector_konstant<4>(wbuffer);
+ }
+ }
+ else if constexpr (use_x256)
+ {
+ auto& wbuffer = array_cast(buffer);
+ vector_konstant<0>(wbuffer);
+ vector_konstant<1>(wbuffer);
+ vector_konstant<2>(wbuffer);
+ vector_konstant<3>(wbuffer);
+ vector_konstant<4>(wbuffer);
+ vector_konstant<5>(wbuffer);
+ vector_konstant<6>(wbuffer);
+ vector_konstant<7>(wbuffer);
+
+ if constexpr (SHA::rounds == 80)
+ {
+ vector_konstant<8>(wbuffer);
+ vector_konstant<9>(wbuffer);
+ }
+ }
+ else if constexpr (use_x128)
+ {
+ auto& wbuffer = array_cast(buffer);
+ vector_konstant<0>(wbuffer);
+ vector_konstant<1>(wbuffer);
+ vector_konstant<2>(wbuffer);
+ vector_konstant<3>(wbuffer);
+ vector_konstant<4>(wbuffer);
+ vector_konstant<5>(wbuffer);
+ vector_konstant<6>(wbuffer);
+ vector_konstant<7>(wbuffer);
+ vector_konstant<8>(wbuffer);
+ vector_konstant<9>(wbuffer);
+ vector_konstant<10>(wbuffer);
+ vector_konstant<11>(wbuffer);
+ vector_konstant<12>(wbuffer);
+ vector_konstant<13>(wbuffer);
+ vector_konstant<14>(wbuffer);
+ vector_konstant<15>(wbuffer);
+
+ if constexpr (SHA::rounds == 80)
+ {
+ vector_konstant<16>(wbuffer);
+ vector_konstant<17>(wbuffer);
+ vector_konstant<18>(wbuffer);
+ vector_konstant<19>(wbuffer);
+ }
+ }
+ else
+ {
+ konstant_(buffer);
+ }
+}
+
+// dispatch
+// ----------------------------------------------------------------------------
+// protected
+
+TEMPLATE
+constexpr void CLASS::
+konstant_(auto& buffer) NOEXCEPT
+{
+ konstant<0>(buffer);
+ konstant<1>(buffer);
+ konstant<2>(buffer);
+ konstant<3>(buffer);
+ konstant<4>(buffer);
+ konstant<5>(buffer);
+ konstant<6>(buffer);
+ konstant<7>(buffer);
+ konstant<8>(buffer);
+ konstant<9>(buffer);
+ konstant<10>(buffer);
+ konstant<11>(buffer);
+ konstant<12>(buffer);
+ konstant<13>(buffer);
+ konstant<14>(buffer);
+ konstant<15>(buffer);
+
+ konstant<16>(buffer);
+ konstant<17>(buffer);
+ konstant<18>(buffer);
+ konstant<19>(buffer);
+ konstant<20>(buffer);
+ konstant<21>(buffer);
+ konstant<22>(buffer);
+ konstant<23>(buffer);
+ konstant<24>(buffer);
+ konstant<25>(buffer);
+ konstant<26>(buffer);
+ konstant<27>(buffer);
+ konstant<28>(buffer);
+ konstant<29>(buffer);
+ konstant<30>(buffer);
+ konstant<31>(buffer);
+
+ konstant<32>(buffer);
+ konstant<33>(buffer);
+ konstant<34>(buffer);
+ konstant<35>(buffer);
+ konstant<36>(buffer);
+ konstant<37>(buffer);
+ konstant<38>(buffer);
+ konstant<39>(buffer);
+ konstant<40>(buffer);
+ konstant<41>(buffer);
+ konstant<42>(buffer);
+ konstant<43>(buffer);
+ konstant<44>(buffer);
+ konstant<45>(buffer);
+ konstant<46>(buffer);
+ konstant<47>(buffer);
+
+ konstant<48>(buffer);
+ konstant<49>(buffer);
+ konstant<50>(buffer);
+ konstant<51>(buffer);
+ konstant<52>(buffer);
+ konstant<53>(buffer);
+ konstant<54>(buffer);
+ konstant<55>(buffer);
+ konstant<56>(buffer);
+ konstant<57>(buffer);
+ konstant<58>(buffer);
+ konstant<59>(buffer);
+ konstant<60>(buffer);
+ konstant<61>(buffer);
+ konstant<62>(buffer);
+ konstant<63>(buffer);
+
+ if constexpr (SHA::rounds == 80)
+ {
+ konstant<64>(buffer);
+ konstant<65>(buffer);
+ konstant<66>(buffer);
+ konstant<67>(buffer);
+ konstant<68>(buffer);
+ konstant<69>(buffer);
+ konstant<70>(buffer);
+ konstant<71>(buffer);
+ konstant<72>(buffer);
+ konstant<73>(buffer);
+ konstant<74>(buffer);
+ konstant<75>(buffer);
+ konstant<76>(buffer);
+ konstant<77>(buffer);
+ konstant<78>(buffer);
+ konstant<79>(buffer);
+ }
+}
+
+TEMPLATE
+template
+INLINE constexpr void CLASS::
+konstant(xbuffer_t& xbuffer) NOEXCEPT
+{
+ konstant_(xbuffer);
+}
+
+TEMPLATE
+INLINE constexpr void CLASS::
+konstant(buffer_t& buffer) NOEXCEPT
+{
+ if (std::is_constant_evaluated())
+ {
+ konstant_(buffer);
+ }
+ ////else if constexpr (vector)
+ ////{
+ //// vector_konstant(buffer);
+ ////}
+ else
+ {
+ konstant_(buffer);
+ }
+}
+
+} // namespace sha
+} // namespace system
+} // namespace libbitcoin
+
+#endif
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
index 1255d925bb..06553308d5 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
@@ -35,147 +35,91 @@
namespace libbitcoin {
namespace system {
namespace sha {
+
+// schedule
+// ----------------------------------------------------------------------------
+// protected
TEMPLATE
template
INLINE void CLASS::
-prepare(cbuffer_t& buffer) NOEXCEPT
+prepare_native(wbuffer_t& wbuffer) NOEXCEPT
{
- // K-adding is shifted 16 words, with last 16 added after scheduling.
-
if constexpr (SHA::strength == 160)
{
- ////static_assert(false, "sha160 not implemented");
- }
- else if constexpr (use_neon)
- {
- ////static_assert(false, "neon not implemented");
+ if constexpr (use_neon)
+ {
+ }
+ else if constexpr (use_shani)
+ {
+ }
}
- else
+ else if constexpr (SHA::strength == 256)
{
- static_assert(SHA::strength == 256);
-
- constexpr auto r1 = Round - 1;
- constexpr auto r2 = sub1(r1);
- constexpr auto r3 = sub1(r2);
- constexpr auto r4 = sub1(r3);
- constexpr auto k0 = Round * 4 - 16;
- constexpr auto k1 = add1(k0);
- constexpr auto k2 = add1(k1);
- constexpr auto k3 = add1(k2);
-
- buffer[Round] = mm_sha256msg2_epu32
+ if constexpr (use_neon)
+ {
+ }
+ else if constexpr (use_shani)
+ {
+ wbuffer[Round] = mm_sha256msg2_epu32
(
mm_add_epi32
(
mm_alignr_epi8
(
- buffer[r1], buffer[r2], SHA::word_bytes
+ wbuffer[Round - 1], wbuffer[Round - 2], SHA::word_bytes
),
mm_sha256msg1_epu32
(
- buffer[r4], buffer[r3]
+ wbuffer[Round - 4], wbuffer[Round - 3]
)
),
- buffer[r1]
- );
-
- buffer[r4] = mm_add_epi32
- (
- buffer[r4],
- mm_set_epi32(K::get[k3], K::get[k2], K::get[k1], K::get[k0])
+ wbuffer[Round - 1]
);
+ }
}
}
TEMPLATE
INLINE void CLASS::
-add_k(cbuffer_t& buffer) NOEXCEPT
+schedule(wbuffer_t& wbuffer) NOEXCEPT
{
- // Add K to last 16 words.
- // TODO: Consolidated K-adding can be performed in 4/8/16 lanes.
- constexpr auto k = SHA::rounds - SHA::block_words;
- constexpr auto r = k / native_lanes;
-
- buffer[r + 0] = mm_add_epi32
- (
- buffer[r + 0],
- mm_set_epi32(
- K::get[k + 3], K::get[k + 2],
- K::get[k + 1], K::get[k + 0])
- );
-
- buffer[r + 1] = mm_add_epi32
- (
- buffer[r + 1],
- mm_set_epi32(
- K::get[k + 7], K::get[k + 6],
- K::get[k + 5], K::get[k + 4])
- );
-
- buffer[r + 2] = mm_add_epi32
- (
- buffer[r + 2],
- mm_set_epi32(
- K::get[k + 11], K::get[k + 10],
- K::get[k + 9], K::get[k + 8])
- );
-
- buffer[r + 3] = mm_add_epi32
- (
- buffer[r + 3],
- mm_set_epi32(
- K::get[k + 15], K::get[k + 14],
- K::get[k + 13], K::get[k + 12])
- );
-}
+ prepare_native<4>(wbuffer);
+ prepare_native<5>(wbuffer);
+ prepare_native<6>(wbuffer);
+ prepare_native<7>(wbuffer);
+ prepare_native<8>(wbuffer);
+ prepare_native<9>(wbuffer);
+ prepare_native<10>(wbuffer);
+ prepare_native<11>(wbuffer);
+ prepare_native<12>(wbuffer);
+ prepare_native<13>(wbuffer);
+ prepare_native<14>(wbuffer);
+ prepare_native<15>(wbuffer);
-TEMPLATE
-INLINE void CLASS::
-schedule(cbuffer_t& buffer) NOEXCEPT
-{
- auto& cbuffer = array_cast(buffer);
-
- prepare<4>(cbuffer);
- prepare<5>(cbuffer);
- prepare<6>(cbuffer);
- prepare<7>(cbuffer);
- prepare<8>(cbuffer);
- prepare<9>(cbuffer);
- prepare<10>(cbuffer);
- prepare<11>(cbuffer);
- prepare<12>(cbuffer);
- prepare<13>(cbuffer);
- prepare<14>(cbuffer);
- prepare<15>(cbuffer);
-
- ////if constexpr (SHA::rounds == 80)
- ////{
- //// prepare<16>(buffer);
- //// prepare<17>(buffer);
- //// prepare<18>(buffer);
- //// prepare<19>(buffer);
- ////}
+ if constexpr (SHA::rounds == 80)
+ {
+ prepare_native<16>(wbuffer);
+ prepare_native<17>(wbuffer);
+ prepare_native<18>(wbuffer);
+ prepare_native<19>(wbuffer);
+ }
- add_k(buffer);
+ konstant(array_cast(wbuffer));
}
-// schedule
-// ----------------------------------------------------------------------------
-// protected
-
TEMPLATE
INLINE void CLASS::
schedule_native(buffer_t& buffer) NOEXCEPT
{
// neon and sha160 not yet implemented, sha512 is not native.
- if constexpr (SHA::strength == 160 || SHA::strength == 512 || use_neon)
+ if constexpr (SHA::strength == 256 && !use_neon)
{
- schedule_(buffer);
+ schedule(array_cast(buffer));
}
else
{
- schedule(array_cast(buffer));
+ schedule_(buffer);
}
}
@@ -192,6 +136,134 @@ schedule_native(xbuffer_t& xbuffer) NOEXCEPT
// ----------------------------------------------------------------------------
// protected
+TEMPLATE
+template
+INLINE void CLASS::
+round_native(wstate_t& state,
+ const wbuffer_t& wk) NOEXCEPT
+{
+ if constexpr (SHA::strength == 160)
+ {
+ if constexpr (use_neon)
+ {
+ }
+ else if constexpr (use_shani)
+ {
+ }
+ }
+ else if constexpr (SHA::strength == 256)
+ {
+ if constexpr (use_neon)
+ {
+ }
+ else if constexpr (use_shani)
+ {
+ // Process wk[Round][0..1], [HGDC][FEBA] (initial state)
+ state[1] = mm_sha256rnds2_epu32(state[1], state[0], wk[Round]);
+
+ // Process wk[Round][2..3] (shifted down)
+ state[0] = mm_sha256rnds2_epu32(state[0], state[1],
+ mm_shuffle_epi32(wk[Round], 0x0e));
+ }
+ }
+}
+
+TEMPLATE
+INLINE void CLASS::
+summarize_native(wstate_t& out,
+ const wstate_t& in) NOEXCEPT
+{
+ if constexpr (SHA::strength == 160)
+ {
+ if constexpr (use_neon)
+ {
+ }
+ else if constexpr (use_shani)
+ {
+ }
+ }
+ else if constexpr (SHA::strength == 256)
+ {
+ if constexpr (use_neon)
+ {
+ }
+ else if constexpr (use_shani)
+ {
+ out[0] = mm_add_epi32(out[0], in[0]);
+ out[1] = mm_add_epi32(out[1], in[1]);
+ }
+ }
+}
+
+TEMPLATE
+INLINE void CLASS::
+shuffle(wstate_t& wstate) NOEXCEPT
+{
+ // Change wstate to mm_sha256rnds2_epu32 expected form:
+ // [ABCD][EFGH] -> [FEBA][HGDC] (ordered low to high).
+ const auto t1 = mm_shuffle_epi32(wstate[0], 0xb1);
+ const auto t2 = mm_shuffle_epi32(wstate[1], 0x1b);
+ wstate[0] = mm_alignr_epi8(t1, t2, 8);
+ wstate[1] = mm_blend_epi16(t2, t1, 15);
+}
+
+TEMPLATE
+INLINE void CLASS::
+unshuffle(wstate_t& wstate) NOEXCEPT
+{
+ // Restore wstate to normal form:
+ // [FEBA][HGDC] -> [ABCD][EFGH] (ordered low to high).
+ const auto t1 = mm_shuffle_epi32(wstate[0], 0x1b);
+ const auto t2 = mm_shuffle_epi32(wstate[1], 0xb1);
+ wstate[0] = mm_blend_epi16(t1, t2, 15);
+ wstate[1] = mm_alignr_epi8(t2, t1, 8);
+}
+
+TEMPLATE
+template
+INLINE void CLASS::
+compress_native(wstate_t& wstate,
+ const wbuffer_t& wbuffer) NOEXCEPT
+{
+ // Shuffle and unshuffle can be done outside of all blocks, but this would
+ // leave state in a non-normal form, so presently absorbing that cost.
+ shuffle(wstate);
+
+ // This is a copy.
+ const auto start = wstate;
+
+ round_native< 0, Lane>(wstate, wbuffer);
+ round_native< 1, Lane>(wstate, wbuffer);
+ round_native< 2, Lane>(wstate, wbuffer);
+ round_native< 3, Lane>(wstate, wbuffer);
+ round_native< 4, Lane>(wstate, wbuffer);
+ round_native< 5, Lane>(wstate, wbuffer);
+ round_native< 6, Lane>(wstate, wbuffer);
+ round_native< 7, Lane>(wstate, wbuffer);
+ round_native< 8, Lane>(wstate, wbuffer);
+ round_native< 9, Lane>(wstate, wbuffer);
+ round_native<10, Lane>(wstate, wbuffer);
+ round_native<11, Lane>(wstate, wbuffer);
+ round_native<12, Lane>(wstate, wbuffer);
+ round_native<13, Lane>(wstate, wbuffer);
+ round_native<14, Lane>(wstate, wbuffer);
+ round_native<15, Lane>(wstate, wbuffer);
+
+ if constexpr (SHA::rounds == 80)
+ {
+ round_native<16, Lane>(wstate, wbuffer);
+ round_native<17, Lane>(wstate, wbuffer);
+ round_native<18, Lane>(wstate, wbuffer);
+ round_native<19, Lane>(wstate, wbuffer);
+ }
+
+ // This is just a vectorized version of summarize().
+ summarize_native(wstate, start);
+
+ // See above comments on shuffle().
+ unshuffle(wstate);
+}
+
TEMPLATE
template
INLINE void CLASS::
@@ -216,8 +288,18 @@ template
INLINE void CLASS::
compress_native(state_t& state, const buffer_t& buffer) NOEXCEPT
{
- // TODO: Single block compression.
- compress_(state, buffer);
+ // TODO: debug.
+ // TODO: sha160 state is too small to array cast into two xwords.
+ // neon and sha160 not yet implemented, sha512 is not native.
+ ////if constexpr (SHA::strength == 256 && !use_neon)
+ ////{
+ //// compress_native(array_cast(state),
+ //// array_cast(buffer));
+ ////}
+ ////else
+ {
+ compress_(state, buffer);
+ }
}
} // namespace sha
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp
index cf4b99ae8c..f2796d4b5c 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp
@@ -36,63 +36,22 @@ template
INLINE constexpr void CLASS::
prepare(auto& buffer) NOEXCEPT
{
- // K-adding is shifted 16 words, with last 16 added after scheduling.
constexpr auto s = SHA::word_bits;
if constexpr (SHA::strength == 160)
{
- constexpr auto r03 = Round - 3;
- constexpr auto r08 = Round - 8;
- constexpr auto r14 = Round - 14;
- constexpr auto r16 = Round - 16;
-
buffer[Round] = f::rol<1, s>(f::xor_(
- f::xor_(buffer[r16], buffer[r14]),
- f::xor_(buffer[r08], buffer[r03])));
-
- buffer[r16] = f::addc(buffer[r16]);
+ f::xor_(buffer[Round - 16], buffer[Round - 14]),
+ f::xor_(buffer[Round - 8], buffer[Round - 3])));
}
else
{
- constexpr auto r02 = Round - 2;
- constexpr auto r07 = Round - 7;
- constexpr auto r15 = Round - 15;
- constexpr auto r16 = Round - 16;
-
buffer[Round] = f::add(
- f::add(buffer[r16], sigma0(buffer[r15])),
- f::add(buffer[r07], sigma1(buffer[r02])));
-
- buffer[r16] = f::addc(buffer[r16]);
+ f::add(buffer[Round - 16], sigma0(buffer[Round - 15])),
+ f::add(buffer[Round - 7], sigma1(buffer[Round - 2])));
}
}
-TEMPLATE
-INLINE constexpr void CLASS::
-add_k(auto& buffer) NOEXCEPT
-{
- // Add K to last 16 words.
- // TODO: Consolidated K-adding can be performed in 4/8/16 lanes.
- constexpr auto s = SHA::word_bits;
- constexpr auto r = SHA::rounds - SHA::block_words;
- buffer[r + 0] = f::addc(buffer[r + 0]);
- buffer[r + 1] = f::addc(buffer[r + 1]);
- buffer[r + 2] = f::addc(buffer[r + 2]);
- buffer[r + 3] = f::addc(buffer[r + 3]);
- buffer[r + 4] = f::addc(buffer[r + 4]);
- buffer[r + 5] = f::addc(buffer[r + 5]);
- buffer[r + 6] = f::addc(buffer[r + 6]);
- buffer[r + 7] = f::addc(buffer[r + 7]);
- buffer[r + 8] = f::addc(buffer[r + 8]);
- buffer[r + 9] = f::addc(buffer[r + 9]);
- buffer[r + 10] = f::addc(buffer[r + 10]);
- buffer[r + 11] = f::addc(buffer[r + 11]);
- buffer[r + 12] = f::addc(buffer[r + 12]);
- buffer[r + 13] = f::addc(buffer[r + 13]);
- buffer[r + 14] = f::addc(buffer[r + 14]);
- buffer[r + 15] = f::addc(buffer[r + 15]);
-}
-
TEMPLATE
constexpr void CLASS::
schedule_(auto& buffer) NOEXCEPT
@@ -168,7 +127,7 @@ schedule_(auto& buffer) NOEXCEPT
prepare<79>(buffer);
}
- add_k(buffer);
+ konstant(buffer);
}
TEMPLATE
diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp
index 5f82112743..e6f880dd64 100644
--- a/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp
+++ b/include/bitcoin/system/impl/hash/sha/algorithm_sigma.ipp
@@ -49,12 +49,10 @@ prepare1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT
constexpr auto r16 = Round - 16;
constexpr auto s = SHA::word_bits;
- // buffer[r07 + 7] is buffer[Round + 0]
- // This is why sigma0 is limited to 8 lanes (vs 16).
+ // buffer[r07 + 7] is buffer[Round + 0], so sigma0 is limited to 8 lanes.
buffer[Round + Offset] = f::add(
f::add(buffer[r16 + Offset], get(xsigma0)),
f::add(buffer[r07 + Offset], sigma1(buffer[r02 + Offset])));
- buffer[r16 + Offset] = f::addc(buffer[r16 + Offset]);
}
TEMPLATE
@@ -95,7 +93,7 @@ schedule_sigma(xbuffer_t& xbuffer) NOEXCEPT
}
TEMPLATE
-INLINE void CLASS::
+void CLASS::
schedule_sigma(buffer_t& buffer) NOEXCEPT
{
if constexpr (SHA::strength != 160 && have_lanes())
@@ -113,7 +111,7 @@ schedule_sigma(buffer_t& buffer) NOEXCEPT
prepare8<72>(buffer);
}
- add_k(buffer);
+ konstant(buffer);
}
else
{
diff --git a/include/bitcoin/system/intrinsics/xcpu/defines.hpp b/include/bitcoin/system/intrinsics/xcpu/defines.hpp
index 79e5b06334..7594327053 100644
--- a/include/bitcoin/system/intrinsics/xcpu/defines.hpp
+++ b/include/bitcoin/system/intrinsics/xcpu/defines.hpp
@@ -133,6 +133,8 @@ BC_POP_WARNING()
#define mm_extract_epi32(a, Lane) {}
#define mm_extract_epi64(a, Lane) {}
#define mm_shuffle_epi8(a, mask) (a)
+ #define mm_shuffle_epi32(a, mask) (a)
+ #define mm_blend_epi16(a, b, mask) (a)
#define mm_load_si128(a) {}
#define mm_loadu_si128(a) {}
#define mm_store_si128(memory, a)
@@ -167,6 +169,8 @@ BC_POP_WARNING()
#define mm_extract_epi32(a, Lane) _mm_extract_epi32(a, Lane)
#define mm_extract_epi64(a, Lane) _mm_extract_epi64(a, Lane) // undefined for X32
#define mm_shuffle_epi8(a, mask) _mm_shuffle_epi8(a, mask)
+ #define mm_shuffle_epi32(a, mask) _mm_shuffle_epi32(a, mask)
+ #define mm_blend_epi16(a, b, mask) _mm_blend_epi16(a, b, mask)
#define mm_load_si128(a) _mm_load_si128(a)
#define mm_loadu_si128(a) _mm_loadu_si128(a)
#define mm_store_si128(memory, a) _mm_store_si128(memory, a)
diff --git a/src/hash/vectorization/sha256_1_native.cpp b/src/hash/vectorization/sha256_1_native.cpp
deleted file mode 100644
index bcb0b7b96e..0000000000
--- a/src/hash/vectorization/sha256_1_native.cpp
+++ /dev/null
@@ -1,302 +0,0 @@
-/**
- * Copyright (c) 2011-2023 libbitcoin developers (see AUTHORS)
- *
- * This file is part of libbitcoin.
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU Affero General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU Affero General Public License for more details.
- *
- * You should have received a copy of the GNU Affero General Public License
- * along with this program. If not, see .
- */
-#include
-#include
-#include
-#include
-#include
-#include
-
-// sha256: movable-type.co.uk/scripts/sha256.html
-// Use inline vs. constexpr to obtain intrinsic std::rotr.
-
-namespace libbitcoin {
-namespace system {
-namespace sha256 {
-
-#if defined (DISABLED)
-
-constexpr auto choice(auto x, auto y, auto z) NOEXCEPT
-{
- return (x & (y ^ z)) ^ z;
-}
-
-constexpr auto majority(auto x, auto y, auto z) NOEXCEPT
-{
- return (x & (y | z)) | (y & z);
-}
-
-inline auto SIGMA0(auto a) NOEXCEPT
-{
- return std::rotr(a, 2) ^ std::rotr(a, 13) ^ std::rotr(a, 22);
-}
-
-inline auto SIGMA1(auto a) NOEXCEPT
-{
- return std::rotr(a, 6) ^ std::rotr(a, 11) ^ std::rotr(a, 25);
-}
-
-inline auto sigma0(auto a) NOEXCEPT
-{
- return std::rotr(a, 7) ^ std::rotr(a, 18) ^ (a >> 3);
-}
-
-inline auto sigma1(auto a) NOEXCEPT
-{
- return std::rotr(a, 17) ^ std::rotr(a, 19) ^ (a >> 10);
-}
-
-inline void round(auto a, auto b, auto c, auto& out_d, auto e, auto f, auto g,
- auto& out_h, auto k) NOEXCEPT
-{
- const auto t0 = SIGMA1(e) + choice(e, f, g) + out_h + k;
- const auto t1 = SIGMA0(a) + majority(a, b, c);
- out_d += t0;
- out_h = t0 + t1;
-}
-
-template
-constexpr void round(state& out, const buffer& in) NOEXCEPT
-{
- BC_PUSH_WARNING(NO_ARRAY_INDEXING)
- round(
- out[(block_size + 0 - Round) % state_size],
- out[(block_size + 1 - Round) % state_size],
- out[(block_size + 2 - Round) % state_size],
- out[(block_size + 3 - Round) % state_size], // in/out
- out[(block_size + 4 - Round) % state_size],
- out[(block_size + 5 - Round) % state_size],
- out[(block_size + 6 - Round) % state_size],
- out[(block_size + 7 - Round) % state_size], // in/out
- in[Round] + K);
- BC_POP_WARNING()
-}
-
-template
-inline void set(buffer& out) NOEXCEPT
-{
- BC_PUSH_WARNING(NO_ARRAY_INDEXING)
- out[Offset] =
- sigma1(out[Offset - 2]) + out[Offset - 7] +
- sigma0(out[Offset - 15]) + out[Offset - 16];
- BC_POP_WARNING()
-}
-
-inline void expand48(buffer& out) NOEXCEPT
-{
- set<16>(out);
- set<17>(out);
- set<18>(out);
- set<19>(out);
- set<20>(out);
- set<21>(out);
- set<22>(out);
- set<23>(out);
- set<24>(out);
- set<25>(out);
- set<26>(out);
- set<27>(out);
- set<28>(out);
- set<29>(out);
- set<30>(out);
- set<31>(out);
- set<32>(out);
- set<33>(out);
- set<34>(out);
- set<35>(out);
- set<36>(out);
- set<37>(out);
- set<38>(out);
- set<39>(out);
- set<40>(out);
- set<41>(out);
- set<42>(out);
- set<43>(out);
- set<44>(out);
- set<45>(out);
- set<46>(out);
- set<47>(out);
- set<48>(out);
- set<49>(out);
- set<50>(out);
- set<51>(out);
- set<52>(out);
- set<53>(out);
- set<54>(out);
- set<55>(out);
- set<56>(out);
- set<57>(out);
- set<58>(out);
- set<59>(out);
- set<60>(out);
- set<61>(out);
- set<62>(out);
- set<63>(out);
-}
-
-inline void rounds64(state& out, const buffer& in) NOEXCEPT
-{
- round< 0, 0x428a2f98>(out, in);
- round< 1, 0x71374491>(out, in);
- round< 2, 0xb5c0fbcf>(out, in);
- round< 3, 0xe9b5dba5>(out, in);
- round< 4, 0x3956c25b>(out, in);
- round< 5, 0x59f111f1>(out, in);
- round< 6, 0x923f82a4>(out, in);
- round< 7, 0xab1c5ed5>(out, in);
- round< 8, 0xd807aa98>(out, in);
- round< 9, 0x12835b01>(out, in);
- round<10, 0x243185be>(out, in);
- round<11, 0x550c7dc3>(out, in);
- round<12, 0x72be5d74>(out, in);
- round<13, 0x80deb1fe>(out, in);
- round<14, 0x9bdc06a7>(out, in);
- round<15, 0xc19bf174>(out, in);
- round<16, 0xe49b69c1>(out, in);
- round<17, 0xefbe4786>(out, in);
- round<18, 0x0fc19dc6>(out, in);
- round<19, 0x240ca1cc>(out, in);
- round<20, 0x2de92c6f>(out, in);
- round<21, 0x4a7484aa>(out, in);
- round<22, 0x5cb0a9dc>(out, in);
- round<23, 0x76f988da>(out, in);
- round<24, 0x983e5152>(out, in);
- round<25, 0xa831c66d>(out, in);
- round<26, 0xb00327c8>(out, in);
- round<27, 0xbf597fc7>(out, in);
- round<28, 0xc6e00bf3>(out, in);
- round<29, 0xd5a79147>(out, in);
- round<30, 0x06ca6351>(out, in);
- round<31, 0x14292967>(out, in);
- round<32, 0x27b70a85>(out, in);
- round<33, 0x2e1b2138>(out, in);
- round<34, 0x4d2c6dfc>(out, in);
- round<35, 0x53380d13>(out, in);
- round<36, 0x650a7354>(out, in);
- round<37, 0x766a0abb>(out, in);
- round<38, 0x81c2c92e>(out, in);
- round<39, 0x92722c85>(out, in);
- round<40, 0xa2bfe8a1>(out, in);
- round<41, 0xa81a664b>(out, in);
- round<42, 0xc24b8b70>(out, in);
- round<43, 0xc76c51a3>(out, in);
- round<44, 0xd192e819>(out, in);
- round<45, 0xd6990624>(out, in);
- round<46, 0xf40e3585>(out, in);
- round<47, 0x106aa070>(out, in);
- round<48, 0x19a4c116>(out, in);
- round<49, 0x1e376c08>(out, in);
- round<50, 0x2748774c>(out, in);
- round<51, 0x34b0bcb5>(out, in);
- round<52, 0x391c0cb3>(out, in);
- round<53, 0x4ed8aa4a>(out, in);
- round<54, 0x5b9cca4f>(out, in);
- round<55, 0x682e6ff3>(out, in);
- round<56, 0x748f82ee>(out, in);
- round<57, 0x78a5636f>(out, in);
- round<58, 0x84c87814>(out, in);
- round<59, 0x8cc70208>(out, in);
- round<60, 0x90befffa>(out, in);
- round<61, 0xa4506ceb>(out, in);
- round<62, 0xbef9a3f7>(out, in);
- round<63, 0xc67178f2>(out, in);
-}
-
-inline void summary8(state& out, const state& in) NOEXCEPT
-{
- BC_PUSH_WARNING(NO_ARRAY_INDEXING)
- out[0] += in[0];
- out[1] += in[1];
- out[2] += in[2];
- out[3] += in[3];
- out[4] += in[4];
- out[5] += in[5];
- out[6] += in[6];
- out[7] += in[7];
- BC_POP_WARNING()
-}
-
-inline void copying8(buffer& out, const state& in) NOEXCEPT
-{
- auto& to = array_cast(out);
- to = in;
-}
-
-inline void copyin64(buffer& out, const buffer& in) NOEXCEPT
-{
- out = in;
-}
-
-inline void bigend16(buffer& out, const block& in) NOEXCEPT
-{
- constexpr auto size = block_size / sizeof(uint32_t);
- auto& from = array_cast(in);
- auto& to = array_cast(out);
- from_big_endians(to, from);
-}
-
-inline void bigend08(digest& out, const state& in) NOEXCEPT
-{
- auto& to = array_cast(out);
- to_big_endians(to, in);
-}
-
-inline void padding8(buffer& out) NOEXCEPT
-{
- auto& to = array_cast(out);
- to = pad32;
-}
-
-// This requires 32 more words of memory than a circular variables buffer.
-// According to FIPS180 this is more performant given no memory constraint.
-void hash_native(state& state, const block& block) NOEXCEPT
-{
- buffer words;
- const sha256::state start{ state };
- bigend16(words, block);
- expand48(words);
- rounds64(state, words);
- summary8(state, start);
-}
-
-// TODO: template with sized array of blocks.
-void hash_native(state& state, const block1& blocks) NOEXCEPT
-{
- buffer words;
-
- for (auto& block: blocks)
- {
- const sha256::state start{ state };
- bigend16(words, block);
- expand48(words);
- rounds64(state, words);
- summary8(state, start);
- }
-}
-
-void hash_finalize(digest& digest, const state& state) NOEXCEPT
-{
- bigend08(digest, state);
-}
-
-#endif
-
-} // namespace sha256
-} // namespace system
-} // namespace libbitcoin
diff --git a/src/hash/vectorization/sha256_2_shani.cpp b/src/hash/vectorization/sha256_2_shani.cpp
deleted file mode 100644
index 54a789e358..0000000000
--- a/src/hash/vectorization/sha256_2_shani.cpp
+++ /dev/null
@@ -1,252 +0,0 @@
-// Based on:
-// sha256-x86.c - Intel SHA extensions using C intrinsics
-// Written and place in public domain by Jeffrey Walton
-// Based on code from Intel, and by Sean Gulley for the miTLS project.
-
-#include
-#include
-#include
-#include
-#include
-
-namespace libbitcoin {
-namespace system {
-namespace sha256 {
-
-#if defined (DISABLED)
-
-#if !defined(HAVE_XCPU)
-
-void hash_shani(state&, const block1&) NOEXCEPT
-{
- BC_ASSERT_MSG(false, "hash_shani undefined");
-}
-
-#else
-
-// See sse41 for defines.
-using namespace i128;
-
-#ifndef VISUAL
-
-alignas(xint128_t) constexpr uint8_t mask[sizeof(xint128_t)]
-{
- 0x03, 0x02, 0x01, 0x00, // 0x00010203ul
- 0x07, 0x06, 0x05, 0x04, // 0x04050607ul
- 0x0b, 0x0a, 0x09, 0x08, // 0x08090a0bul
- 0x0f, 0x0e, 0x0d, 0x0c // 0x0c0d0e0ful
-};
-
-// Half of little endian IV.
-alignas(xint128_t) constexpr uint8_t initial0[sizeof(xint128_t)]
-{
- 0x8c, 0x68, 0x05, 0x9b, // 0x9b05688cul [5]
- 0x7f, 0x52, 0x0e, 0x51, // 0x510e527ful [4]
- 0x85, 0xae, 0x67, 0xbb, // 0xbb67ae85ul [1]
- 0x67, 0xe6, 0x09, 0x6a // 0x6a09e667ul [0]
-};
-
-// Half of little endian IV.
-alignas(xint128_t) constexpr uint8_t initial1[sizeof(xint128_t)]
-{
- 0x19, 0xcd, 0xe0, 0x5b, // 0x5be0cd19ul [7]
- 0xab, 0xd9, 0x83, 0x1f, // 0x1f83d9abul [6]
- 0x3a, 0xf5, 0x4f, 0xa5, // 0xa54ff53aul [3]
- 0x72, 0xf3, 0x6e, 0x3c // 0x3c6ef372ul [2]
-};
-
-// load/store i128
-// ----------------------------------------------------------------------------
-
-// Loading is just an array_cast into the buffer.
-
-// Aligned only, do not use with unaligned values.
-xint128_t load32x4a(const uint8_t& bytes) NOEXCEPT
-{
- return _mm_load_si128(pointer_cast(&bytes));
-}
-
-xint128_t load32x4u(const uint32_t& bytes) NOEXCEPT
-{
- return _mm_loadu_si128(pointer_cast(&bytes));
-}
-void store32x4u(uint8_t& bytes, xint128_t value) NOEXCEPT
-{
- _mm_storeu_si128(pointer_cast(&bytes), value);
-}
-
-// Aligned but for public data?
-xint128_t load(const uint8_t& data) NOEXCEPT
-{
- static const auto flipper = load32x4a(mask[0]);
- return i128::shuffle(load32x4a(data), flipper);
-}
-
-// Aligned but for public data?
-void store(uint8_t& out, xint128_t value) NOEXCEPT
-{
- static const auto flipper = load32x4a(mask[0]);
- store32x4u(out, i128::shuffle(value, flipper));
-}
-
-// sha256
-// ----------------------------------------------------------------------------
-// intel.com/content/www/us/en/developer/articles/technical/intel-sha-extensions.html
-// intel.com/content/dam/develop/external/us/en/documents/intel-sha-extensions-white-paper.pdf
-
-// _mm_sha256rnds2_epu32 is power of sha-ni, round reduction to 4 lane native.
-// But this needs to be applied to preparation as well, to retain that model.
-// Otherwise the round dispath must be modified to use the circular var queue.
-// And this changes the size of buffer_t (to words_t).
-// _mm_sha1rnds4_epu32 is provided for sha160. This would optimize only script
-// evaluation of the uncommon opcode, but will be almost free to implement.
-
-// _mm_sha256rnds2_epu32 performs two rounds, so this is four.
-void round(xint128_t& s0, xint128_t& s1, uint64_t k1, uint64_t k0) NOEXCEPT
-{
- // This is actually m + k precomputed for fixed single block padding.
- const auto value = set(k1, k0);
-
- s1 = _mm_sha256rnds2_epu32(s1, s0, value);
- s0 = _mm_sha256rnds2_epu32(s0, s1, i128::shuffle<0x0e>(value));
-}
-
-void round(xint128_t& s0, xint128_t& s1, xint128_t m, uint64_t k1, uint64_t k0) NOEXCEPT
-{
- // The sum m + k is computed in the message schedule.
- const auto value = sum(m, set(k1, k0));
-
- s1 = _mm_sha256rnds2_epu32(s1, s0, value);
- s0 = _mm_sha256rnds2_epu32(s0, s1, i128::shuffle<0x0e>(value));
-}
-
-void shift_message(xint128_t& out, xint128_t m) NOEXCEPT
-{
- out = _mm_sha256msg1_epu32(out, m);
-}
-
-void shift_message(xint128_t m0, xint128_t m1, xint128_t& out) NOEXCEPT
-{
- constexpr auto shift = sizeof(uint32_t);
- out = _mm_sha256msg2_epu32(sum(out, align_right(m1, m0)), m1);
-}
-
-void shift_messages(xint128_t& out0, xint128_t m,
- xint128_t& out1) NOEXCEPT
-{
- shift_message(out0, m, out1);
- shift_message(out0, m);
-}
-
-// endianness
-// ----------------------------------------------------------------------------
-
-// Endianness of the buffer/digest should be computed outside of hash function.
-// Given the full mutable buffer, can be parallallized and vectorized in place.
-
-void shuffle(xint128_t& s0, xint128_t& s1) NOEXCEPT
-{
- const auto t1 = i128::shuffle<0xb1>(s0);
- const auto t2 = i128::shuffle<0x1b>(s1);
- s0 = align_right<8>(t1, t2);
- s1 = blend<15>(t2, t1);
-}
-
-void unshuffle(xint128_t& s0, xint128_t& s1) NOEXCEPT
-{
- const xint128_t t1 = i128::shuffle<0x1b>(s0);
- const xint128_t t2 = i128::shuffle<0xb1>(s1);
- s0 = blend<15>(t1, t2);
- s1 = align_right<8>(t2, t1);
-}
-
-#endif
-
-////void hash_shani(state& state, const blocks& blocks) NOEXCEPT;
-void hash_shani(state& state, const block1& blocks) NOEXCEPT
-{
- BC_PUSH_WARNING(NO_ARRAY_INDEXING)
-
- xint128_t m0, m1, m2, m3, so0, so1;
-
- // From unaligned (public).
- auto s0 = load32x4u(state[0]);
- auto s1 = load32x4u(state[4]);
-
- // state/SHA is LE, so why bswap?
- // must be treating state as digest.
- shuffle(s0, s1);
-
- // Each round is four sha rounds.
- // One block in four lanes.
- for (auto& block: blocks)
- {
- // Remember old state.
- so0 = s0;
- so1 = s1;
-
- // One block loaded 16 bytes (1 uint128) per each of 4 messages.
- // load data and transform.
- m0 = load(block[0]);
-
- // shift message computes next 4 messages from prevous 4.
- // K: 0xe9b5dba5[3] 0xb5c0fbcfull[2] 0x71374491[1] 0x428a2f98ull[0]
- round(s0, s1, m0, 0xe9b5dba5b5c0fbcfull, 0x71374491428a2f98ull);
- m1 = load(block[16]);
- round(s0, s1, m1, 0xab1c5ed5923f82a4ull, 0x59f111f13956c25bull);
- shift_message(m0, m1); // new m0 from m1
- m2 = load(block[32]);
- round(s0, s1, m2, 0x550c7dc3243185beull, 0x12835b01d807aa98ull);
- shift_message(m1, m2);
- m3 = load(block[48]);
-
- // shift messages computes next 4 messages from prevous 8.
- round(s0, s1, m3, 0xc19bf1749bdc06a7ull, 0x80deb1fe72be5d74ull);
- shift_messages(m2, m3, m0);
- round(s0, s1, m0, 0x240ca1cc0fc19dc6ull, 0xefbe4786E49b69c1ull);
- shift_messages(m3, m0, m1);
- round(s0, s1, m1, 0x76f988da5cb0a9dcull, 0x4a7484aa2de92c6full);
- shift_messages(m0, m1, m2);
- round(s0, s1, m2, 0xbf597fc7b00327c8ull, 0xa831c66d983e5152ull);
- shift_messages(m1, m2, m3);
- round(s0, s1, m3, 0x1429296706ca6351ull, 0xd5a79147c6e00bf3ull);
- shift_messages(m2, m3, m0);
- round(s0, s1, m0, 0x53380d134d2c6dfcull, 0x2e1b213827b70a85ull);
- shift_messages(m3, m0, m1);
- round(s0, s1, m1, 0x92722c8581c2c92eull, 0x766a0abb650a7354ull);
- shift_messages(m0, m1, m2);
- round(s0, s1, m2, 0xc76c51A3c24b8b70ull, 0xa81a664ba2bfe8a1ull);
- shift_messages(m1, m2, m3);
- round(s0, s1, m3, 0x106aa070f40e3585ull, 0xd6990624d192e819ull);
- shift_messages(m2, m3, m0);
- round(s0, s1, m0, 0x34b0bcb52748774cull, 0x1e376c0819a4c116ull);
- shift_messages(m3, m0, m1);
- round(s0, s1, m1, 0x682e6ff35b9cca4full, 0x4ed8aa4a391c0cb3ull);
- shift_message(m0, m1, m2);
- round(s0, s1, m2, 0x8cc7020884c87814ull, 0x78a5636f748f82eeull);
- shift_message(m1, m2, m3);
- round(s0, s1, m3, 0xc67178f2bef9A3f7ull, 0xa4506ceb90befffaull);
-
- // Combine with old state.
- s0 = sum(s0, so0);
- s1 = sum(s1, so1);
- }
-
- // state/SHA is LE, so why bswap?
- // must be treating state as digest.
- unshuffle(s0, s1);
-
- // To not aligned.
- store32x4u(state[0], s0);
- store32x4u(state[4], s1);
-
- BC_POP_WARNING()
-}
-
-#endif // HAVE_XCPU
-
-#endif // DISABLED
-
-} // namespace sha256
-} // namespace system
-} // namespace libbitcoin
diff --git a/src/hash/vectorization/sha256_4_neon.cpp b/src/hash/vectorization/sha256_4_neon.cpp
deleted file mode 100644
index 9e3dfa6c65..0000000000
--- a/src/hash/vectorization/sha256_4_neon.cpp
+++ /dev/null
@@ -1,224 +0,0 @@
-// Based on:
-// sha256-arm.c - ARMv8 SHA extensions using C intrinsics
-// Written and placed in public domain by Jeffrey Walton
-// Based on code from ARM, and by Johannes Schneiders, Skip
-// Hovsmith and Barry O'Rourke for the mbedTLS project.
-
-#include
-#include
-#include
-#include
-#include
-
-namespace libbitcoin {
-namespace system {
-namespace sha256 {
-
-#if defined (DISABLED)
-
-#if !defined(HAVE_ARM)
-
-void hash_neon(state&, const block1&) NOEXCEPT
-{
- BC_ASSERT_MSG(false, "hash_neon undefined");
-}
-
-void merkle_neon(digest1&, const block1&) NOEXCEPT
-{
- BC_ASSERT_MSG(false, "merkle_neon undefined");
-}
-
-#else
-
-void hash_neon(state& state, const block1& blocks) NOEXCEPT
-{
- BC_PUSH_WARNING(NO_ARRAY_INDEXING)
-
- constexpr uint32_t k[]
- {
- 0x428A2F98, 0x71374491, 0xB5C0FBCF, 0xE9B5DBA5,
- 0x3956C25B, 0x59F111F1, 0x923F82A4, 0xAB1C5ED5,
- 0xD807AA98, 0x12835B01, 0x243185BE, 0x550C7DC3,
- 0x72BE5D74, 0x80DEB1FE, 0x9BDC06A7, 0xC19BF174,
- 0xE49B69C1, 0xEFBE4786, 0x0FC19DC6, 0x240CA1CC,
- 0x2DE92C6F, 0x4A7484AA, 0x5CB0A9DC, 0x76F988DA,
- 0x983E5152, 0xA831C66D, 0xB00327C8, 0xBF597FC7,
- 0xC6E00BF3, 0xD5A79147, 0x06CA6351, 0x14292967,
- 0x27B70A85, 0x2E1B2138, 0x4D2C6DFC, 0x53380D13,
- 0x650A7354, 0x766A0ABB, 0x81C2C92E, 0x92722C85,
- 0xA2BFE8A1, 0xA81A664B, 0xC24B8B70, 0xC76C51A3,
- 0xD192E819, 0xD6990624, 0xF40E3585, 0x106AA070,
- 0x19A4C116, 0x1E376C08, 0x2748774C, 0x34B0BCB5,
- 0x391C0CB3, 0x4ED8AA4A, 0x5B9CCA4F, 0x682E6FF3,
- 0x748F82EE, 0x78A5636F, 0x84C87814, 0x8CC70208,
- 0x90BEFFFA, 0xA4506CEB, 0xBEF9A3F7, 0xC67178F2,
- };
-
- uint32x4_t temp0, temp1, temp2;
- uint32x4_t state0, state1, abef_save, cdgh_save;
- uint32x4_t message0, message1, message2, message3;
-
- // Load state.
- state0 = vld1q_u32(&state[0]);
- state1 = vld1q_u32(&state[4]);
-
- // Each round is four sha rounds.
- // One block in four lanes.
- for (auto& block: blocks)
- {
- // Save state.
- abef_save = state0;
- cdgh_save = state1;
-
- // Load message.
- message0 = vld1q_u32(pointer_cast(&block[0]));
- message1 = vld1q_u32(pointer_cast(&block[16]));
- message2 = vld1q_u32(pointer_cast(&block[32]));
- message3 = vld1q_u32(pointer_cast(&block[48]));
-
- // Reverse for little endian.
- message0 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(message0)));
- message1 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(message1)));
- message2 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(message2)));
- message3 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(message3)));
-
- temp0 = vaddq_u32(message0, vld1q_u32(&k[0x00]));
-
- // Rounds 0-3.
- message0 = vsha256su0q_u32(message0, message1);
- temp2 = state0;
- temp1 = vaddq_u32(message1, vld1q_u32(&k[0x04]));
- state0 = vsha256hq_u32(state0, state1, temp0);
- state1 = vsha256h2q_u32(state1, temp2, temp0);
- message0 = vsha256su1q_u32(message0, message2, message3);
-
- // Rounds 4-7.
- message1 = vsha256su0q_u32(message1, message2);
- temp2 = state0;
- temp0 = vaddq_u32(message2, vld1q_u32(&k[0x08]));
- state0 = vsha256hq_u32(state0, state1, temp1);
- state1 = vsha256h2q_u32(state1, temp2, temp1);
- message1 = vsha256su1q_u32(message1, message3, message0);
-
- // Rounds 8-11.
- message2 = vsha256su0q_u32(message2, message3);
- temp2 = state0;
- temp1 = vaddq_u32(message3, vld1q_u32(&k[0x0c]));
- state0 = vsha256hq_u32(state0, state1, temp0);
- state1 = vsha256h2q_u32(state1, temp2, temp0);
- message2 = vsha256su1q_u32(message2, message0, message1);
-
- // Rounds 12-15.
- message3 = vsha256su0q_u32(message3, message0);
- temp2 = state0;
- temp0 = vaddq_u32(message0, vld1q_u32(&k[0x10]));
- state0 = vsha256hq_u32(state0, state1, temp1);
- state1 = vsha256h2q_u32(state1, temp2, temp1);
- message3 = vsha256su1q_u32(message3, message1, message2);
-
- // Rounds 16-19.
- message0 = vsha256su0q_u32(message0, message1);
- temp2 = state0;
- temp1 = vaddq_u32(message1, vld1q_u32(&k[0x14]));
- state0 = vsha256hq_u32(state0, state1, temp0);
- state1 = vsha256h2q_u32(state1, temp2, temp0);
- message0 = vsha256su1q_u32(message0, message2, message3);
-
- // Rounds 20-23.
- message1 = vsha256su0q_u32(message1, message2);
- temp2 = state0;
- temp0 = vaddq_u32(message2, vld1q_u32(&k[0x18]));
- state0 = vsha256hq_u32(state0, state1, temp1);
- state1 = vsha256h2q_u32(state1, temp2, temp1);
- message1 = vsha256su1q_u32(message1, message3, message0);
-
- // Rounds 24-27.
- message2 = vsha256su0q_u32(message2, message3);
- temp2 = state0;
- temp1 = vaddq_u32(message3, vld1q_u32(&k[0x1c]));
- state0 = vsha256hq_u32(state0, state1, temp0);
- state1 = vsha256h2q_u32(state1, temp2, temp0);
- message2 = vsha256su1q_u32(message2, message0, message1);
-
- // Rounds 28-31.
- message3 = vsha256su0q_u32(message3, message0);
- temp2 = state0;
- temp0 = vaddq_u32(message0, vld1q_u32(&k[0x20]));
- state0 = vsha256hq_u32(state0, state1, temp1);
- state1 = vsha256h2q_u32(state1, temp2, temp1);
- message3 = vsha256su1q_u32(message3, message1, message2);
-
- // Rounds 32-35.
- message0 = vsha256su0q_u32(message0, message1);
- temp2 = state0;
- temp1 = vaddq_u32(message1, vld1q_u32(&k[0x24]));
- state0 = vsha256hq_u32(state0, state1, temp0);
- state1 = vsha256h2q_u32(state1, temp2, temp0);
- message0 = vsha256su1q_u32(message0, message2, message3);
-
- // Rounds 36-39.
- message1 = vsha256su0q_u32(message1, message2);
- temp2 = state0;
- temp0 = vaddq_u32(message2, vld1q_u32(&k[0x28]));
- state0 = vsha256hq_u32(state0, state1, temp1);
- state1 = vsha256h2q_u32(state1, temp2, temp1);
- message1 = vsha256su1q_u32(message1, message3, message0);
-
- // Rounds 40-43.
- message2 = vsha256su0q_u32(message2, message3);
- temp2 = state0;
- temp1 = vaddq_u32(message3, vld1q_u32(&k[0x2c]));
- state0 = vsha256hq_u32(state0, state1, temp0);
- state1 = vsha256h2q_u32(state1, temp2, temp0);
- message2 = vsha256su1q_u32(message2, message0, message1);
-
- // Rounds 44-47.
- message3 = vsha256su0q_u32(message3, message0);
- temp2 = state0;
- temp0 = vaddq_u32(message0, vld1q_u32(&k[0x30]));
- state0 = vsha256hq_u32(state0, state1, temp1);
- state1 = vsha256h2q_u32(state1, temp2, temp1);
- message3 = vsha256su1q_u32(message3, message1, message2);
-
- // Rounds 48-51.
- temp2 = state0;
- temp1 = vaddq_u32(message1, vld1q_u32(&k[0x34]));
- state0 = vsha256hq_u32(state0, state1, temp0);
- state1 = vsha256h2q_u32(state1, temp2, temp0);
-
- // Rounds 52-55.
- temp2 = state0;
- temp0 = vaddq_u32(message2, vld1q_u32(&k[0x38]));
- state0 = vsha256hq_u32(state0, state1, temp1);
- state1 = vsha256h2q_u32(state1, temp2, temp1);
-
- // Rounds 56-59.
- temp2 = state0;
- temp1 = vaddq_u32(message3, vld1q_u32(&k[0x3c]));
- state0 = vsha256hq_u32(state0, state1, temp0);
- state1 = vsha256h2q_u32(state1, temp2, temp0);
-
- // Rounds 60-63.
- temp2 = state0;
- state0 = vsha256hq_u32(state0, state1, temp1);
- state1 = vsha256h2q_u32(state1, temp2, temp1);
-
- // Combine state.
- state0 = vaddq_u32(state0, abef_save);
- state1 = vaddq_u32(state1, cdgh_save);
- }
-
- // Save state.
- vst1q_u32(&state[0], state0);
- vst1q_u32(&state[4], state1);
-
- BC_POP_WARNING()
-}
-
-#endif // HAVE_ARM
-
-#endif // DISABLED
-
-} // namespace sha256
-} // namespace system
-} // namespace libbitcoin
diff --git a/src/hash/vectorization/sha256_4_sse4.cpp b/src/hash/vectorization/sha256_4_sse4.cpp
deleted file mode 100644
index 76760e6894..0000000000
--- a/src/hash/vectorization/sha256_4_sse4.cpp
+++ /dev/null
@@ -1,1033 +0,0 @@
-//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-//; Copyright (c) 2012, Intel Corporation
-//;
-//; All rights reserved.
-//;
-//; Redistribution and use in source and binary forms, with or without
-//; modification, are permitted provided that the following conditions are
-//; met:
-//;
-//; * Redistributions of source code must retain the above copyright
-//; notice, this list of conditions and the following disclaimer.
-//;
-//; * Redistributions in binary form must reproduce the above copyright
-//; notice, this list of conditions and the following disclaimer in the
-//; documentation and/or other materials provided with the
-//; distribution.
-//;
-//; * Neither the name of the Intel Corporation nor the names of its
-//; contributors may be used to endorse or promote products derived from
-//; this software without specific prior written permission.
-//;
-//;
-//; THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY
-//; EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-//; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-//; PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR
-//; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
-//; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
-//; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
-//; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
-//; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
-//; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
-//; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-//;
-//; This code is described in an Intel White-Paper:
-//; "Fast SHA-256 Implementations on Intel Architecture Processors"
-//;
-//; To find it, surf to https://www.intel.com/p/en_US/embedded
-//; and search for that title.
-//; The paper is expected to be released roughly at the end of April, 2012
-//;
-//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-//; This code schedules 1 blocks at a time, with 4 lanes per block
-//;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-//
-// Port to inline assembly provided by:
-// Copyright (c) 2017-2019 The Bitcoin Core developers
-// Distributed under the MIT software license, see the accompanying
-// file COPYING or http://www.opensource.org/licenses/mit-license.php.
-
-#include
-#include
-#include
-
-namespace libbitcoin {
-namespace system {
-namespace sha256 {
-
-#if defined (DISABLED)
-
-#if defined(HAVE_XASSEMBLY)
-
-void hash_sse41a(state& state, const block1& blocks) NOEXCEPT
-{
- alignas(16) constexpr uint32_t k256[]
- {
- 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
- 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
- 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
- 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
- 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
- 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
- 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
- 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
- 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
- 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
- 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
- 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
- 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
- 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
- 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
- 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
- };
-
- alignas(16) constexpr uint32_t flip_mask[]
- {
- 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
- };
-
- alignas(16) constexpr uint32_t shuffle_00ba[]
- {
- 0x03020100, 0x0b0a0908, 0xffffffff, 0xffffffff
- };
-
- alignas(16) constexpr uint32_t shuffle_dc00[]
- {
- 0xffffffff, 0xffffffff, 0x03020100, 0x0b0a0908
- };
-
- uint32_t a, b, c, d, f, g, h, y0, y1, y2;
- uint64_t table;
- uint64_t input_end, input;
- alignas(16) uint32_t transfer[4];
- auto state_integers = pointer_cast(&state);
- const auto data_bytes = pointer_cast(blocks.data());
-
-#ifndef VISUAL
- __asm__ __volatile__(
- "shl $0x6,%2;"
- "je Ldone_hash_%=;"
- "add %1,%2;"
- "mov %2,%14;"
- "mov (%0),%3;"
- "mov 0x4(%0),%4;"
- "mov 0x8(%0),%5;"
- "mov 0xc(%0),%6;"
- "mov 0x10(%0),%k2;"
- "mov 0x14(%0),%7;"
- "mov 0x18(%0),%8;"
- "mov 0x1c(%0),%9;"
- "movdqa %18,%%xmm12;"
- "movdqa %19,%%xmm10;"
- "movdqa %20,%%xmm11;"
-
- "Lloop0_%=:"
- "lea %17,%13;"
- "movdqu (%1),%%xmm4;"
- "pshufb %%xmm12,%%xmm4;"
- "movdqu 0x10(%1),%%xmm5;"
- "pshufb %%xmm12,%%xmm5;"
- "movdqu 0x20(%1),%%xmm6;"
- "pshufb %%xmm12,%%xmm6;"
- "movdqu 0x30(%1),%%xmm7;"
- "pshufb %%xmm12,%%xmm7;"
- "mov %1,%15;"
- "mov $3,%1;"
-
- "Lloop1_%=:"
- "movdqa 0x0(%13),%%xmm9;"
- "paddd %%xmm4,%%xmm9;"
- "movdqa %%xmm9,%16;"
- "movdqa %%xmm7,%%xmm0;"
- "mov %k2,%10;"
- "ror $0xe,%10;"
- "mov %3,%11;"
- "palignr $0x4,%%xmm6,%%xmm0;"
- "ror $0x9,%11;"
- "xor %k2,%10;"
- "mov %7,%12;"
- "ror $0x5,%10;"
- "movdqa %%xmm5,%%xmm1;"
- "xor %3,%11;"
- "xor %8,%12;"
- "paddd %%xmm4,%%xmm0;"
- "xor %k2,%10;"
- "and %k2,%12;"
- "ror $0xb,%11;"
- "palignr $0x4,%%xmm4,%%xmm1;"
- "xor %3,%11;"
- "ror $0x6,%10;"
- "xor %8,%12;"
- "movdqa %%xmm1,%%xmm2;"
- "ror $0x2,%11;"
- "add %10,%12;"
- "add %16,%12;"
- "movdqa %%xmm1,%%xmm3;"
- "mov %3,%10;"
- "add %12,%9;"
- "mov %3,%12;"
- "pslld $0x19,%%xmm1;"
- "or %5,%10;"
- "add %9,%6;"
- "and %5,%12;"
- "psrld $0x7,%%xmm2;"
- "and %4,%10;"
- "add %11,%9;"
- "por %%xmm2,%%xmm1;"
- "or %12,%10;"
- "add %10,%9;"
- "movdqa %%xmm3,%%xmm2;"
- "mov %6,%10;"
- "mov %9,%11;"
- "movdqa %%xmm3,%%xmm8;"
- "ror $0xe,%10;"
- "xor %6,%10;"
- "mov %k2,%12;"
- "ror $0x9,%11;"
- "pslld $0xe,%%xmm3;"
- "xor %9,%11;"
- "ror $0x5,%10;"
- "xor %7,%12;"
- "psrld $0x12,%%xmm2;"
- "ror $0xb,%11;"
- "xor %6,%10;"
- "and %6,%12;"
- "ror $0x6,%10;"
- "pxor %%xmm3,%%xmm1;"
- "xor %9,%11;"
- "xor %7,%12;"
- "psrld $0x3,%%xmm8;"
- "add %10,%12;"
- "add 4+%16,%12;"
- "ror $0x2,%11;"
- "pxor %%xmm2,%%xmm1;"
- "mov %9,%10;"
- "add %12,%8;"
- "mov %9,%12;"
- "pxor %%xmm8,%%xmm1;"
- "or %4,%10;"
- "add %8,%5;"
- "and %4,%12;"
- "pshufd $0xfa,%%xmm7,%%xmm2;"
- "and %3,%10;"
- "add %11,%8;"
- "paddd %%xmm1,%%xmm0;"
- "or %12,%10;"
- "add %10,%8;"
- "movdqa %%xmm2,%%xmm3;"
- "mov %5,%10;"
- "mov %8,%11;"
- "ror $0xe,%10;"
- "movdqa %%xmm2,%%xmm8;"
- "xor %5,%10;"
- "ror $0x9,%11;"
- "mov %6,%12;"
- "xor %8,%11;"
- "ror $0x5,%10;"
- "psrlq $0x11,%%xmm2;"
- "xor %k2,%12;"
- "psrlq $0x13,%%xmm3;"
- "xor %5,%10;"
- "and %5,%12;"
- "psrld $0xa,%%xmm8;"
- "ror $0xb,%11;"
- "xor %8,%11;"
- "xor %k2,%12;"
- "ror $0x6,%10;"
- "pxor %%xmm3,%%xmm2;"
- "add %10,%12;"
- "ror $0x2,%11;"
- "add 8+%16,%12;"
- "pxor %%xmm2,%%xmm8;"
- "mov %8,%10;"
- "add %12,%7;"
- "mov %8,%12;"
- "pshufb %%xmm10,%%xmm8;"
- "or %3,%10;"
- "add %7,%4;"
- "and %3,%12;"
- "paddd %%xmm8,%%xmm0;"
- "and %9,%10;"
- "add %11,%7;"
- "pshufd $0x50,%%xmm0,%%xmm2;"
- "or %12,%10;"
- "add %10,%7;"
- "movdqa %%xmm2,%%xmm3;"
- "mov %4,%10;"
- "ror $0xe,%10;"
- "mov %7,%11;"
- "movdqa %%xmm2,%%xmm4;"
- "ror $0x9,%11;"
- "xor %4,%10;"
- "mov %5,%12;"
- "ror $0x5,%10;"
- "psrlq $0x11,%%xmm2;"
- "xor %7,%11;"
- "xor %6,%12;"
- "psrlq $0x13,%%xmm3;"
- "xor %4,%10;"
- "and %4,%12;"
- "ror $0xb,%11;"
- "psrld $0xa,%%xmm4;"
- "xor %7,%11;"
- "ror $0x6,%10;"
- "xor %6,%12;"
- "pxor %%xmm3,%%xmm2;"
- "ror $0x2,%11;"
- "add %10,%12;"
- "add 12+%16,%12;"
- "pxor %%xmm2,%%xmm4;"
- "mov %7,%10;"
- "add %12,%k2;"
- "mov %7,%12;"
- "pshufb %%xmm11,%%xmm4;"
- "or %9,%10;"
- "add %k2,%3;"
- "and %9,%12;"
- "paddd %%xmm0,%%xmm4;"
- "and %8,%10;"
- "add %11,%k2;"
- "or %12,%10;"
- "add %10,%k2;"
- "movdqa 0x10(%13),%%xmm9;"
- "paddd %%xmm5,%%xmm9;"
- "movdqa %%xmm9,%16;"
- "movdqa %%xmm4,%%xmm0;"
- "mov %3,%10;"
- "ror $0xe,%10;"
- "mov %k2,%11;"
- "palignr $0x4,%%xmm7,%%xmm0;"
- "ror $0x9,%11;"
- "xor %3,%10;"
- "mov %4,%12;"
- "ror $0x5,%10;"
- "movdqa %%xmm6,%%xmm1;"
- "xor %k2,%11;"
- "xor %5,%12;"
- "paddd %%xmm5,%%xmm0;"
- "xor %3,%10;"
- "and %3,%12;"
- "ror $0xb,%11;"
- "palignr $0x4,%%xmm5,%%xmm1;"
- "xor %k2,%11;"
- "ror $0x6,%10;"
- "xor %5,%12;"
- "movdqa %%xmm1,%%xmm2;"
- "ror $0x2,%11;"
- "add %10,%12;"
- "add %16,%12;"
- "movdqa %%xmm1,%%xmm3;"
- "mov %k2,%10;"
- "add %12,%6;"
- "mov %k2,%12;"
- "pslld $0x19,%%xmm1;"
- "or %8,%10;"
- "add %6,%9;"
- "and %8,%12;"
- "psrld $0x7,%%xmm2;"
- "and %7,%10;"
- "add %11,%6;"
- "por %%xmm2,%%xmm1;"
- "or %12,%10;"
- "add %10,%6;"
- "movdqa %%xmm3,%%xmm2;"
- "mov %9,%10;"
- "mov %6,%11;"
- "movdqa %%xmm3,%%xmm8;"
- "ror $0xe,%10;"
- "xor %9,%10;"
- "mov %3,%12;"
- "ror $0x9,%11;"
- "pslld $0xe,%%xmm3;"
- "xor %6,%11;"
- "ror $0x5,%10;"
- "xor %4,%12;"
- "psrld $0x12,%%xmm2;"
- "ror $0xb,%11;"
- "xor %9,%10;"
- "and %9,%12;"
- "ror $0x6,%10;"
- "pxor %%xmm3,%%xmm1;"
- "xor %6,%11;"
- "xor %4,%12;"
- "psrld $0x3,%%xmm8;"
- "add %10,%12;"
- "add 4+%16,%12;"
- "ror $0x2,%11;"
- "pxor %%xmm2,%%xmm1;"
- "mov %6,%10;"
- "add %12,%5;"
- "mov %6,%12;"
- "pxor %%xmm8,%%xmm1;"
- "or %7,%10;"
- "add %5,%8;"
- "and %7,%12;"
- "pshufd $0xfa,%%xmm4,%%xmm2;"
- "and %k2,%10;"
- "add %11,%5;"
- "paddd %%xmm1,%%xmm0;"
- "or %12,%10;"
- "add %10,%5;"
- "movdqa %%xmm2,%%xmm3;"
- "mov %8,%10;"
- "mov %5,%11;"
- "ror $0xe,%10;"
- "movdqa %%xmm2,%%xmm8;"
- "xor %8,%10;"
- "ror $0x9,%11;"
- "mov %9,%12;"
- "xor %5,%11;"
- "ror $0x5,%10;"
- "psrlq $0x11,%%xmm2;"
- "xor %3,%12;"
- "psrlq $0x13,%%xmm3;"
- "xor %8,%10;"
- "and %8,%12;"
- "psrld $0xa,%%xmm8;"
- "ror $0xb,%11;"
- "xor %5,%11;"
- "xor %3,%12;"
- "ror $0x6,%10;"
- "pxor %%xmm3,%%xmm2;"
- "add %10,%12;"
- "ror $0x2,%11;"
- "add 8+%16,%12;"
- "pxor %%xmm2,%%xmm8;"
- "mov %5,%10;"
- "add %12,%4;"
- "mov %5,%12;"
- "pshufb %%xmm10,%%xmm8;"
- "or %k2,%10;"
- "add %4,%7;"
- "and %k2,%12;"
- "paddd %%xmm8,%%xmm0;"
- "and %6,%10;"
- "add %11,%4;"
- "pshufd $0x50,%%xmm0,%%xmm2;"
- "or %12,%10;"
- "add %10,%4;"
- "movdqa %%xmm2,%%xmm3;"
- "mov %7,%10;"
- "ror $0xe,%10;"
- "mov %4,%11;"
- "movdqa %%xmm2,%%xmm5;"
- "ror $0x9,%11;"
- "xor %7,%10;"
- "mov %8,%12;"
- "ror $0x5,%10;"
- "psrlq $0x11,%%xmm2;"
- "xor %4,%11;"
- "xor %9,%12;"
- "psrlq $0x13,%%xmm3;"
- "xor %7,%10;"
- "and %7,%12;"
- "ror $0xb,%11;"
- "psrld $0xa,%%xmm5;"
- "xor %4,%11;"
- "ror $0x6,%10;"
- "xor %9,%12;"
- "pxor %%xmm3,%%xmm2;"
- "ror $0x2,%11;"
- "add %10,%12;"
- "add 12+%16,%12;"
- "pxor %%xmm2,%%xmm5;"
- "mov %4,%10;"
- "add %12,%3;"
- "mov %4,%12;"
- "pshufb %%xmm11,%%xmm5;"
- "or %6,%10;"
- "add %3,%k2;"
- "and %6,%12;"
- "paddd %%xmm0,%%xmm5;"
- "and %5,%10;"
- "add %11,%3;"
- "or %12,%10;"
- "add %10,%3;"
- "movdqa 0x20(%13),%%xmm9;"
- "paddd %%xmm6,%%xmm9;"
- "movdqa %%xmm9,%16;"
- "movdqa %%xmm5,%%xmm0;"
- "mov %k2,%10;"
- "ror $0xe,%10;"
- "mov %3,%11;"
- "palignr $0x4,%%xmm4,%%xmm0;"
- "ror $0x9,%11;"
- "xor %k2,%10;"
- "mov %7,%12;"
- "ror $0x5,%10;"
- "movdqa %%xmm7,%%xmm1;"
- "xor %3,%11;"
- "xor %8,%12;"
- "paddd %%xmm6,%%xmm0;"
- "xor %k2,%10;"
- "and %k2,%12;"
- "ror $0xb,%11;"
- "palignr $0x4,%%xmm6,%%xmm1;"
- "xor %3,%11;"
- "ror $0x6,%10;"
- "xor %8,%12;"
- "movdqa %%xmm1,%%xmm2;"
- "ror $0x2,%11;"
- "add %10,%12;"
- "add %16,%12;"
- "movdqa %%xmm1,%%xmm3;"
- "mov %3,%10;"
- "add %12,%9;"
- "mov %3,%12;"
- "pslld $0x19,%%xmm1;"
- "or %5,%10;"
- "add %9,%6;"
- "and %5,%12;"
- "psrld $0x7,%%xmm2;"
- "and %4,%10;"
- "add %11,%9;"
- "por %%xmm2,%%xmm1;"
- "or %12,%10;"
- "add %10,%9;"
- "movdqa %%xmm3,%%xmm2;"
- "mov %6,%10;"
- "mov %9,%11;"
- "movdqa %%xmm3,%%xmm8;"
- "ror $0xe,%10;"
- "xor %6,%10;"
- "mov %k2,%12;"
- "ror $0x9,%11;"
- "pslld $0xe,%%xmm3;"
- "xor %9,%11;"
- "ror $0x5,%10;"
- "xor %7,%12;"
- "psrld $0x12,%%xmm2;"
- "ror $0xb,%11;"
- "xor %6,%10;"
- "and %6,%12;"
- "ror $0x6,%10;"
- "pxor %%xmm3,%%xmm1;"
- "xor %9,%11;"
- "xor %7,%12;"
- "psrld $0x3,%%xmm8;"
- "add %10,%12;"
- "add 4+%16,%12;"
- "ror $0x2,%11;"
- "pxor %%xmm2,%%xmm1;"
- "mov %9,%10;"
- "add %12,%8;"
- "mov %9,%12;"
- "pxor %%xmm8,%%xmm1;"
- "or %4,%10;"
- "add %8,%5;"
- "and %4,%12;"
- "pshufd $0xfa,%%xmm5,%%xmm2;"
- "and %3,%10;"
- "add %11,%8;"
- "paddd %%xmm1,%%xmm0;"
- "or %12,%10;"
- "add %10,%8;"
- "movdqa %%xmm2,%%xmm3;"
- "mov %5,%10;"
- "mov %8,%11;"
- "ror $0xe,%10;"
- "movdqa %%xmm2,%%xmm8;"
- "xor %5,%10;"
- "ror $0x9,%11;"
- "mov %6,%12;"
- "xor %8,%11;"
- "ror $0x5,%10;"
- "psrlq $0x11,%%xmm2;"
- "xor %k2,%12;"
- "psrlq $0x13,%%xmm3;"
- "xor %5,%10;"
- "and %5,%12;"
- "psrld $0xa,%%xmm8;"
- "ror $0xb,%11;"
- "xor %8,%11;"
- "xor %k2,%12;"
- "ror $0x6,%10;"
- "pxor %%xmm3,%%xmm2;"
- "add %10,%12;"
- "ror $0x2,%11;"
- "add 8+%16,%12;"
- "pxor %%xmm2,%%xmm8;"
- "mov %8,%10;"
- "add %12,%7;"
- "mov %8,%12;"
- "pshufb %%xmm10,%%xmm8;"
- "or %3,%10;"
- "add %7,%4;"
- "and %3,%12;"
- "paddd %%xmm8,%%xmm0;"
- "and %9,%10;"
- "add %11,%7;"
- "pshufd $0x50,%%xmm0,%%xmm2;"
- "or %12,%10;"
- "add %10,%7;"
- "movdqa %%xmm2,%%xmm3;"
- "mov %4,%10;"
- "ror $0xe,%10;"
- "mov %7,%11;"
- "movdqa %%xmm2,%%xmm6;"
- "ror $0x9,%11;"
- "xor %4,%10;"
- "mov %5,%12;"
- "ror $0x5,%10;"
- "psrlq $0x11,%%xmm2;"
- "xor %7,%11;"
- "xor %6,%12;"
- "psrlq $0x13,%%xmm3;"
- "xor %4,%10;"
- "and %4,%12;"
- "ror $0xb,%11;"
- "psrld $0xa,%%xmm6;"
- "xor %7,%11;"
- "ror $0x6,%10;"
- "xor %6,%12;"
- "pxor %%xmm3,%%xmm2;"
- "ror $0x2,%11;"
- "add %10,%12;"
- "add 12+%16,%12;"
- "pxor %%xmm2,%%xmm6;"
- "mov %7,%10;"
- "add %12,%k2;"
- "mov %7,%12;"
- "pshufb %%xmm11,%%xmm6;"
- "or %9,%10;"
- "add %k2,%3;"
- "and %9,%12;"
- "paddd %%xmm0,%%xmm6;"
- "and %8,%10;"
- "add %11,%k2;"
- "or %12,%10;"
- "add %10,%k2;"
- "movdqa 0x30(%13),%%xmm9;"
- "paddd %%xmm7,%%xmm9;"
- "movdqa %%xmm9,%16;"
- "add $0x40,%13;"
- "movdqa %%xmm6,%%xmm0;"
- "mov %3,%10;"
- "ror $0xe,%10;"
- "mov %k2,%11;"
- "palignr $0x4,%%xmm5,%%xmm0;"
- "ror $0x9,%11;"
- "xor %3,%10;"
- "mov %4,%12;"
- "ror $0x5,%10;"
- "movdqa %%xmm4,%%xmm1;"
- "xor %k2,%11;"
- "xor %5,%12;"
- "paddd %%xmm7,%%xmm0;"
- "xor %3,%10;"
- "and %3,%12;"
- "ror $0xb,%11;"
- "palignr $0x4,%%xmm7,%%xmm1;"
- "xor %k2,%11;"
- "ror $0x6,%10;"
- "xor %5,%12;"
- "movdqa %%xmm1,%%xmm2;"
- "ror $0x2,%11;"
- "add %10,%12;"
- "add %16,%12;"
- "movdqa %%xmm1,%%xmm3;"
- "mov %k2,%10;"
- "add %12,%6;"
- "mov %k2,%12;"
- "pslld $0x19,%%xmm1;"
- "or %8,%10;"
- "add %6,%9;"
- "and %8,%12;"
- "psrld $0x7,%%xmm2;"
- "and %7,%10;"
- "add %11,%6;"
- "por %%xmm2,%%xmm1;"
- "or %12,%10;"
- "add %10,%6;"
- "movdqa %%xmm3,%%xmm2;"
- "mov %9,%10;"
- "mov %6,%11;"
- "movdqa %%xmm3,%%xmm8;"
- "ror $0xe,%10;"
- "xor %9,%10;"
- "mov %3,%12;"
- "ror $0x9,%11;"
- "pslld $0xe,%%xmm3;"
- "xor %6,%11;"
- "ror $0x5,%10;"
- "xor %4,%12;"
- "psrld $0x12,%%xmm2;"
- "ror $0xb,%11;"
- "xor %9,%10;"
- "and %9,%12;"
- "ror $0x6,%10;"
- "pxor %%xmm3,%%xmm1;"
- "xor %6,%11;"
- "xor %4,%12;"
- "psrld $0x3,%%xmm8;"
- "add %10,%12;"
- "add 4+%16,%12;"
- "ror $0x2,%11;"
- "pxor %%xmm2,%%xmm1;"
- "mov %6,%10;"
- "add %12,%5;"
- "mov %6,%12;"
- "pxor %%xmm8,%%xmm1;"
- "or %7,%10;"
- "add %5,%8;"
- "and %7,%12;"
- "pshufd $0xfa,%%xmm6,%%xmm2;"
- "and %k2,%10;"
- "add %11,%5;"
- "paddd %%xmm1,%%xmm0;"
- "or %12,%10;"
- "add %10,%5;"
- "movdqa %%xmm2,%%xmm3;"
- "mov %8,%10;"
- "mov %5,%11;"
- "ror $0xe,%10;"
- "movdqa %%xmm2,%%xmm8;"
- "xor %8,%10;"
- "ror $0x9,%11;"
- "mov %9,%12;"
- "xor %5,%11;"
- "ror $0x5,%10;"
- "psrlq $0x11,%%xmm2;"
- "xor %3,%12;"
- "psrlq $0x13,%%xmm3;"
- "xor %8,%10;"
- "and %8,%12;"
- "psrld $0xa,%%xmm8;"
- "ror $0xb,%11;"
- "xor %5,%11;"
- "xor %3,%12;"
- "ror $0x6,%10;"
- "pxor %%xmm3,%%xmm2;"
- "add %10,%12;"
- "ror $0x2,%11;"
- "add 8+%16,%12;"
- "pxor %%xmm2,%%xmm8;"
- "mov %5,%10;"
- "add %12,%4;"
- "mov %5,%12;"
- "pshufb %%xmm10,%%xmm8;"
- "or %k2,%10;"
- "add %4,%7;"
- "and %k2,%12;"
- "paddd %%xmm8,%%xmm0;"
- "and %6,%10;"
- "add %11,%4;"
- "pshufd $0x50,%%xmm0,%%xmm2;"
- "or %12,%10;"
- "add %10,%4;"
- "movdqa %%xmm2,%%xmm3;"
- "mov %7,%10;"
- "ror $0xe,%10;"
- "mov %4,%11;"
- "movdqa %%xmm2,%%xmm7;"
- "ror $0x9,%11;"
- "xor %7,%10;"
- "mov %8,%12;"
- "ror $0x5,%10;"
- "psrlq $0x11,%%xmm2;"
- "xor %4,%11;"
- "xor %9,%12;"
- "psrlq $0x13,%%xmm3;"
- "xor %7,%10;"
- "and %7,%12;"
- "ror $0xb,%11;"
- "psrld $0xa,%%xmm7;"
- "xor %4,%11;"
- "ror $0x6,%10;"
- "xor %9,%12;"
- "pxor %%xmm3,%%xmm2;"
- "ror $0x2,%11;"
- "add %10,%12;"
- "add 12+%16,%12;"
- "pxor %%xmm2,%%xmm7;"
- "mov %4,%10;"
- "add %12,%3;"
- "mov %4,%12;"
- "pshufb %%xmm11,%%xmm7;"
- "or %6,%10;"
- "add %3,%k2;"
- "and %6,%12;"
- "paddd %%xmm0,%%xmm7;"
- "and %5,%10;"
- "add %11,%3;"
- "or %12,%10;"
- "add %10,%3;"
- "sub $0x1,%1;"
- "jne Lloop1_%=;"
- "mov $0x2,%1;"
-
- "Lloop2_%=:"
- "paddd 0x0(%13),%%xmm4;"
- "movdqa %%xmm4,%16;"
- "mov %k2,%10;"
- "ror $0xe,%10;"
- "mov %3,%11;"
- "xor %k2,%10;"
- "ror $0x9,%11;"
- "mov %7,%12;"
- "xor %3,%11;"
- "ror $0x5,%10;"
- "xor %8,%12;"
- "xor %k2,%10;"
- "ror $0xb,%11;"
- "and %k2,%12;"
- "xor %3,%11;"
- "ror $0x6,%10;"
- "xor %8,%12;"
- "add %10,%12;"
- "ror $0x2,%11;"
- "add %16,%12;"
- "mov %3,%10;"
- "add %12,%9;"
- "mov %3,%12;"
- "or %5,%10;"
- "add %9,%6;"
- "and %5,%12;"
- "and %4,%10;"
- "add %11,%9;"
- "or %12,%10;"
- "add %10,%9;"
- "mov %6,%10;"
- "ror $0xe,%10;"
- "mov %9,%11;"
- "xor %6,%10;"
- "ror $0x9,%11;"
- "mov %k2,%12;"
- "xor %9,%11;"
- "ror $0x5,%10;"
- "xor %7,%12;"
- "xor %6,%10;"
- "ror $0xb,%11;"
- "and %6,%12;"
- "xor %9,%11;"
- "ror $0x6,%10;"
- "xor %7,%12;"
- "add %10,%12;"
- "ror $0x2,%11;"
- "add 4+%16,%12;"
- "mov %9,%10;"
- "add %12,%8;"
- "mov %9,%12;"
- "or %4,%10;"
- "add %8,%5;"
- "and %4,%12;"
- "and %3,%10;"
- "add %11,%8;"
- "or %12,%10;"
- "add %10,%8;"
- "mov %5,%10;"
- "ror $0xe,%10;"
- "mov %8,%11;"
- "xor %5,%10;"
- "ror $0x9,%11;"
- "mov %6,%12;"
- "xor %8,%11;"
- "ror $0x5,%10;"
- "xor %k2,%12;"
- "xor %5,%10;"
- "ror $0xb,%11;"
- "and %5,%12;"
- "xor %8,%11;"
- "ror $0x6,%10;"
- "xor %k2,%12;"
- "add %10,%12;"
- "ror $0x2,%11;"
- "add 8+%16,%12;"
- "mov %8,%10;"
- "add %12,%7;"
- "mov %8,%12;"
- "or %3,%10;"
- "add %7,%4;"
- "and %3,%12;"
- "and %9,%10;"
- "add %11,%7;"
- "or %12,%10;"
- "add %10,%7;"
- "mov %4,%10;"
- "ror $0xe,%10;"
- "mov %7,%11;"
- "xor %4,%10;"
- "ror $0x9,%11;"
- "mov %5,%12;"
- "xor %7,%11;"
- "ror $0x5,%10;"
- "xor %6,%12;"
- "xor %4,%10;"
- "ror $0xb,%11;"
- "and %4,%12;"
- "xor %7,%11;"
- "ror $0x6,%10;"
- "xor %6,%12;"
- "add %10,%12;"
- "ror $0x2,%11;"
- "add 12+%16,%12;"
- "mov %7,%10;"
- "add %12,%k2;"
- "mov %7,%12;"
- "or %9,%10;"
- "add %k2,%3;"
- "and %9,%12;"
- "and %8,%10;"
- "add %11,%k2;"
- "or %12,%10;"
- "add %10,%k2;"
- "paddd 0x10(%13),%%xmm5;"
- "movdqa %%xmm5,%16;"
- "add $0x20,%13;"
- "mov %3,%10;"
- "ror $0xe,%10;"
- "mov %k2,%11;"
- "xor %3,%10;"
- "ror $0x9,%11;"
- "mov %4,%12;"
- "xor %k2,%11;"
- "ror $0x5,%10;"
- "xor %5,%12;"
- "xor %3,%10;"
- "ror $0xb,%11;"
- "and %3,%12;"
- "xor %k2,%11;"
- "ror $0x6,%10;"
- "xor %5,%12;"
- "add %10,%12;"
- "ror $0x2,%11;"
- "add %16,%12;"
- "mov %k2,%10;"
- "add %12,%6;"
- "mov %k2,%12;"
- "or %8,%10;"
- "add %6,%9;"
- "and %8,%12;"
- "and %7,%10;"
- "add %11,%6;"
- "or %12,%10;"
- "add %10,%6;"
- "mov %9,%10;"
- "ror $0xe,%10;"
- "mov %6,%11;"
- "xor %9,%10;"
- "ror $0x9,%11;"
- "mov %3,%12;"
- "xor %6,%11;"
- "ror $0x5,%10;"
- "xor %4,%12;"
- "xor %9,%10;"
- "ror $0xb,%11;"
- "and %9,%12;"
- "xor %6,%11;"
- "ror $0x6,%10;"
- "xor %4,%12;"
- "add %10,%12;"
- "ror $0x2,%11;"
- "add 4+%16,%12;"
- "mov %6,%10;"
- "add %12,%5;"
- "mov %6,%12;"
- "or %7,%10;"
- "add %5,%8;"
- "and %7,%12;"
- "and %k2,%10;"
- "add %11,%5;"
- "or %12,%10;"
- "add %10,%5;"
- "mov %8,%10;"
- "ror $0xe,%10;"
- "mov %5,%11;"
- "xor %8,%10;"
- "ror $0x9,%11;"
- "mov %9,%12;"
- "xor %5,%11;"
- "ror $0x5,%10;"
- "xor %3,%12;"
- "xor %8,%10;"
- "ror $0xb,%11;"
- "and %8,%12;"
- "xor %5,%11;"
- "ror $0x6,%10;"
- "xor %3,%12;"
- "add %10,%12;"
- "ror $0x2,%11;"
- "add 8+%16,%12;"
- "mov %5,%10;"
- "add %12,%4;"
- "mov %5,%12;"
- "or %k2,%10;"
- "add %4,%7;"
- "and %k2,%12;"
- "and %6,%10;"
- "add %11,%4;"
- "or %12,%10;"
- "add %10,%4;"
- "mov %7,%10;"
- "ror $0xe,%10;"
- "mov %4,%11;"
- "xor %7,%10;"
- "ror $0x9,%11;"
- "mov %8,%12;"
- "xor %4,%11;"
- "ror $0x5,%10;"
- "xor %9,%12;"
- "xor %7,%10;"
- "ror $0xb,%11;"
- "and %7,%12;"
- "xor %4,%11;"
- "ror $0x6,%10;"
- "xor %9,%12;"
- "add %10,%12;"
- "ror $0x2,%11;"
- "add 12+%16,%12;"
- "mov %4,%10;"
- "add %12,%3;"
- "mov %4,%12;"
- "or %6,%10;"
- "add %3,%k2;"
- "and %6,%12;"
- "and %5,%10;"
- "add %11,%3;"
- "or %12,%10;"
- "add %10,%3;"
- "movdqa %%xmm6,%%xmm4;"
- "movdqa %%xmm7,%%xmm5;"
- "sub $0x1,%1;"
- "jne Lloop2_%=;"
-
- "add (%0),%3;"
- "mov %3,(%0);"
- "add 0x4(%0),%4;"
- "mov %4,0x4(%0);"
- "add 0x8(%0),%5;"
- "mov %5,0x8(%0);"
- "add 0xc(%0),%6;"
- "mov %6,0xc(%0);"
- "add 0x10(%0),%k2;"
- "mov %k2,0x10(%0);"
- "add 0x14(%0),%7;"
- "mov %7,0x14(%0);"
- "add 0x18(%0),%8;"
- "mov %8,0x18(%0);"
- "add 0x1c(%0),%9;"
- "mov %9,0x1c(%0);"
- "mov %15,%1;"
- "add $0x40,%1;"
- "cmp %14,%1;"
- "jne Lloop0_%=;"
-
-#endif // VISUAL
-
- "Ldone_hash_%=:"
-
- : "+r"(state_integers), "+r"(data_bytes), "+r"(blocks), "=r"(a), "=r"(b), "=r"(c), "=r"(d), /* e = chunk */ "=r"(f), "=r"(g), "=r"(h), "=r"(y0), "=r"(y1), "=r"(y2), "=r"(table), "+m"(input_end), "+m"(input), "+m"(transfer)
- : "m"(k256), "m"(flip_mask), "m"(shuffle_00ba), "m"(shuffle_dc00)
- : "cc", "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm8", "xmm9", "xmm10", "xmm11", "xmm12"
- );
-}
-
-#endif // HAVE_XASSEMBLY
-
-#endif // DISABLED
-
-} // namespace sha256
-} // namespace system
-} // namespace libbitcoin
diff --git a/src/hash/vectorization/sha256_4_sse41.cpp b/src/hash/vectorization/sha256_4_sse41.cpp
deleted file mode 100644
index 64eff5a706..0000000000
--- a/src/hash/vectorization/sha256_4_sse41.cpp
+++ /dev/null
@@ -1,461 +0,0 @@
-// Based on:
-// sha256-x86.c - Intel SHA extensions using C intrinsics
-// Written and place in public domain by Jeffrey Walton
-// Based on code from Intel, and by Sean Gulley for the miTLS project.
-
-#include
-#include
-#include
-
-namespace libbitcoin {
-namespace system {
-namespace sha256 {
-
-#if defined (DISABLED)
-
-#if !defined(HAVE_XCPU)
-
-void merkle_sse41(digest4& out, const block4& blocks) NOEXCEPT
-{
- BC_ASSERT_MSG(false, "merkle_sse41 undefined");
-}
-
-#else
-
-namespace i128 {
-
-using xint128_t = __m128i;
-
-template
-uint32_t get(xint128_t a) noexcept
-{
- return _mm_extract_epi32(a, Offset);
-}
-
-// Broadcast 32-bit integer a to all elements of dst.
-xint128_t set(uint32_t a) noexcept
-{
- return _mm_set1_epi32(a);
-}
-
-xint128_t set(uint64_t a, uint64_t b) noexcept
-{
- return _mm_set_epi64x(a, b);
-}
-
-xint128_t set(uint32_t a, uint32_t b, uint32_t c, uint32_t d) noexcept
-{
- return _mm_set_epi32(a, b, c, d);
-}
-
-xint128_t sum(xint128_t a, xint128_t b) noexcept
-{
- return _mm_add_epi32(a, b);
-}
-
-xint128_t sum(xint128_t a, xint128_t b, xint128_t c) noexcept
-{
-
- return sum(sum(a, b), c);
-}
-
-xint128_t sum(xint128_t a, xint128_t b, xint128_t c,
- xint128_t d) noexcept
-{
- return sum(sum(a, b), sum(c, d));
-}
-
-xint128_t sum(xint128_t a, xint128_t b, xint128_t c, xint128_t d,
- xint128_t e) noexcept
-{
- return sum(sum(a, b, c), sum(d, e));
-}
-
-xint128_t inc(xint128_t& outa, xint128_t b) noexcept
-{
- return ((outa = sum(outa, b)));
-}
-
-xint128_t inc(xint128_t& outa, xint128_t b, xint128_t c) noexcept
-{
- return ((outa = sum(outa, b, c)));
-}
-
-xint128_t inc(xint128_t& outa, xint128_t b, xint128_t c,
- xint128_t d) noexcept
-{
- return ((outa = sum(outa, b, c, d)));
-}
-
-xint128_t exc(xint128_t a, xint128_t b) noexcept
-{
- return _mm_xor_si128(a, b);
-}
-
-xint128_t exc(xint128_t a, xint128_t b, xint128_t c) noexcept
-{
- return exc(exc(a, b), c);
-}
-
-xint128_t dis(xint128_t a, xint128_t b) noexcept
-{
- return _mm_or_si128(a, b);
-}
-
-xint128_t con(xint128_t a, xint128_t b) noexcept
-{
- return _mm_and_si128(a, b);
-}
-
-xint128_t shr(xint128_t a, uint32_t bits) noexcept
-{
- return _mm_srli_epi32(a, bits);
-}
-
-xint128_t shl(xint128_t a, uint32_t bits) noexcept
-{
- return _mm_slli_epi32(a, bits);
-}
-
-/// Concatenate two 16-byte blocks into a 32-byte temporary result, shift the
-/// result right by Shift bytes, and return the low 16 bytes.
-template
-xint128_t align_right(xint128_t a, xint128_t b) noexcept
-{
- return _mm_alignr_epi8(a, b, Shift);
-}
-
-/// Blend two packed 16-bit integers using Mask.
-template
-xint128_t blend(xint128_t a, xint128_t b) noexcept
-{
- return _mm_blend_epi16(a, b, Mask);
-}
-
-/// Shuffle 32-bit integers using Control.
-template
-xint128_t shuffle(xint128_t a) noexcept
-{
- return _mm_shuffle_epi32(a, Control);
-}
-
-/// Shuffle packed 8-bit integers in a according to shuffle control mask in the
-/// corresponding 8-bit element of b.
-xint128_t shuffle(xint128_t a, xint128_t b) noexcept
-{
- return _mm_shuffle_epi8(a, b);
-}
-
-} // namespace i128
-
-using namespace i128;
-
-xint128_t inline SIGMA0(xint128_t x) NOEXCEPT { return exc(dis(shr(x, 2), shl(x, 30)), dis(shr(x, 13), shl(x, 19)), dis(shr(x, 22), shl(x, 10))); }
-xint128_t inline SIGMA1(xint128_t x) NOEXCEPT { return exc(dis(shr(x, 6), shl(x, 26)), dis(shr(x, 11), shl(x, 21)), dis(shr(x, 25), shl(x, 7))); }
-xint128_t inline sigma0(xint128_t x) NOEXCEPT { return exc(dis(shr(x, 7), shl(x, 25)), dis(shr(x, 18), shl(x, 14)), shr(x, 3)); }
-xint128_t inline sigma1(xint128_t x) NOEXCEPT { return exc(dis(shr(x, 17), shl(x, 15)), dis(shr(x, 19), shl(x, 13)), shr(x, 10)); }
-xint128_t inline choice( xint128_t x, xint128_t y, xint128_t z) NOEXCEPT { return exc(z, con(x, exc(y, z))); }
-xint128_t inline majority(xint128_t x, xint128_t y, xint128_t z) NOEXCEPT { return dis(con(x, y), con(z, dis(x, y))); }
-
-void inline round(xint128_t a, xint128_t b, xint128_t c, xint128_t& d,
- xint128_t e, xint128_t f, xint128_t g, xint128_t& h, xint128_t k) NOEXCEPT
-{
- const auto t1 = sum(h, SIGMA1(e), choice(e, f, g), k);
- const auto t2 = sum( SIGMA0(a), majority(a, b, c));
- d = sum(d, t1);
- h = sum(t1, t2);
-}
-
-template
-xint128_t inline read4(const block4& blocks) NOEXCEPT
-{
- constexpr auto four = sizeof(uint32_t);
- BC_PUSH_WARNING(NO_ARRAY_INDEXING)
- const auto value = set(
- from_little_endian(array_cast(blocks[0])),
- from_little_endian(array_cast(blocks[1])),
- from_little_endian(array_cast(blocks[2])),
- from_little_endian(array_cast(blocks[3])));
- BC_POP_WARNING()
-
- // bswap_mask
- return shuffle(value, set(
- 0x0c0d0e0ful, 0x08090a0bul, 0x04050607ul, 0x00010203ul));
-}
-
-template
-void inline write4(digest4& hashes, xint128_t value) NOEXCEPT
-{
- // bswap_mask
- value = shuffle(value, set(
- 0x0c0d0e0ful, 0x08090a0bul, 0x04050607ul, 0x00010203ul));
-
- constexpr auto four = sizeof(uint32_t);
- BC_PUSH_WARNING(NO_ARRAY_INDEXING)
- array_cast(hashes[0]) = to_little_endian(get<3>(value));
- array_cast(hashes[1]) = to_little_endian(get<2>(value));
- array_cast(hashes[2]) = to_little_endian(get<1>(value));
- array_cast(hashes[3]) = to_little_endian(get<0>(value));
- BC_POP_WARNING()
-}
-
-// Four blocks in four lanes, doubled.
-void merkle_sse41(digest4& out, const block4& blocks) NOEXCEPT
-{
- // Transform 1.
- auto a = set(0x6a09e667ul);
- auto b = set(0xbb67ae85ul);
- auto c = set(0x3c6ef372ul);
- auto d = set(0xa54ff53aul);
- auto e = set(0x510e527ful);
- auto f = set(0x9b05688cul);
- auto g = set(0x1f83d9abul);
- auto h = set(0x5be0cd19ul);
-
- xint128_t w00, w01, w02, w03, w04, w05, w06, w07;
- xint128_t w08, w09, w10, w11, w12, w13, w14, w15;
-
- round(a, b, c, d, e, f, g, h, sum(set(0x428a2f98ul), w00 = read4< 0>(blocks)));
- round(h, a, b, c, d, e, f, g, sum(set(0x71374491ul), w01 = read4< 4>(blocks)));
- round(g, h, a, b, c, d, e, f, sum(set(0xb5c0fbcful), w02 = read4< 8>(blocks)));
- round(f, g, h, a, b, c, d, e, sum(set(0xe9b5dba5ul), w03 = read4<12>(blocks)));
- round(e, f, g, h, a, b, c, d, sum(set(0x3956c25bul), w04 = read4<16>(blocks)));
- round(d, e, f, g, h, a, b, c, sum(set(0x59f111f1ul), w05 = read4<20>(blocks)));
- round(c, d, e, f, g, h, a, b, sum(set(0x923f82a4ul), w06 = read4<24>(blocks)));
- round(b, c, d, e, f, g, h, a, sum(set(0xab1c5ed5ul), w07 = read4<28>(blocks)));
- round(a, b, c, d, e, f, g, h, sum(set(0xd807aa98ul), w08 = read4<32>(blocks)));
- round(h, a, b, c, d, e, f, g, sum(set(0x12835b01ul), w09 = read4<36>(blocks)));
- round(g, h, a, b, c, d, e, f, sum(set(0x243185beul), w10 = read4<40>(blocks)));
- round(f, g, h, a, b, c, d, e, sum(set(0x550c7dc3ul), w11 = read4<44>(blocks)));
- round(e, f, g, h, a, b, c, d, sum(set(0x72be5d74ul), w12 = read4<48>(blocks)));
- round(d, e, f, g, h, a, b, c, sum(set(0x80deb1feul), w13 = read4<52>(blocks)));
- round(c, d, e, f, g, h, a, b, sum(set(0x9bdc06a7ul), w14 = read4<56>(blocks)));
- round(b, c, d, e, f, g, h, a, sum(set(0xc19bf174ul), w15 = read4<60>(blocks)));
- round(a, b, c, d, e, f, g, h, sum(set(0xe49b69c1ul), inc(w00, sigma1(w14), w09, sigma0(w01))));
- round(h, a, b, c, d, e, f, g, sum(set(0xefbe4786ul), inc(w01, sigma1(w15), w10, sigma0(w02))));
- round(g, h, a, b, c, d, e, f, sum(set(0x0fc19dc6ul), inc(w02, sigma1(w00), w11, sigma0(w03))));
- round(f, g, h, a, b, c, d, e, sum(set(0x240ca1ccul), inc(w03, sigma1(w01), w12, sigma0(w04))));
- round(e, f, g, h, a, b, c, d, sum(set(0x2de92c6ful), inc(w04, sigma1(w02), w13, sigma0(w05))));
- round(d, e, f, g, h, a, b, c, sum(set(0x4a7484aaul), inc(w05, sigma1(w03), w14, sigma0(w06))));
- round(c, d, e, f, g, h, a, b, sum(set(0x5cb0a9dcul), inc(w06, sigma1(w04), w15, sigma0(w07))));
- round(b, c, d, e, f, g, h, a, sum(set(0x76f988daul), inc(w07, sigma1(w05), w00, sigma0(w08))));
- round(a, b, c, d, e, f, g, h, sum(set(0x983e5152ul), inc(w08, sigma1(w06), w01, sigma0(w09))));
- round(h, a, b, c, d, e, f, g, sum(set(0xa831c66dul), inc(w09, sigma1(w07), w02, sigma0(w10))));
- round(g, h, a, b, c, d, e, f, sum(set(0xb00327c8ul), inc(w10, sigma1(w08), w03, sigma0(w11))));
- round(f, g, h, a, b, c, d, e, sum(set(0xbf597fc7ul), inc(w11, sigma1(w09), w04, sigma0(w12))));
- round(e, f, g, h, a, b, c, d, sum(set(0xc6e00bf3ul), inc(w12, sigma1(w10), w05, sigma0(w13))));
- round(d, e, f, g, h, a, b, c, sum(set(0xd5a79147ul), inc(w13, sigma1(w11), w06, sigma0(w14))));
- round(c, d, e, f, g, h, a, b, sum(set(0x06ca6351ul), inc(w14, sigma1(w12), w07, sigma0(w15))));
- round(b, c, d, e, f, g, h, a, sum(set(0x14292967ul), inc(w15, sigma1(w13), w08, sigma0(w00))));
- round(a, b, c, d, e, f, g, h, sum(set(0x27b70a85ul), inc(w00, sigma1(w14), w09, sigma0(w01))));
- round(h, a, b, c, d, e, f, g, sum(set(0x2e1b2138ul), inc(w01, sigma1(w15), w10, sigma0(w02))));
- round(g, h, a, b, c, d, e, f, sum(set(0x4d2c6dfcul), inc(w02, sigma1(w00), w11, sigma0(w03))));
- round(f, g, h, a, b, c, d, e, sum(set(0x53380d13ul), inc(w03, sigma1(w01), w12, sigma0(w04))));
- round(e, f, g, h, a, b, c, d, sum(set(0x650a7354ul), inc(w04, sigma1(w02), w13, sigma0(w05))));
- round(d, e, f, g, h, a, b, c, sum(set(0x766a0abbul), inc(w05, sigma1(w03), w14, sigma0(w06))));
- round(c, d, e, f, g, h, a, b, sum(set(0x81c2c92eul), inc(w06, sigma1(w04), w15, sigma0(w07))));
- round(b, c, d, e, f, g, h, a, sum(set(0x92722c85ul), inc(w07, sigma1(w05), w00, sigma0(w08))));
- round(a, b, c, d, e, f, g, h, sum(set(0xa2bfe8a1ul), inc(w08, sigma1(w06), w01, sigma0(w09))));
- round(h, a, b, c, d, e, f, g, sum(set(0xa81a664bul), inc(w09, sigma1(w07), w02, sigma0(w10))));
- round(g, h, a, b, c, d, e, f, sum(set(0xc24b8b70ul), inc(w10, sigma1(w08), w03, sigma0(w11))));
- round(f, g, h, a, b, c, d, e, sum(set(0xc76c51a3ul), inc(w11, sigma1(w09), w04, sigma0(w12))));
- round(e, f, g, h, a, b, c, d, sum(set(0xd192e819ul), inc(w12, sigma1(w10), w05, sigma0(w13))));
- round(d, e, f, g, h, a, b, c, sum(set(0xd6990624ul), inc(w13, sigma1(w11), w06, sigma0(w14))));
- round(c, d, e, f, g, h, a, b, sum(set(0xf40e3585ul), inc(w14, sigma1(w12), w07, sigma0(w15))));
- round(b, c, d, e, f, g, h, a, sum(set(0x106aa070ul), inc(w15, sigma1(w13), w08, sigma0(w00))));
- round(a, b, c, d, e, f, g, h, sum(set(0x19a4c116ul), inc(w00, sigma1(w14), w09, sigma0(w01))));
- round(h, a, b, c, d, e, f, g, sum(set(0x1e376c08ul), inc(w01, sigma1(w15), w10, sigma0(w02))));
- round(g, h, a, b, c, d, e, f, sum(set(0x2748774cul), inc(w02, sigma1(w00), w11, sigma0(w03))));
- round(f, g, h, a, b, c, d, e, sum(set(0x34b0bcb5ul), inc(w03, sigma1(w01), w12, sigma0(w04))));
- round(e, f, g, h, a, b, c, d, sum(set(0x391c0cb3ul), inc(w04, sigma1(w02), w13, sigma0(w05))));
- round(d, e, f, g, h, a, b, c, sum(set(0x4ed8aa4aul), inc(w05, sigma1(w03), w14, sigma0(w06))));
- round(c, d, e, f, g, h, a, b, sum(set(0x5b9cca4ful), inc(w06, sigma1(w04), w15, sigma0(w07))));
- round(b, c, d, e, f, g, h, a, sum(set(0x682e6ff3ul), inc(w07, sigma1(w05), w00, sigma0(w08))));
- round(a, b, c, d, e, f, g, h, sum(set(0x748f82eeul), inc(w08, sigma1(w06), w01, sigma0(w09))));
- round(h, a, b, c, d, e, f, g, sum(set(0x78a5636ful), inc(w09, sigma1(w07), w02, sigma0(w10))));
- round(g, h, a, b, c, d, e, f, sum(set(0x84c87814ul), inc(w10, sigma1(w08), w03, sigma0(w11))));
- round(f, g, h, a, b, c, d, e, sum(set(0x8cc70208ul), inc(w11, sigma1(w09), w04, sigma0(w12))));
- round(e, f, g, h, a, b, c, d, sum(set(0x90befffaul), inc(w12, sigma1(w10), w05, sigma0(w13))));
- round(d, e, f, g, h, a, b, c, sum(set(0xa4506cebul), inc(w13, sigma1(w11), w06, sigma0(w14))));
- round(c, d, e, f, g, h, a, b, sum(set(0xbef9a3f7ul), inc(w14, sigma1(w12), w07, sigma0(w15))));
- round(b, c, d, e, f, g, h, a, sum(set(0xc67178f2ul), inc(w15, sigma1(w13), w08, sigma0(w00))));
-
- a = sum(a, set(0x6a09e667ul));
- b = sum(b, set(0xbb67ae85ul));
- c = sum(c, set(0x3c6ef372ul));
- d = sum(d, set(0xa54ff53aul));
- e = sum(e, set(0x510e527ful));
- f = sum(f, set(0x9b05688cul));
- g = sum(g, set(0x1f83d9abul));
- h = sum(h, set(0x5be0cd19ul));
-
- const xint128_t t0 = a, t1 = b, t2 = c, t3 = d, t4 = e, t5 = f, t6 = g, t7 = h;
-
- // Transform 2.
- round(a, b, c, d, e, f, g, h, set(0xc28a2f98ul));
- round(h, a, b, c, d, e, f, g, set(0x71374491ul));
- round(g, h, a, b, c, d, e, f, set(0xb5c0fbcful));
- round(f, g, h, a, b, c, d, e, set(0xe9b5dba5ul));
- round(e, f, g, h, a, b, c, d, set(0x3956c25bul));
- round(d, e, f, g, h, a, b, c, set(0x59f111f1ul));
- round(c, d, e, f, g, h, a, b, set(0x923f82a4ul));
- round(b, c, d, e, f, g, h, a, set(0xab1c5ed5ul));
- round(a, b, c, d, e, f, g, h, set(0xd807aa98ul));
- round(h, a, b, c, d, e, f, g, set(0x12835b01ul));
- round(g, h, a, b, c, d, e, f, set(0x243185beul));
- round(f, g, h, a, b, c, d, e, set(0x550c7dc3ul));
- round(e, f, g, h, a, b, c, d, set(0x72be5d74ul));
- round(d, e, f, g, h, a, b, c, set(0x80deb1feul));
- round(c, d, e, f, g, h, a, b, set(0x9bdc06a7ul));
- round(b, c, d, e, f, g, h, a, set(0xc19bf374ul));
- round(a, b, c, d, e, f, g, h, set(0x649b69c1ul));
- round(h, a, b, c, d, e, f, g, set(0xf0fe4786ul));
- round(g, h, a, b, c, d, e, f, set(0x0fe1edc6ul));
- round(f, g, h, a, b, c, d, e, set(0x240cf254ul));
- round(e, f, g, h, a, b, c, d, set(0x4fe9346ful));
- round(d, e, f, g, h, a, b, c, set(0x6cc984beul));
- round(c, d, e, f, g, h, a, b, set(0x61b9411eul));
- round(b, c, d, e, f, g, h, a, set(0x16f988faul));
- round(a, b, c, d, e, f, g, h, set(0xf2c65152ul));
- round(h, a, b, c, d, e, f, g, set(0xa88e5a6dul));
- round(g, h, a, b, c, d, e, f, set(0xb019fc65ul));
- round(f, g, h, a, b, c, d, e, set(0xb9d99ec7ul));
- round(e, f, g, h, a, b, c, d, set(0x9a1231c3ul));
- round(d, e, f, g, h, a, b, c, set(0xe70eeaa0ul));
- round(c, d, e, f, g, h, a, b, set(0xfdb1232bul));
- round(b, c, d, e, f, g, h, a, set(0xc7353eb0ul));
- round(a, b, c, d, e, f, g, h, set(0x3069bad5ul));
- round(h, a, b, c, d, e, f, g, set(0xcb976d5ful));
- round(g, h, a, b, c, d, e, f, set(0x5a0f118ful));
- round(f, g, h, a, b, c, d, e, set(0xdc1eeefdul));
- round(e, f, g, h, a, b, c, d, set(0x0a35b689ul));
- round(d, e, f, g, h, a, b, c, set(0xde0b7a04ul));
- round(c, d, e, f, g, h, a, b, set(0x58f4ca9dul));
- round(b, c, d, e, f, g, h, a, set(0xe15d5b16ul));
- round(a, b, c, d, e, f, g, h, set(0x007f3e86ul));
- round(h, a, b, c, d, e, f, g, set(0x37088980ul));
- round(g, h, a, b, c, d, e, f, set(0xa507ea32ul));
- round(f, g, h, a, b, c, d, e, set(0x6fab9537ul));
- round(e, f, g, h, a, b, c, d, set(0x17406110ul));
- round(d, e, f, g, h, a, b, c, set(0x0d8cd6f1ul));
- round(c, d, e, f, g, h, a, b, set(0xcdaa3b6dul));
- round(b, c, d, e, f, g, h, a, set(0xc0bbbe37ul));
- round(a, b, c, d, e, f, g, h, set(0x83613bdaul));
- round(h, a, b, c, d, e, f, g, set(0xdb48a363ul));
- round(g, h, a, b, c, d, e, f, set(0x0b02e931ul));
- round(f, g, h, a, b, c, d, e, set(0x6fd15ca7ul));
- round(e, f, g, h, a, b, c, d, set(0x521afacaul));
- round(d, e, f, g, h, a, b, c, set(0x31338431ul));
- round(c, d, e, f, g, h, a, b, set(0x6ed41a95ul));
- round(b, c, d, e, f, g, h, a, set(0x6d437890ul));
- round(a, b, c, d, e, f, g, h, set(0xc39c91f2ul));
- round(h, a, b, c, d, e, f, g, set(0x9eccabbdul));
- round(g, h, a, b, c, d, e, f, set(0xb5c9a0e6ul));
- round(f, g, h, a, b, c, d, e, set(0x532fb63cul));
- round(e, f, g, h, a, b, c, d, set(0xd2c741c6ul));
- round(d, e, f, g, h, a, b, c, set(0x07237ea3ul));
- round(c, d, e, f, g, h, a, b, set(0xa4954b68ul));
- round(b, c, d, e, f, g, h, a, set(0x4c191d76ul));
-
- w00 = sum(t0, a);
- w01 = sum(t1, b);
- w02 = sum(t2, c);
- w03 = sum(t3, d);
- w04 = sum(t4, e);
- w05 = sum(t5, f);
- w06 = sum(t6, g);
- w07 = sum(t7, h);
-
- // Transform 3.
- a = set(0x6a09e667ul);
- b = set(0xbb67ae85ul);
- c = set(0x3c6ef372ul);
- d = set(0xa54ff53aul);
- e = set(0x510e527ful);
- f = set(0x9b05688cul);
- g = set(0x1f83d9abul);
- h = set(0x5be0cd19ul);
-
- round(a, b, c, d, e, f, g, h, sum(set(0x428a2f98ul), w00));
- round(h, a, b, c, d, e, f, g, sum(set(0x71374491ul), w01));
- round(g, h, a, b, c, d, e, f, sum(set(0xb5c0fbcful), w02));
- round(f, g, h, a, b, c, d, e, sum(set(0xe9b5dba5ul), w03));
- round(e, f, g, h, a, b, c, d, sum(set(0x3956c25bul), w04));
- round(d, e, f, g, h, a, b, c, sum(set(0x59f111f1ul), w05));
- round(c, d, e, f, g, h, a, b, sum(set(0x923f82a4ul), w06));
- round(b, c, d, e, f, g, h, a, sum(set(0xab1c5ed5ul), w07));
- round(a, b, c, d, e, f, g, h, set(0x5807aa98ul));
- round(h, a, b, c, d, e, f, g, set(0x12835b01ul));
- round(g, h, a, b, c, d, e, f, set(0x243185beul));
- round(f, g, h, a, b, c, d, e, set(0x550c7dc3ul));
- round(e, f, g, h, a, b, c, d, set(0x72be5d74ul));
- round(d, e, f, g, h, a, b, c, set(0x80deb1feul));
- round(c, d, e, f, g, h, a, b, set(0x9bdc06a7ul));
- round(b, c, d, e, f, g, h, a, set(0xc19bf274ul));
- round(a, b, c, d, e, f, g, h, sum(set(0xe49b69c1ul), inc(w00, sigma0(w01))));
- round(h, a, b, c, d, e, f, g, sum(set(0xefbe4786ul), inc(w01, set(0xa00000ul), sigma0(w02))));
- round(g, h, a, b, c, d, e, f, sum(set(0x0fc19dc6ul), inc(w02, sigma1(w00), sigma0(w03))));
- round(f, g, h, a, b, c, d, e, sum(set(0x240ca1ccul), inc(w03, sigma1(w01), sigma0(w04))));
- round(e, f, g, h, a, b, c, d, sum(set(0x2de92c6ful), inc(w04, sigma1(w02), sigma0(w05))));
- round(d, e, f, g, h, a, b, c, sum(set(0x4a7484aaul), inc(w05, sigma1(w03), sigma0(w06))));
- round(c, d, e, f, g, h, a, b, sum(set(0x5cb0a9dcul), inc(w06, sigma1(w04), set(0x100ul), sigma0(w07))));
- round(b, c, d, e, f, g, h, a, sum(set(0x76f988daul), inc(w07, sigma1(w05), w00, set(0x11002000ul))));
- round(a, b, c, d, e, f, g, h, sum(set(0x983e5152ul), w08 = sum(set(0x80000000ul), sigma1(w06), w01)));
- round(h, a, b, c, d, e, f, g, sum(set(0xa831c66dul), w09 = sum(sigma1(w07), w02)));
- round(g, h, a, b, c, d, e, f, sum(set(0xb00327c8ul), w10 = sum(sigma1(w08), w03)));
- round(f, g, h, a, b, c, d, e, sum(set(0xbf597fc7ul), w11 = sum(sigma1(w09), w04)));
- round(e, f, g, h, a, b, c, d, sum(set(0xc6e00bf3ul), w12 = sum(sigma1(w10), w05)));
- round(d, e, f, g, h, a, b, c, sum(set(0xd5a79147ul), w13 = sum(sigma1(w11), w06)));
- round(c, d, e, f, g, h, a, b, sum(set(0x06ca6351ul), w14 = sum(sigma1(w12), w07, set(0x400022ul))));
- round(b, c, d, e, f, g, h, a, sum(set(0x14292967ul), w15 = sum(set(0x100ul), sigma1(w13), w08, sigma0(w00))));
- round(a, b, c, d, e, f, g, h, sum(set(0x27b70a85ul), inc(w00, sigma1(w14), w09, sigma0(w01))));
- round(h, a, b, c, d, e, f, g, sum(set(0x2e1b2138ul), inc(w01, sigma1(w15), w10, sigma0(w02))));
- round(g, h, a, b, c, d, e, f, sum(set(0x4d2c6dfcul), inc(w02, sigma1(w00), w11, sigma0(w03))));
- round(f, g, h, a, b, c, d, e, sum(set(0x53380d13ul), inc(w03, sigma1(w01), w12, sigma0(w04))));
- round(e, f, g, h, a, b, c, d, sum(set(0x650a7354ul), inc(w04, sigma1(w02), w13, sigma0(w05))));
- round(d, e, f, g, h, a, b, c, sum(set(0x766a0abbul), inc(w05, sigma1(w03), w14, sigma0(w06))));
- round(c, d, e, f, g, h, a, b, sum(set(0x81c2c92eul), inc(w06, sigma1(w04), w15, sigma0(w07))));
- round(b, c, d, e, f, g, h, a, sum(set(0x92722c85ul), inc(w07, sigma1(w05), w00, sigma0(w08))));
- round(a, b, c, d, e, f, g, h, sum(set(0xa2bfe8a1ul), inc(w08, sigma1(w06), w01, sigma0(w09))));
- round(h, a, b, c, d, e, f, g, sum(set(0xa81a664bul), inc(w09, sigma1(w07), w02, sigma0(w10))));
- round(g, h, a, b, c, d, e, f, sum(set(0xc24b8b70ul), inc(w10, sigma1(w08), w03, sigma0(w11))));
- round(f, g, h, a, b, c, d, e, sum(set(0xc76c51a3ul), inc(w11, sigma1(w09), w04, sigma0(w12))));
- round(e, f, g, h, a, b, c, d, sum(set(0xd192e819ul), inc(w12, sigma1(w10), w05, sigma0(w13))));
- round(d, e, f, g, h, a, b, c, sum(set(0xd6990624ul), inc(w13, sigma1(w11), w06, sigma0(w14))));
- round(c, d, e, f, g, h, a, b, sum(set(0xf40e3585ul), inc(w14, sigma1(w12), w07, sigma0(w15))));
- round(b, c, d, e, f, g, h, a, sum(set(0x106aa070ul), inc(w15, sigma1(w13), w08, sigma0(w00))));
- round(a, b, c, d, e, f, g, h, sum(set(0x19a4c116ul), inc(w00, sigma1(w14), w09, sigma0(w01))));
- round(h, a, b, c, d, e, f, g, sum(set(0x1e376c08ul), inc(w01, sigma1(w15), w10, sigma0(w02))));
- round(g, h, a, b, c, d, e, f, sum(set(0x2748774cul), inc(w02, sigma1(w00), w11, sigma0(w03))));
- round(f, g, h, a, b, c, d, e, sum(set(0x34b0bcb5ul), inc(w03, sigma1(w01), w12, sigma0(w04))));
- round(e, f, g, h, a, b, c, d, sum(set(0x391c0cb3ul), inc(w04, sigma1(w02), w13, sigma0(w05))));
- round(d, e, f, g, h, a, b, c, sum(set(0x4ed8aa4aul), inc(w05, sigma1(w03), w14, sigma0(w06))));
- round(c, d, e, f, g, h, a, b, sum(set(0x5b9cca4ful), inc(w06, sigma1(w04), w15, sigma0(w07))));
- round(b, c, d, e, f, g, h, a, sum(set(0x682e6ff3ul), inc(w07, sigma1(w05), w00, sigma0(w08))));
- round(a, b, c, d, e, f, g, h, sum(set(0x748f82eeul), inc(w08, sigma1(w06), w01, sigma0(w09))));
- round(h, a, b, c, d, e, f, g, sum(set(0x78a5636ful), inc(w09, sigma1(w07), w02, sigma0(w10))));
- round(g, h, a, b, c, d, e, f, sum(set(0x84c87814ul), inc(w10, sigma1(w08), w03, sigma0(w11))));
- round(f, g, h, a, b, c, d, e, sum(set(0x8cc70208ul), inc(w11, sigma1(w09), w04, sigma0(w12))));
- round(e, f, g, h, a, b, c, d, sum(set(0x90befffaul), inc(w12, sigma1(w10), w05, sigma0(w13))));
- round(d, e, f, g, h, a, b, c, sum(set(0xa4506cebul), inc(w13, sigma1(w11), w06, sigma0(w14))));
- round(c, d, e, f, g, h, a, b, sum(set(0xbef9a3f7ul), w14, sigma1(w12), w07, sigma0(w15)));
- round(b, c, d, e, f, g, h, a, sum(set(0xc67178f2ul), w15, sigma1(w13), w08, sigma0(w00)));
-
- // Output.
- write4< 0>(out, sum(a, set(0x6a09e667ul)));
- write4< 4>(out, sum(b, set(0xbb67ae85ul)));
- write4< 8>(out, sum(c, set(0x3c6ef372ul)));
- write4<12>(out, sum(d, set(0xa54ff53aul)));
- write4<16>(out, sum(e, set(0x510e527ful)));
- write4<20>(out, sum(f, set(0x9b05688cul)));
- write4<24>(out, sum(g, set(0x1f83d9abul)));
- write4<28>(out, sum(h, set(0x5be0cd19ul)));
-}
-
-#endif // HAVE_XCPU
-
-#endif // DISABLED
-
-} // namespace sha256
-} // namespace system
-} // namespace libbitcoin
diff --git a/src/hash/vectorization/sha256_8_avx2.cpp b/src/hash/vectorization/sha256_8_avx2.cpp
deleted file mode 100644
index 6c701981b9..0000000000
--- a/src/hash/vectorization/sha256_8_avx2.cpp
+++ /dev/null
@@ -1,439 +0,0 @@
-// Based on:
-// sha256-x86.c - Intel SHA extensions using C intrinsics
-// Written and place in public domain by Jeffrey Walton
-// Based on code from Intel, and by Sean Gulley for the miTLS project.
-
-#include
-#include
-#include
-
-namespace libbitcoin {
-namespace system {
-namespace sha256 {
-
-#if defined (DISABLED)
-
-#if !defined(HAVE_XCPU)
-
-void merkle_avx2(digest8& out, const block8& blocks) NOEXCEPT
-{
- BC_ASSERT_MSG(false, "merkle_avx2 undefined");
-}
-
-#else
-
-namespace i256 {
-
-using xint256_t = __m256i;
-
-template
-uint32_t get(xint256_t a) noexcept
-{
- return _mm256_extract_epi32(a, Offset);
-}
-
-xint256_t set(uint32_t a) noexcept
-{
- return _mm256_set1_epi32(a);
-}
-
-xint256_t set(uint32_t a, uint32_t b, uint32_t c, uint32_t d,
- uint32_t e, uint32_t f, uint32_t g, uint32_t h) noexcept
-{
- return _mm256_set_epi32(a, b, c, d, e, f, g, h);
-}
-
-xint256_t shuffle(xint256_t a, xint256_t b) noexcept
-{
- return _mm256_shuffle_epi8(a, b);
-}
-
-xint256_t sum(xint256_t a, xint256_t b) noexcept
-{
- return _mm256_add_epi32(a, b);
-}
-
-xint256_t sum(xint256_t a, xint256_t b, xint256_t c) noexcept
-{
- return sum(sum(a, b), c);
-}
-
-xint256_t sum(xint256_t a, xint256_t b, xint256_t c,
- xint256_t d) noexcept
-{
- return sum(sum(a, b), sum(c, d));
-}
-
-xint256_t sum(xint256_t a, xint256_t b, xint256_t c, xint256_t d,
- xint256_t e) noexcept
-{
- return sum(sum(a, b, c), sum(d, e));
-}
-
-xint256_t inc(xint256_t& outa, xint256_t b) noexcept
-{
- return ((outa = sum(outa, b)));
-}
-
-xint256_t inc(xint256_t& outa, xint256_t b, xint256_t c) noexcept
-{
- return ((outa = sum(outa, b, c)));
-}
-
-xint256_t inc(xint256_t& outa, xint256_t b, xint256_t c,
- xint256_t d) noexcept
-{
- return ((outa = sum(outa, b, c, d)));
-}
-
-xint256_t exc(xint256_t a, xint256_t b) noexcept
-{
- return _mm256_xor_si256(a, b);
-}
-
-xint256_t exc(xint256_t a, xint256_t b, xint256_t c) noexcept
-{
- return exc(exc(a, b), c);
-}
-
-xint256_t dis(xint256_t a, xint256_t b) noexcept
-{
- return _mm256_or_si256(a, b);
-}
-
-xint256_t con(xint256_t a, xint256_t b) noexcept
-{
- return _mm256_and_si256(a, b);
-}
-
-xint256_t shr(xint256_t a, uint32_t bits) noexcept
-{
- return _mm256_srli_epi32(a, bits);
-}
-
-xint256_t shl(xint256_t a, uint32_t bits) noexcept
-{
- return _mm256_slli_epi32(a, bits);
-}
-
-} // namespace i256
-
-using namespace i256;
-
-xint256_t inline SIGMA0(xint256_t x) NOEXCEPT { return exc(dis(shr(x, 2), shl(x, 30)), dis(shr(x, 13), shl(x, 19)), dis(shr(x, 22), shl(x, 10))); }
-xint256_t inline SIGMA1(xint256_t x) NOEXCEPT { return exc(dis(shr(x, 6), shl(x, 26)), dis(shr(x, 11), shl(x, 21)), dis(shr(x, 25), shl(x, 7))); }
-xint256_t inline sigma0(xint256_t x) NOEXCEPT { return exc(dis(shr(x, 7), shl(x, 25)), dis(shr(x, 18), shl(x, 14)), shr(x, 3)); }
-xint256_t inline sigma1(xint256_t x) NOEXCEPT { return exc(dis(shr(x, 17), shl(x, 15)), dis(shr(x, 19), shl(x, 13)), shr(x, 10)); }
-xint256_t inline choice( xint256_t x, xint256_t y, xint256_t z) NOEXCEPT { return exc(z, con(x, exc(y, z))); }
-xint256_t inline majority(xint256_t x, xint256_t y, xint256_t z) NOEXCEPT { return dis(con(x, y), con(z, dis(x, y))); }
-
-void inline round(xint256_t a, xint256_t b, xint256_t c, xint256_t& d,
- xint256_t e, xint256_t f, xint256_t g, xint256_t& h, xint256_t k) NOEXCEPT
-{
- const auto t1 = sum(h, SIGMA1(e), choice(e, f, g), k);
- const auto t2 = sum( SIGMA0(a), majority(a, b, c));
- d = sum(d, t1);
- h = sum(t1, t2);
-}
-
-template
-xint256_t inline read8(const block8& blocks) NOEXCEPT
-{
- constexpr auto four = sizeof(uint32_t);
- BC_PUSH_WARNING(NO_ARRAY_INDEXING)
- const auto value = set(
- from_little_endian(array_cast(blocks[0])),
- from_little_endian(array_cast(blocks[1])),
- from_little_endian(array_cast(blocks[2])),
- from_little_endian(array_cast(blocks[3])),
- from_little_endian(array_cast(blocks[4])),
- from_little_endian(array_cast(blocks[5])),
- from_little_endian(array_cast(blocks[6])),
- from_little_endian(array_cast(blocks[7])));
- BC_POP_WARNING()
-
- return shuffle(value, set(
- 0x0c0d0e0ful, 0x08090a0bul, 0x04050607ul, 0x00010203ul,
- 0x0c0d0e0ful, 0x08090a0bul, 0x04050607ul, 0x00010203ul));
-}
-
-template
-void inline write8(digest8& hashes, xint256_t value) NOEXCEPT
-{
- value = shuffle(value, set(
- 0x0c0d0e0ful, 0x08090a0bul, 0x04050607ul, 0x00010203ul,
- 0x0c0d0e0ful, 0x08090a0bul, 0x04050607ul, 0x00010203ul));
-
- constexpr auto four = sizeof(uint32_t);
- BC_PUSH_WARNING(NO_ARRAY_INDEXING)
- array_cast(hashes[0]) = to_little_endian(get<7>(value));
- array_cast(hashes[1]) = to_little_endian(get<6>(value));
- array_cast(hashes[2]) = to_little_endian(get<5>(value));
- array_cast(hashes[3]) = to_little_endian(get<4>(value));
- array_cast(hashes[4]) = to_little_endian(get<3>(value));
- array_cast(hashes[5]) = to_little_endian(get<2>(value));
- array_cast(hashes[6]) = to_little_endian(get<1>(value));
- array_cast(hashes[7]) = to_little_endian(get<0>(value));
- BC_POP_WARNING()
-}
-
-// Eight blocks in eight lanes, doubled.
-void merkle_avx2(digest8& out, const block8& blocks) NOEXCEPT
-{
- // Transform 1.
- auto a = set(0x6a09e667ul);
- auto b = set(0xbb67ae85ul);
- auto c = set(0x3c6ef372ul);
- auto d = set(0xa54ff53aul);
- auto e = set(0x510e527ful);
- auto f = set(0x9b05688cul);
- auto g = set(0x1f83d9abul);
- auto h = set(0x5be0cd19ul);
-
- xint256_t w00, w01, w02, w03, w04, w05, w06, w07;
- xint256_t w08, w09, w10, w11, w12, w13, w14, w15;
-
- round(a, b, c, d, e, f, g, h, sum(set(0x428a2f98ul), w00 = read8< 0>(blocks)));
- round(h, a, b, c, d, e, f, g, sum(set(0x71374491ul), w01 = read8< 4>(blocks)));
- round(g, h, a, b, c, d, e, f, sum(set(0xb5c0fbcful), w02 = read8< 8>(blocks)));
- round(f, g, h, a, b, c, d, e, sum(set(0xe9b5dba5ul), w03 = read8<12>(blocks)));
- round(e, f, g, h, a, b, c, d, sum(set(0x3956c25bul), w04 = read8<16>(blocks)));
- round(d, e, f, g, h, a, b, c, sum(set(0x59f111f1ul), w05 = read8<20>(blocks)));
- round(c, d, e, f, g, h, a, b, sum(set(0x923f82a4ul), w06 = read8<24>(blocks)));
- round(b, c, d, e, f, g, h, a, sum(set(0xab1c5ed5ul), w07 = read8<28>(blocks)));
- round(a, b, c, d, e, f, g, h, sum(set(0xd807aa98ul), w08 = read8<32>(blocks)));
- round(h, a, b, c, d, e, f, g, sum(set(0x12835b01ul), w09 = read8<36>(blocks)));
- round(g, h, a, b, c, d, e, f, sum(set(0x243185beul), w10 = read8<40>(blocks)));
- round(f, g, h, a, b, c, d, e, sum(set(0x550c7dc3ul), w11 = read8<44>(blocks)));
- round(e, f, g, h, a, b, c, d, sum(set(0x72be5d74ul), w12 = read8<48>(blocks)));
- round(d, e, f, g, h, a, b, c, sum(set(0x80deb1feul), w13 = read8<52>(blocks)));
- round(c, d, e, f, g, h, a, b, sum(set(0x9bdc06a7ul), w14 = read8<56>(blocks)));
- round(b, c, d, e, f, g, h, a, sum(set(0xc19bf174ul), w15 = read8<60>(blocks)));
- round(a, b, c, d, e, f, g, h, sum(set(0xe49b69c1ul), inc(w00, sigma1(w14), w09, sigma0(w01))));
- round(h, a, b, c, d, e, f, g, sum(set(0xefbe4786ul), inc(w01, sigma1(w15), w10, sigma0(w02))));
- round(g, h, a, b, c, d, e, f, sum(set(0x0fc19dc6ul), inc(w02, sigma1(w00), w11, sigma0(w03))));
- round(f, g, h, a, b, c, d, e, sum(set(0x240ca1ccul), inc(w03, sigma1(w01), w12, sigma0(w04))));
- round(e, f, g, h, a, b, c, d, sum(set(0x2de92c6ful), inc(w04, sigma1(w02), w13, sigma0(w05))));
- round(d, e, f, g, h, a, b, c, sum(set(0x4a7484aaul), inc(w05, sigma1(w03), w14, sigma0(w06))));
- round(c, d, e, f, g, h, a, b, sum(set(0x5cb0a9dcul), inc(w06, sigma1(w04), w15, sigma0(w07))));
- round(b, c, d, e, f, g, h, a, sum(set(0x76f988daul), inc(w07, sigma1(w05), w00, sigma0(w08))));
- round(a, b, c, d, e, f, g, h, sum(set(0x983e5152ul), inc(w08, sigma1(w06), w01, sigma0(w09))));
- round(h, a, b, c, d, e, f, g, sum(set(0xa831c66dul), inc(w09, sigma1(w07), w02, sigma0(w10))));
- round(g, h, a, b, c, d, e, f, sum(set(0xb00327c8ul), inc(w10, sigma1(w08), w03, sigma0(w11))));
- round(f, g, h, a, b, c, d, e, sum(set(0xbf597fc7ul), inc(w11, sigma1(w09), w04, sigma0(w12))));
- round(e, f, g, h, a, b, c, d, sum(set(0xc6e00bf3ul), inc(w12, sigma1(w10), w05, sigma0(w13))));
- round(d, e, f, g, h, a, b, c, sum(set(0xd5a79147ul), inc(w13, sigma1(w11), w06, sigma0(w14))));
- round(c, d, e, f, g, h, a, b, sum(set(0x06ca6351ul), inc(w14, sigma1(w12), w07, sigma0(w15))));
- round(b, c, d, e, f, g, h, a, sum(set(0x14292967ul), inc(w15, sigma1(w13), w08, sigma0(w00))));
- round(a, b, c, d, e, f, g, h, sum(set(0x27b70a85ul), inc(w00, sigma1(w14), w09, sigma0(w01))));
- round(h, a, b, c, d, e, f, g, sum(set(0x2e1b2138ul), inc(w01, sigma1(w15), w10, sigma0(w02))));
- round(g, h, a, b, c, d, e, f, sum(set(0x4d2c6dfcul), inc(w02, sigma1(w00), w11, sigma0(w03))));
- round(f, g, h, a, b, c, d, e, sum(set(0x53380d13ul), inc(w03, sigma1(w01), w12, sigma0(w04))));
- round(e, f, g, h, a, b, c, d, sum(set(0x650a7354ul), inc(w04, sigma1(w02), w13, sigma0(w05))));
- round(d, e, f, g, h, a, b, c, sum(set(0x766a0abbul), inc(w05, sigma1(w03), w14, sigma0(w06))));
- round(c, d, e, f, g, h, a, b, sum(set(0x81c2c92eul), inc(w06, sigma1(w04), w15, sigma0(w07))));
- round(b, c, d, e, f, g, h, a, sum(set(0x92722c85ul), inc(w07, sigma1(w05), w00, sigma0(w08))));
- round(a, b, c, d, e, f, g, h, sum(set(0xa2bfe8a1ul), inc(w08, sigma1(w06), w01, sigma0(w09))));
- round(h, a, b, c, d, e, f, g, sum(set(0xa81a664bul), inc(w09, sigma1(w07), w02, sigma0(w10))));
- round(g, h, a, b, c, d, e, f, sum(set(0xc24b8b70ul), inc(w10, sigma1(w08), w03, sigma0(w11))));
- round(f, g, h, a, b, c, d, e, sum(set(0xc76c51a3ul), inc(w11, sigma1(w09), w04, sigma0(w12))));
- round(e, f, g, h, a, b, c, d, sum(set(0xd192e819ul), inc(w12, sigma1(w10), w05, sigma0(w13))));
- round(d, e, f, g, h, a, b, c, sum(set(0xd6990624ul), inc(w13, sigma1(w11), w06, sigma0(w14))));
- round(c, d, e, f, g, h, a, b, sum(set(0xf40e3585ul), inc(w14, sigma1(w12), w07, sigma0(w15))));
- round(b, c, d, e, f, g, h, a, sum(set(0x106aa070ul), inc(w15, sigma1(w13), w08, sigma0(w00))));
- round(a, b, c, d, e, f, g, h, sum(set(0x19a4c116ul), inc(w00, sigma1(w14), w09, sigma0(w01))));
- round(h, a, b, c, d, e, f, g, sum(set(0x1e376c08ul), inc(w01, sigma1(w15), w10, sigma0(w02))));
- round(g, h, a, b, c, d, e, f, sum(set(0x2748774cul), inc(w02, sigma1(w00), w11, sigma0(w03))));
- round(f, g, h, a, b, c, d, e, sum(set(0x34b0bcb5ul), inc(w03, sigma1(w01), w12, sigma0(w04))));
- round(e, f, g, h, a, b, c, d, sum(set(0x391c0cb3ul), inc(w04, sigma1(w02), w13, sigma0(w05))));
- round(d, e, f, g, h, a, b, c, sum(set(0x4ed8aa4aul), inc(w05, sigma1(w03), w14, sigma0(w06))));
- round(c, d, e, f, g, h, a, b, sum(set(0x5b9cca4ful), inc(w06, sigma1(w04), w15, sigma0(w07))));
- round(b, c, d, e, f, g, h, a, sum(set(0x682e6ff3ul), inc(w07, sigma1(w05), w00, sigma0(w08))));
- round(a, b, c, d, e, f, g, h, sum(set(0x748f82eeul), inc(w08, sigma1(w06), w01, sigma0(w09))));
- round(h, a, b, c, d, e, f, g, sum(set(0x78a5636ful), inc(w09, sigma1(w07), w02, sigma0(w10))));
- round(g, h, a, b, c, d, e, f, sum(set(0x84c87814ul), inc(w10, sigma1(w08), w03, sigma0(w11))));
- round(f, g, h, a, b, c, d, e, sum(set(0x8cc70208ul), inc(w11, sigma1(w09), w04, sigma0(w12))));
- round(e, f, g, h, a, b, c, d, sum(set(0x90befffaul), inc(w12, sigma1(w10), w05, sigma0(w13))));
- round(d, e, f, g, h, a, b, c, sum(set(0xa4506cebul), inc(w13, sigma1(w11), w06, sigma0(w14))));
- round(c, d, e, f, g, h, a, b, sum(set(0xbef9a3f7ul), inc(w14, sigma1(w12), w07, sigma0(w15))));
- round(b, c, d, e, f, g, h, a, sum(set(0xc67178f2ul), inc(w15, sigma1(w13), w08, sigma0(w00))));
-
- a = sum(a, set(0x6a09e667ul));
- b = sum(b, set(0xbb67ae85ul));
- c = sum(c, set(0x3c6ef372ul));
- d = sum(d, set(0xa54ff53aul));
- e = sum(e, set(0x510e527ful));
- f = sum(f, set(0x9b05688cul));
- g = sum(g, set(0x1f83d9abul));
- h = sum(h, set(0x5be0cd19ul));
-
- const xint256_t t0 = a, t1 = b, t2 = c, t3 = d, t4 = e, t5 = f, t6 = g, t7 = h;
-
- // Transform 2.
- round(a, b, c, d, e, f, g, h, set(0xc28a2f98ul));
- round(h, a, b, c, d, e, f, g, set(0x71374491ul));
- round(g, h, a, b, c, d, e, f, set(0xb5c0fbcful));
- round(f, g, h, a, b, c, d, e, set(0xe9b5dba5ul));
- round(e, f, g, h, a, b, c, d, set(0x3956c25bul));
- round(d, e, f, g, h, a, b, c, set(0x59f111f1ul));
- round(c, d, e, f, g, h, a, b, set(0x923f82a4ul));
- round(b, c, d, e, f, g, h, a, set(0xab1c5ed5ul));
- round(a, b, c, d, e, f, g, h, set(0xd807aa98ul));
- round(h, a, b, c, d, e, f, g, set(0x12835b01ul));
- round(g, h, a, b, c, d, e, f, set(0x243185beul));
- round(f, g, h, a, b, c, d, e, set(0x550c7dc3ul));
- round(e, f, g, h, a, b, c, d, set(0x72be5d74ul));
- round(d, e, f, g, h, a, b, c, set(0x80deb1feul));
- round(c, d, e, f, g, h, a, b, set(0x9bdc06a7ul));
- round(b, c, d, e, f, g, h, a, set(0xc19bf374ul));
- round(a, b, c, d, e, f, g, h, set(0x649b69c1ul));
- round(h, a, b, c, d, e, f, g, set(0xf0fe4786ul));
- round(g, h, a, b, c, d, e, f, set(0x0fe1edc6ul));
- round(f, g, h, a, b, c, d, e, set(0x240cf254ul));
- round(e, f, g, h, a, b, c, d, set(0x4fe9346ful));
- round(d, e, f, g, h, a, b, c, set(0x6cc984beul));
- round(c, d, e, f, g, h, a, b, set(0x61b9411eul));
- round(b, c, d, e, f, g, h, a, set(0x16f988faul));
- round(a, b, c, d, e, f, g, h, set(0xf2c65152ul));
- round(h, a, b, c, d, e, f, g, set(0xa88e5a6dul));
- round(g, h, a, b, c, d, e, f, set(0xb019fc65ul));
- round(f, g, h, a, b, c, d, e, set(0xb9d99ec7ul));
- round(e, f, g, h, a, b, c, d, set(0x9a1231c3ul));
- round(d, e, f, g, h, a, b, c, set(0xe70eeaa0ul));
- round(c, d, e, f, g, h, a, b, set(0xfdb1232bul));
- round(b, c, d, e, f, g, h, a, set(0xc7353eb0ul));
- round(a, b, c, d, e, f, g, h, set(0x3069bad5ul));
- round(h, a, b, c, d, e, f, g, set(0xcb976d5ful));
- round(g, h, a, b, c, d, e, f, set(0x5a0f118ful));
- round(f, g, h, a, b, c, d, e, set(0xdc1eeefdul));
- round(e, f, g, h, a, b, c, d, set(0x0a35b689ul));
- round(d, e, f, g, h, a, b, c, set(0xde0b7a04ul));
- round(c, d, e, f, g, h, a, b, set(0x58f4ca9dul));
- round(b, c, d, e, f, g, h, a, set(0xe15d5b16ul));
- round(a, b, c, d, e, f, g, h, set(0x007f3e86ul));
- round(h, a, b, c, d, e, f, g, set(0x37088980ul));
- round(g, h, a, b, c, d, e, f, set(0xa507ea32ul));
- round(f, g, h, a, b, c, d, e, set(0x6fab9537ul));
- round(e, f, g, h, a, b, c, d, set(0x17406110ul));
- round(d, e, f, g, h, a, b, c, set(0x0d8cd6f1ul));
- round(c, d, e, f, g, h, a, b, set(0xcdaa3b6dul));
- round(b, c, d, e, f, g, h, a, set(0xc0bbbe37ul));
- round(a, b, c, d, e, f, g, h, set(0x83613bdaul));
- round(h, a, b, c, d, e, f, g, set(0xdb48a363ul));
- round(g, h, a, b, c, d, e, f, set(0x0b02e931ul));
- round(f, g, h, a, b, c, d, e, set(0x6fd15ca7ul));
- round(e, f, g, h, a, b, c, d, set(0x521afacaul));
- round(d, e, f, g, h, a, b, c, set(0x31338431ul));
- round(c, d, e, f, g, h, a, b, set(0x6ed41a95ul));
- round(b, c, d, e, f, g, h, a, set(0x6d437890ul));
- round(a, b, c, d, e, f, g, h, set(0xc39c91f2ul));
- round(h, a, b, c, d, e, f, g, set(0x9eccabbdul));
- round(g, h, a, b, c, d, e, f, set(0xb5c9a0e6ul));
- round(f, g, h, a, b, c, d, e, set(0x532fb63cul));
- round(e, f, g, h, a, b, c, d, set(0xd2c741c6ul));
- round(d, e, f, g, h, a, b, c, set(0x07237ea3ul));
- round(c, d, e, f, g, h, a, b, set(0xa4954b68ul));
- round(b, c, d, e, f, g, h, a, set(0x4c191d76ul));
-
- w00 = sum(t0, a);
- w01 = sum(t1, b);
- w02 = sum(t2, c);
- w03 = sum(t3, d);
- w04 = sum(t4, e);
- w05 = sum(t5, f);
- w06 = sum(t6, g);
- w07 = sum(t7, h);
-
- // Transform 3.
- a = set(0x6a09e667ul);
- b = set(0xbb67ae85ul);
- c = set(0x3c6ef372ul);
- d = set(0xa54ff53aul);
- e = set(0x510e527ful);
- f = set(0x9b05688cul);
- g = set(0x1f83d9abul);
- h = set(0x5be0cd19ul);
-
- round(a, b, c, d, e, f, g, h, sum(set(0x428a2f98ul), w00));
- round(h, a, b, c, d, e, f, g, sum(set(0x71374491ul), w01));
- round(g, h, a, b, c, d, e, f, sum(set(0xb5c0fbcful), w02));
- round(f, g, h, a, b, c, d, e, sum(set(0xe9b5dba5ul), w03));
- round(e, f, g, h, a, b, c, d, sum(set(0x3956c25bul), w04));
- round(d, e, f, g, h, a, b, c, sum(set(0x59f111f1ul), w05));
- round(c, d, e, f, g, h, a, b, sum(set(0x923f82a4ul), w06));
- round(b, c, d, e, f, g, h, a, sum(set(0xab1c5ed5ul), w07));
- round(a, b, c, d, e, f, g, h, set(0x5807aa98ul));
- round(h, a, b, c, d, e, f, g, set(0x12835b01ul));
- round(g, h, a, b, c, d, e, f, set(0x243185beul));
- round(f, g, h, a, b, c, d, e, set(0x550c7dc3ul));
- round(e, f, g, h, a, b, c, d, set(0x72be5d74ul));
- round(d, e, f, g, h, a, b, c, set(0x80deb1feul));
- round(c, d, e, f, g, h, a, b, set(0x9bdc06a7ul));
- round(b, c, d, e, f, g, h, a, set(0xc19bf274ul));
- round(a, b, c, d, e, f, g, h, sum(set(0xe49b69c1ul), inc(w00, sigma0(w01))));
- round(h, a, b, c, d, e, f, g, sum(set(0xefbe4786ul), inc(w01, set(0xa00000ul), sigma0(w02))));
- round(g, h, a, b, c, d, e, f, sum(set(0x0fc19dc6ul), inc(w02, sigma1(w00), sigma0(w03))));
- round(f, g, h, a, b, c, d, e, sum(set(0x240ca1ccul), inc(w03, sigma1(w01), sigma0(w04))));
- round(e, f, g, h, a, b, c, d, sum(set(0x2de92c6ful), inc(w04, sigma1(w02), sigma0(w05))));
- round(d, e, f, g, h, a, b, c, sum(set(0x4a7484aaul), inc(w05, sigma1(w03), sigma0(w06))));
- round(c, d, e, f, g, h, a, b, sum(set(0x5cb0a9dcul), inc(w06, sigma1(w04), set(0x100ul), sigma0(w07))));
- round(b, c, d, e, f, g, h, a, sum(set(0x76f988daul), inc(w07, sigma1(w05), w00, set(0x11002000ul))));
- round(a, b, c, d, e, f, g, h, sum(set(0x983e5152ul), w08 = sum(set(0x80000000ul), sigma1(w06), w01)));
- round(h, a, b, c, d, e, f, g, sum(set(0xa831c66dul), w09 = sum(sigma1(w07), w02)));
- round(g, h, a, b, c, d, e, f, sum(set(0xb00327c8ul), w10 = sum(sigma1(w08), w03)));
- round(f, g, h, a, b, c, d, e, sum(set(0xbf597fc7ul), w11 = sum(sigma1(w09), w04)));
- round(e, f, g, h, a, b, c, d, sum(set(0xc6e00bf3ul), w12 = sum(sigma1(w10), w05)));
- round(d, e, f, g, h, a, b, c, sum(set(0xd5a79147ul), w13 = sum(sigma1(w11), w06)));
- round(c, d, e, f, g, h, a, b, sum(set(0x06ca6351ul), w14 = sum(sigma1(w12), w07, set(0x400022ul))));
- round(b, c, d, e, f, g, h, a, sum(set(0x14292967ul), w15 = sum(set(0x100ul), sigma1(w13), w08, sigma0(w00))));
- round(a, b, c, d, e, f, g, h, sum(set(0x27b70a85ul), inc(w00, sigma1(w14), w09, sigma0(w01))));
- round(h, a, b, c, d, e, f, g, sum(set(0x2e1b2138ul), inc(w01, sigma1(w15), w10, sigma0(w02))));
- round(g, h, a, b, c, d, e, f, sum(set(0x4d2c6dfcul), inc(w02, sigma1(w00), w11, sigma0(w03))));
- round(f, g, h, a, b, c, d, e, sum(set(0x53380d13ul), inc(w03, sigma1(w01), w12, sigma0(w04))));
- round(e, f, g, h, a, b, c, d, sum(set(0x650a7354ul), inc(w04, sigma1(w02), w13, sigma0(w05))));
- round(d, e, f, g, h, a, b, c, sum(set(0x766a0abbul), inc(w05, sigma1(w03), w14, sigma0(w06))));
- round(c, d, e, f, g, h, a, b, sum(set(0x81c2c92eul), inc(w06, sigma1(w04), w15, sigma0(w07))));
- round(b, c, d, e, f, g, h, a, sum(set(0x92722c85ul), inc(w07, sigma1(w05), w00, sigma0(w08))));
- round(a, b, c, d, e, f, g, h, sum(set(0xa2bfe8a1ul), inc(w08, sigma1(w06), w01, sigma0(w09))));
- round(h, a, b, c, d, e, f, g, sum(set(0xa81a664bul), inc(w09, sigma1(w07), w02, sigma0(w10))));
- round(g, h, a, b, c, d, e, f, sum(set(0xc24b8b70ul), inc(w10, sigma1(w08), w03, sigma0(w11))));
- round(f, g, h, a, b, c, d, e, sum(set(0xc76c51a3ul), inc(w11, sigma1(w09), w04, sigma0(w12))));
- round(e, f, g, h, a, b, c, d, sum(set(0xd192e819ul), inc(w12, sigma1(w10), w05, sigma0(w13))));
- round(d, e, f, g, h, a, b, c, sum(set(0xd6990624ul), inc(w13, sigma1(w11), w06, sigma0(w14))));
- round(c, d, e, f, g, h, a, b, sum(set(0xf40e3585ul), inc(w14, sigma1(w12), w07, sigma0(w15))));
- round(b, c, d, e, f, g, h, a, sum(set(0x106aa070ul), inc(w15, sigma1(w13), w08, sigma0(w00))));
- round(a, b, c, d, e, f, g, h, sum(set(0x19a4c116ul), inc(w00, sigma1(w14), w09, sigma0(w01))));
- round(h, a, b, c, d, e, f, g, sum(set(0x1e376c08ul), inc(w01, sigma1(w15), w10, sigma0(w02))));
- round(g, h, a, b, c, d, e, f, sum(set(0x2748774cul), inc(w02, sigma1(w00), w11, sigma0(w03))));
- round(f, g, h, a, b, c, d, e, sum(set(0x34b0bcb5ul), inc(w03, sigma1(w01), w12, sigma0(w04))));
- round(e, f, g, h, a, b, c, d, sum(set(0x391c0cb3ul), inc(w04, sigma1(w02), w13, sigma0(w05))));
- round(d, e, f, g, h, a, b, c, sum(set(0x4ed8aa4aul), inc(w05, sigma1(w03), w14, sigma0(w06))));
- round(c, d, e, f, g, h, a, b, sum(set(0x5b9cca4ful), inc(w06, sigma1(w04), w15, sigma0(w07))));
- round(b, c, d, e, f, g, h, a, sum(set(0x682e6ff3ul), inc(w07, sigma1(w05), w00, sigma0(w08))));
- round(a, b, c, d, e, f, g, h, sum(set(0x748f82eeul), inc(w08, sigma1(w06), w01, sigma0(w09))));
- round(h, a, b, c, d, e, f, g, sum(set(0x78a5636ful), inc(w09, sigma1(w07), w02, sigma0(w10))));
- round(g, h, a, b, c, d, e, f, sum(set(0x84c87814ul), inc(w10, sigma1(w08), w03, sigma0(w11))));
- round(f, g, h, a, b, c, d, e, sum(set(0x8cc70208ul), inc(w11, sigma1(w09), w04, sigma0(w12))));
- round(e, f, g, h, a, b, c, d, sum(set(0x90befffaul), inc(w12, sigma1(w10), w05, sigma0(w13))));
- round(d, e, f, g, h, a, b, c, sum(set(0xa4506cebul), inc(w13, sigma1(w11), w06, sigma0(w14))));
- round(c, d, e, f, g, h, a, b, sum(set(0xbef9a3f7ul), w14, sigma1(w12), w07, sigma0(w15)));
- round(b, c, d, e, f, g, h, a, sum(set(0xc67178f2ul), w15, sigma1(w13), w08, sigma0(w00)));
-
- // Output.
- write8< 0>(out, sum(a, set(0x6a09e667ul)));
- write8< 4>(out, sum(b, set(0xbb67ae85ul)));
- write8< 8>(out, sum(c, set(0x3c6ef372ul)));
- write8<12>(out, sum(d, set(0xa54ff53aul)));
- write8<16>(out, sum(e, set(0x510e527ful)));
- write8<20>(out, sum(f, set(0x9b05688cul)));
- write8<24>(out, sum(g, set(0x1f83d9abul)));
- write8<28>(out, sum(h, set(0x5be0cd19ul)));
-}
-
-#endif // HAVE_XCPU
-
-#endif // DISABLED
-
-} // namespace sha256
-} // namespace system
-} // namespace libbitcoin
diff --git a/test/hash/hash.hpp b/test/hash/hash.hpp
index fc4ff29e74..f209cab12d 100644
--- a/test/hash/hash.hpp
+++ b/test/hash/hash.hpp
@@ -176,7 +176,7 @@ constexpr auto alpha2_count = 16'777'216_size;
constexpr auto long_alpha_size = alpha2_size * alpha2_count;
static const auto alpha2 = to_array("abcdefghbcdefghicdefghijdefghijkefghijklfghijklmghijklmnhijklmno");
static const std_vector long_alpha(alpha2_count, alpha2);
-static const auto long_alpha_data = pointer_cast(long_alpha.front().data());
+static const auto long_alpha_data = pointer_cast(long_alpha.front().data());
static const auto sha_6 = to_chunk(unsafe_array_cast(long_alpha_data));
constexpr auto sha_test_count = 6;
#else
diff --git a/test/hash/performance/performance.hpp b/test/hash/performance/performance.hpp
index 1973f9818f..56bfb74653 100644
--- a/test/hash/performance/performance.hpp
+++ b/test/hash/performance/performance.hpp
@@ -329,7 +329,7 @@ bool test_merkle(std::ostream& out, float ghz = 3.0f,
for (size_t seed = 0; seed < Count; ++seed)
{
constexpr auto size = array_count;
- std_vector digests{};
+ std::vector digests{};
digests.reserve(Size * two);
for (size_t blocks = 0; blocks < Size; ++blocks)