Merge pull request #1555 from evoskuil/master

Refactor sha algorithm, fix perf test drift, comments.
libbitcoin · Nov 26, 2024 · 572f3b9 · 572f3b9
2 parents 0eac043 + 70af8b1
commit 572f3b9
Show file tree

Hide file tree

Showing 19 changed files with 590 additions and 2,979 deletions.
diff --git a/Makefile.am b/Makefile.am
@@ -90,12 +90,6 @@ src_libbitcoin_system_la_SOURCES = \
     src/hash/accumulator.cpp \
     src/hash/checksum.cpp \
     src/hash/siphash.cpp \
-    src/hash/vectorization/sha256_1_native.cpp \
-    src/hash/vectorization/sha256_2_shani.cpp \
-    src/hash/vectorization/sha256_4_neon.cpp \
-    src/hash/vectorization/sha256_4_sse4.cpp \
-    src/hash/vectorization/sha256_4_sse41.cpp \
-    src/hash/vectorization/sha256_8_avx2.cpp \
     src/math/math.cpp \
     src/radix/base_10.cpp \
     src/radix/base_2048.cpp \
@@ -622,6 +616,7 @@ include_bitcoin_system_impl_hash_sha_HEADERS = \
     include/bitcoin/system/impl/hash/sha/algorithm_double.ipp \
     include/bitcoin/system/impl/hash/sha/algorithm_functions.ipp \
     include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp \
+    include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp \
     include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp \
     include/bitcoin/system/impl/hash/sha/algorithm_native.ipp \
     include/bitcoin/system/impl/hash/sha/algorithm_padding.ipp \

diff --git a/builds/cmake/CMakeLists.txt b/builds/cmake/CMakeLists.txt
@@ -529,12 +529,6 @@ add_library( ${CANONICAL_LIB_NAME}
     "../../src/hash/accumulator.cpp"
     "../../src/hash/checksum.cpp"
     "../../src/hash/siphash.cpp"
-    "../../src/hash/vectorization/sha256_1_native.cpp"
-    "../../src/hash/vectorization/sha256_2_shani.cpp"
-    "../../src/hash/vectorization/sha256_4_neon.cpp"
-    "../../src/hash/vectorization/sha256_4_sse4.cpp"
-    "../../src/hash/vectorization/sha256_4_sse41.cpp"
-    "../../src/hash/vectorization/sha256_8_avx2.cpp"
     "../../src/math/math.cpp"
     "../../src/radix/base_10.cpp"
     "../../src/radix/base_2048.cpp"

diff --git a/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj b/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj
@@ -155,12 +155,6 @@
     <ClCompile Include="..\..\..\..\src\hash\accumulator.cpp" />
     <ClCompile Include="..\..\..\..\src\hash\checksum.cpp" />
     <ClCompile Include="..\..\..\..\src\hash\siphash.cpp" />
-    <ClCompile Include="..\..\..\..\src\hash\vectorization\sha256_1_native.cpp" />
-    <ClCompile Include="..\..\..\..\src\hash\vectorization\sha256_2_shani.cpp" />
-    <ClCompile Include="..\..\..\..\src\hash\vectorization\sha256_4_neon.cpp" />
-    <ClCompile Include="..\..\..\..\src\hash\vectorization\sha256_4_sse4.cpp" />
-    <ClCompile Include="..\..\..\..\src\hash\vectorization\sha256_4_sse41.cpp" />
-    <ClCompile Include="..\..\..\..\src\hash\vectorization\sha256_8_avx2.cpp" />
     <ClCompile Include="..\..\..\..\src\math\math.cpp" />
     <ClCompile Include="..\..\..\..\src\radix\base_10.cpp" />
     <ClCompile Include="..\..\..\..\src\radix\base_2048.cpp" />
@@ -548,6 +542,7 @@
     <None Include="..\..\..\..\include\bitcoin\system\impl\hash\sha\algorithm_double.ipp" />
     <None Include="..\..\..\..\include\bitcoin\system\impl\hash\sha\algorithm_functions.ipp" />
     <None Include="..\..\..\..\include\bitcoin\system\impl\hash\sha\algorithm_iterate.ipp" />
+    <None Include="..\..\..\..\include\bitcoin\system\impl\hash\sha\algorithm_konstant.ipp" />
     <None Include="..\..\..\..\include\bitcoin\system\impl\hash\sha\algorithm_merkle.ipp" />
     <None Include="..\..\..\..\include\bitcoin\system\impl\hash\sha\algorithm_native.ipp" />
     <None Include="..\..\..\..\include\bitcoin\system\impl\hash\sha\algorithm_padding.ipp" />

diff --git a/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj.filters b/builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj.filters
diff --git a/include/bitcoin/system/hash/sha/algorithm.hpp b/include/bitcoin/system/hash/sha/algorithm.hpp
@@ -144,7 +144,8 @@ class algorithm
     /// Intrinsics types.
     /// -----------------------------------------------------------------------
 
-    /// Extended integer capacity for uint32_t/uint64_t is 2/4/8/16 only.
+    /// Expanded is a multiple of buffer/state for Lanes concurrent blocks.
+    /// Multiple blocks are "striped" across the expanded buffer in xWords.
     template <size_t Lanes, bool_if<is_valid_lanes<Lanes>> = true>
     using xblock_t = std_array<words_t, Lanes>;
 
@@ -157,6 +158,17 @@ class algorithm
     template <typename xWord, if_extended<xWord> = true>
     using xchunk_t = std_array<xWord, SHA::state_words>;
 
+    /// Wide is a cast of buffer_t to xWord for single block concurrency.
+    /// This is not multi-block or block striping, just larger words.
+    template <typename xWord, if_extended<xWord> = true>
+    using wbuffer_t = std_array<xWord, sizeof(buffer_t) / sizeof(xWord)>;
+
+    template <typename xWord, if_extended<xWord> = true>
+    using wstate_t = std_array<xWord, sizeof(state_t) / sizeof(xWord)>;
+
+    /// Other types.
+    /// -----------------------------------------------------------------------
+
     using uint = unsigned int;
     using idigests_t = mutable_iterable<digest_t>;
     using pad_t = std_array<word_t, subtract(SHA::block_words,
@@ -210,7 +222,6 @@ class algorithm
 
     template <size_t Round>
     INLINE static constexpr void prepare(auto& buffer) NOEXCEPT;
-    INLINE static constexpr void add_k(auto& buffer) NOEXCEPT;
     static constexpr void schedule_(auto& buffer) NOEXCEPT;
     static constexpr void schedule(buffer_t& buffer) NOEXCEPT;
 
@@ -242,7 +253,7 @@ class algorithm
 
     static constexpr void reinput(auto& buffer, const auto& state) NOEXCEPT;
 
-    /// Iteration.
+    /// Iteration (message scheduling vectorized for multiple blocks).
     /// -----------------------------------------------------------------------
 
     template <size_t Word, size_t Lanes>
@@ -280,7 +291,7 @@ class algorithm
         const ablocks_t<Size>& blocks) NOEXCEPT;
     INLINE static void iterate(state_t& state, iblocks_t& blocks) NOEXCEPT;
 
-    /// Merkle hashing.
+    /// Merkle hashing (fully vectorized for multiple blocks).
     /// -----------------------------------------------------------------------
 
     template <typename xWord>
@@ -311,7 +322,7 @@ class algorithm
     VCONSTEXPR static void merkle_hash_(digests_t& digests,
         size_t offset=zero) NOEXCEPT;
 
-    /// sigma0 vectorization.
+    /// sigma0 vectorization (single blocks).
     /// -----------------------------------------------------------------------
 
     template <typename xWord, if_extended<xWord> = true>
@@ -328,22 +339,45 @@ class algorithm
     INLINE static void schedule_sigma(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
     INLINE static void schedule_sigma(buffer_t& buffer) NOEXCEPT;
 
-    /// Native.
+    /// [K]onstant vectorization (single and multiple blocks).
+    /// -----------------------------------------------------------------------
+
+    template <size_t Round>
+    INLINE static constexpr void konstant(auto& buffer) NOEXCEPT;
+
+    template<size_t Round, typename xWord>
+    INLINE static void vector_konstant(wbuffer_t<xWord>& wbuffer) NOEXCEPT;
+    INLINE static void vector_konstant(buffer_t& buffer) NOEXCEPT;
+
+    template <typename xWord>
+    static constexpr void konstant(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
+    static constexpr void konstant(buffer_t& buffer) NOEXCEPT;
+    static constexpr void konstant_(auto& buffer) NOEXCEPT;
+
+    /// Native SHA optimizations (single blocks).
     /// -----------------------------------------------------------------------
-    static constexpr auto native_lanes = capacity<xint128_t, word_t>;
-    static constexpr auto native_rounds = SHA::rounds / native_lanes;
-    using cbuffer_t = std_array<xint128_t, native_rounds>;
-    using cstate_t = std_array<xint128_t, two>;
 
     template<size_t Round>
-    INLINE static void prepare(cbuffer_t& buffer) NOEXCEPT;
-    INLINE static void add_k(cbuffer_t& buffer) NOEXCEPT;
-    static void schedule(cbuffer_t& buffer) NOEXCEPT;
+    INLINE static void prepare_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
+    static void schedule(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
 
     template <typename xWord>
     INLINE static void schedule_native(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
     INLINE static void schedule_native(buffer_t& buffer) NOEXCEPT;
 
+    template<size_t Round, size_t Lane>
+    INLINE static void round_native(wstate_t<xint128_t>& state,
+        const wbuffer_t<xint128_t>& wk) NOEXCEPT;
+
+    INLINE static void shuffle(wstate_t<xint128_t>& wstate) NOEXCEPT;
+    INLINE static void unshuffle(wstate_t<xint128_t>& wstate) NOEXCEPT;
+    INLINE static void summarize_native(wstate_t<xint128_t>& out,
+        const wstate_t<xint128_t>& in) NOEXCEPT;
+
+    template <size_t Lane>
+    INLINE static void compress_native(wstate_t<xint128_t>& state,
+        const wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
+
     template <typename xWord, size_t Lane>
     INLINE static void compress_native(xstate_t<xWord>& xstate,
         const xbuffer_t<xWord>& xbuffer) NOEXCEPT;
@@ -381,6 +415,7 @@ BC_PUSH_WARNING(NO_POINTER_ARITHMETIC)
 BC_PUSH_WARNING(NO_ARRAY_INDEXING)
 
 #include <bitcoin/system/impl/hash/sha/algorithm_compress.ipp>
+#include <bitcoin/system/impl/hash/sha/algorithm_konstant.ipp>
 #include <bitcoin/system/impl/hash/sha/algorithm_double.ipp>
 #include <bitcoin/system/impl/hash/sha/algorithm_functions.ipp>
 #include <bitcoin/system/impl/hash/sha/algorithm_iterate.ipp>

diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
@@ -173,7 +173,6 @@ template <size_t Lane>
 constexpr void CLASS::
 compress_(auto& state, const auto& buffer) NOEXCEPT
 {
-    // SHA-NI/256: 64/4 = 16 quad rounds, 8/4 = 2 state elements.
     // This is a copy (state type varies due to vectorization).
     const auto start = state;