Merge pull request #1554 from evoskuil/master

Initial implementation of shani message scheduling.
libbitcoin · Nov 25, 2024 · 0eac043 · 0eac043
2 parents 4d4d3e2 + 1e79de7
commit 0eac043
Show file tree

Hide file tree

Showing 8 changed files with 172 additions and 54 deletions.
diff --git a/include/bitcoin/system/hash/rmd/algorithm.hpp b/include/bitcoin/system/hash/rmd/algorithm.hpp
@@ -25,7 +25,7 @@
 #include <bitcoin/system/hash/algorithm.hpp>
 #include <bitcoin/system/math/math.hpp>
 
- // algorithm.hpp file is the common include for rmd.
+// algorithm.hpp file is the common include for rmd.
 #include <bitcoin/system/hash/rmd/rmd.hpp>
 #include <bitcoin/system/hash/rmd/rmd128.hpp>
 #include <bitcoin/system/hash/rmd/rmd160.hpp>

diff --git a/include/bitcoin/system/hash/sha/algorithm.hpp b/include/bitcoin/system/hash/sha/algorithm.hpp
@@ -26,7 +26,7 @@
 #include <bitcoin/system/intrinsics/intrinsics.hpp>
 #include <bitcoin/system/math/math.hpp>
 
- // algorithm.hpp file is the common include for sha.
+// algorithm.hpp file is the common include for sha.
 #include <bitcoin/system/hash/sha/sha.hpp>
 #include <bitcoin/system/hash/sha/sha160.hpp>
 #include <bitcoin/system/hash/sha/sha256.hpp>
@@ -330,11 +330,15 @@ class algorithm
 
     /// Native.
     /// -----------------------------------------------------------------------
-    ////using cword_t = xint128_t;
-    ////static constexpr auto cratio = sizeof(cword_t) / SHA::word_bytes;
-    ////static constexpr auto crounds = SHA::rounds / cratio;
-    ////using cbuffer_t = std_array<cword_t, crounds>;
-    ////using cstate_t = std_array<xint128_t, two>;
+    static constexpr auto native_lanes = capacity<xint128_t, word_t>;
+    static constexpr auto native_rounds = SHA::rounds / native_lanes;
+    using cbuffer_t = std_array<xint128_t, native_rounds>;
+    using cstate_t = std_array<xint128_t, two>;
+
+    template<size_t Round>
+    INLINE static void prepare(cbuffer_t& buffer) NOEXCEPT;
+    INLINE static void add_k(cbuffer_t& buffer) NOEXCEPT;
+    static void schedule(cbuffer_t& buffer) NOEXCEPT;
 
     template <typename xWord>
     INLINE static void schedule_native(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
@@ -356,7 +360,8 @@ class algorithm
     /// Summary public values.
     /// -----------------------------------------------------------------------
     static constexpr auto caching = Cached;
-    static constexpr auto native = use_shani || use_neon;
+    static constexpr auto native = (use_shani || use_neon) &&
+        !is_same_size<word_t, uint64_t>;
     static constexpr auto vector = (use_x128 || use_x256 || use_x512)
         && !(build_x32 && is_same_size<word_t, uint64_t>);
 };

diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_functions.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_functions.ipp
@@ -37,6 +37,7 @@ TEMPLATE
 INLINE constexpr auto CLASS::
 parity(auto x, auto y, auto z) NOEXCEPT
 {
+    // Normal form, unmodified.
     return f::xor_(f::xor_(x, y), z);
 }
 
@@ -63,6 +64,7 @@ template <unsigned int A, unsigned int B, unsigned int C>
 INLINE constexpr auto CLASS::
 sigma(auto x) NOEXCEPT
 {
+    // Normal form, unmodified.
     constexpr auto s = SHA::word_bits;
     return f::xor_(f::xor_(f::ror<A, s>(x), f::ror<B, s>(x)), f::shr<C, s>(x));
 }

diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
@@ -27,30 +27,171 @@
 // two state variables. This applies to sha160 and sha256, but sha512 native
 // is not supported.
 
+// The base buffer is already populated with proper endianness.
+// Input could be optimized using intrinsics (see comments in parse).
+// The unextended state vector is already output with proper endianness.
+// Output could also be optimized using intrinsics (see comments in parse).
+
 namespace libbitcoin {
 namespace system {
 namespace sha {
 
-// protected
-// ----------------------------------------------------------------------------
+TEMPLATE
+template<size_t Round>
+INLINE void CLASS::
+prepare(cbuffer_t& buffer) NOEXCEPT
+{
+    // Sets buffer[Round]; K-adding lags 16 words (buffer[Round - 4]), add_k() adds the last 16.
+
+    if constexpr (SHA::strength == 160)
+    {
+        ////static_assert(false, "sha160 not implemented"); // branch is currently a no-op
+    }
+    else if constexpr (use_neon)
+    {
+        ////static_assert(false, "neon not implemented"); // branch is currently a no-op
+    }
+    else
+    {
+        static_assert(SHA::strength == 256);
+
+        constexpr auto r1 = Round - 1; // r1..r4: the four vectors preceding Round
+        constexpr auto r2 = sub1(r1);
+        constexpr auto r3 = sub1(r2);
+        constexpr auto r4 = sub1(r3);
+        constexpr auto k0 = Round * 4 - 16; // == 4 * r4: first K index for buffer[r4]
+        constexpr auto k1 = add1(k0);
+        constexpr auto k2 = add1(k1);
+        constexpr auto k3 = add1(k2);
+
+        buffer[Round] = mm_sha256msg2_epu32
+            (
+                mm_add_epi32
+                (
+                    mm_alignr_epi8
+                    (
+                        buffer[r1], buffer[r2], SHA::word_bytes
+                    ),
+                    mm_sha256msg1_epu32
+                    (
+                        buffer[r4], buffer[r3]
+                    )
+                ),
+                buffer[r1]
+            );
+
+        buffer[r4] = mm_add_epi32 // buffer[r4] is not read by later prepare<> calls
+            (
+                buffer[r4],
+                mm_set_epi32(K::get[k3], K::get[k2], K::get[k1], K::get[k0])
+            );
+    }
+}
 
 TEMPLATE
-template <typename xWord>
 INLINE void CLASS::
-schedule_native(xbuffer_t<xWord>& xbuffer) NOEXCEPT
+add_k(cbuffer_t& buffer) NOEXCEPT
 {
-    // Merkle extended buffer is not native dispatched.
-    schedule_(xbuffer);
+    // Add K to the last 16 words (4 vectors), which prepare<Round>() never reaches.
+    // TODO: Consolidated K-adding can be performed in 4/8/16 lanes.
+    constexpr auto k = SHA::rounds - SHA::block_words; // first K (word) index to add
+    constexpr auto r = k / native_lanes; // index of the vector holding word k
+
+    buffer[r + 0] = mm_add_epi32
+        (
+            buffer[r + 0],
+            mm_set_epi32(
+                K::get[k + 3], K::get[k + 2],
+                K::get[k + 1], K::get[k + 0])
+        );
+
+    buffer[r + 1] = mm_add_epi32
+        (
+            buffer[r + 1],
+            mm_set_epi32(
+                K::get[k + 7], K::get[k + 6],
+                K::get[k + 5], K::get[k + 4])
+        );
+
+    buffer[r + 2] = mm_add_epi32
+        (
+            buffer[r + 2],
+            mm_set_epi32(
+                K::get[k + 11], K::get[k + 10],
+                K::get[k + 9], K::get[k + 8])
+        );
+
+    buffer[r + 3] = mm_add_epi32
+        (
+            buffer[r + 3],
+            mm_set_epi32(
+                K::get[k + 15], K::get[k + 14],
+                K::get[k + 13], K::get[k + 12])
+        );
+}
+
+TEMPLATE
+INLINE void CLASS::
+schedule(cbuffer_t& buffer) NOEXCEPT
+{
+    // Extends the first four vectors of buffer into the full native schedule.
+    // buffer is already a cbuffer_t (xint128_t elements), so it is passed
+    // directly; no array_cast or second alias of the same object is needed.
+    // Each prepare<R> sets buffer[R] and adds K to buffer[R - 4].
+    prepare<4>(buffer);
+    prepare<5>(buffer);
+    prepare<6>(buffer);
+    prepare<7>(buffer);
+    prepare<8>(buffer);
+    prepare<9>(buffer);
+    prepare<10>(buffer);
+    prepare<11>(buffer);
+    prepare<12>(buffer);
+    prepare<13>(buffer);
+    prepare<14>(buffer);
+    prepare<15>(buffer);
+
+    ////if constexpr (SHA::rounds == 80)
+    ////{
+    ////    prepare<16>(buffer);
+    ////    prepare<17>(buffer);
+    ////    prepare<18>(buffer);
+    ////    prepare<19>(buffer);
+    ////}
+
+    // K for the final four vectors is not added by any prepare<R> above.
+    add_k(buffer);
 }
 
+// schedule
+// ----------------------------------------------------------------------------
+// protected
+
 TEMPLATE
 INLINE void CLASS::
 schedule_native(buffer_t& buffer) NOEXCEPT
 {
-    // TODO: single block compression.
-    schedule_(buffer);
+    // neon/sha160 are not yet implemented and sha512 is not native: use schedule_().
+    if constexpr (SHA::strength == 160 || SHA::strength == 512 || use_neon)
+    {
+        schedule_(buffer);
+    }
+    else
+    {
+        schedule(array_cast<xint128_t>(buffer)); // native schedule(cbuffer_t&) overload
+    }
 }
 
+TEMPLATE
+template <typename xWord>
+INLINE void CLASS::
+schedule_native(xbuffer_t<xWord>& xbuffer) NOEXCEPT
+{
+    // Merkle extended (xWord) buffers are never native dispatched; always schedule_().
+    schedule_(xbuffer);
+}
+
+// compression
+// ----------------------------------------------------------------------------
+// protected
+
 TEMPLATE
 template <typename xWord, size_t Lane>
 INLINE void CLASS::
@@ -75,7 +216,7 @@ template <size_t Lane>
 INLINE void CLASS::
 compress_native(state_t& state, const buffer_t& buffer) NOEXCEPT
 {
-    // TODO: single block compression.
+    // TODO: Single block compression.
     compress_<Lane>(state, buffer);
 }
 

diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp
@@ -36,9 +36,7 @@ template<size_t Round>
 INLINE constexpr void CLASS::
 prepare(auto& buffer) NOEXCEPT
 {
-    // K is added to schedule words because schedule is vectorizable.
-    // This allows 3/4 of the cost of the K addtion to be vectorized.
-    // K-adding is shifted -16, with last 16 added after scheduling.
+    // K-adding is shifted 16 words, with last 16 added after scheduling.
     constexpr auto s = SHA::word_bits;
 
     if constexpr (SHA::strength == 160)
@@ -53,24 +51,6 @@ prepare(auto& buffer) NOEXCEPT
             f::xor_(buffer[r08], buffer[r03])));
 
         buffer[r16] = f::addc<K::get[r16], s>(buffer[r16]);
-
-        // SHA-NI
-        //
-        //     buffer[Round] = sha1msg2 // xor and rotl1
-        //     (
-        //         xor                // not using sha1msg1
-        //         (
-        //             sha1msg1       // xor (specialized)
-        //             (
-        //                 buffer[Round - 16],
-        //                 buffer[Round - 14]
-        //             ),
-        //             buffer[Round -  8]
-        //          ),
-        //          buffer[Round -  3]
-        //     );
-        // NEON
-        //     vsha1su1q/vsha1su0q
     }
     else
     {
@@ -84,18 +64,6 @@ prepare(auto& buffer) NOEXCEPT
             f::add<s>(buffer[r07], sigma1(buffer[r02])));
 
         buffer[r16] = f::addc<K::get[r16], s>(buffer[r16]);
-
-        // Each word is 128, buffer goes from 64 to 16 words.
-        // SHA-NI
-        // buffer[Round] =
-        //     sha256msg1(buffer[Round - 16], buffer[Round - 15]) +
-        //     sha256msg2(buffer[Round -  7], buffer[Round -  2]);
-        // NEON
-        // Not sure about these indexes.
-        // mijailovic.net/2018/06/06/sha256-armv8
-        // buffer[Round] =
-        //     vsha256su0q(buffer[Round - 13], buffer[Round - 9]) +
-        //     vsha256su1q(buffer[Round - 13], buffer[Round - 5], buffer[Round - 1]);
     }
 }
 
@@ -104,8 +72,9 @@ INLINE constexpr void CLASS::
 add_k(auto& buffer) NOEXCEPT
 {
     // Add K to last 16 words.
+    // TODO: Consolidated K-adding can be performed in 4/8/16 lanes.
     constexpr auto s = SHA::word_bits;
-    constexpr auto r = SHA::rounds - array_count<words_t>;
+    constexpr auto r = SHA::rounds - SHA::block_words;
     buffer[r + 0] = f::addc<K::get[r + 0], s>(buffer[r + 0]);
     buffer[r + 1] = f::addc<K::get[r + 1], s>(buffer[r + 1]);
     buffer[r + 2] = f::addc<K::get[r + 2], s>(buffer[r + 2]);
@@ -128,7 +97,6 @@ TEMPLATE
 constexpr void CLASS::
 schedule_(auto& buffer) NOEXCEPT
 {
-
     prepare<16>(buffer);
     prepare<17>(buffer);
     prepare<18>(buffer);

diff --git a/include/bitcoin/system/intrinsics/xcpu/defines.hpp b/include/bitcoin/system/intrinsics/xcpu/defines.hpp
@@ -112,6 +112,7 @@ BC_POP_WARNING()
 #endif
 
 #if !defined(HAVE_SSE41)
+    #define mm_alignr_epi8(a, b, c) {}
     #define mm_and_si128(a, b)  (a)
     #define mm_or_si128(a, b)   (a)
     #define mm_xor_si128(a, b)  (a)
@@ -145,6 +146,7 @@ BC_POP_WARNING()
     #define mm_set_epi16(x08, x07, x06, x05, x04, x03, x02, x01)
     #define mm_set_epi8(x16, x15, x14, x13, x12, x11, x10, x09, x08, x07, x06, x05, x04, x03, x02, x01)
 #else
+    #define mm_alignr_epi8(a, b, c)     _mm_alignr_epi8(a, b, c) // for native sha (128 only)
     #define mm_and_si128(a, b)          _mm_and_si128(a, b)
     #define mm_or_si128(a, b)           _mm_or_si128(a, b)
     #define mm_xor_si128(a, b)          _mm_xor_si128(a, b)

diff --git a/test/hash/performance/performance.hpp b/test/hash/performance/performance.hpp
@@ -212,7 +212,7 @@ using hash_selector = iif<Ripemd, rmd_algorithm<Strength>,
 
 // Parenthesized: == binds tighter than ||, so the unparenthesized form is vacuous when with_neon.
 static_assert(hash_selector< 160, true,  true, true, false>::native == (with_shani || with_neon));
 static_assert(hash_selector< 256, true,  true, true, false>::native == (with_shani || with_neon));
-static_assert(hash_selector< 512, true,  true, true, false>::native == with_shani || with_neon);
+static_assert(hash_selector< 512, true,  true, true, false>::native == /*with_shani || with_neon*/ false);
 static_assert(!hash_selector<160, false, true, true, false>::native);
 static_assert(!hash_selector<256, false, true, true, false>::native);
 static_assert(!hash_selector<512, false, true, true, false>::native);

diff --git a/test/hash/sha/sha512.cpp b/test/hash/sha/sha512.cpp
@@ -22,7 +22,7 @@
 BOOST_AUTO_TEST_SUITE(sha512_tests_)
 
 constexpr auto vector = (with_sse41 || with_avx2 || with_avx512) && !build_x32;
-constexpr auto native = with_shani || with_neon;
+constexpr auto native = /*(with_shani || with_neon)*/ false;
 
 BOOST_AUTO_TEST_CASE(sha512__hash__null_hash__expected)
 {