Merge pull request #1559 from evoskuil/master

Replace buffered shani with rotating.
libbitcoin · Dec 5, 2024 · 5480d11 · 5480d11
2 parents 2df85a0 + 99167b1
commit 5480d11
Show file tree

Hide file tree

Showing 14 changed files with 369 additions and 440 deletions.
diff --git a/include/bitcoin/system/data/iterable.hpp b/include/bitcoin/system/data/iterable.hpp
@@ -175,7 +175,7 @@ class iterable
         return begin_;
     }
 
-    template <size_t Elements>
+    template <size_t Elements = one>
     inline iterable& advance() NOEXCEPT
     {
         // This is safe for overflow, will advance to end.
@@ -185,7 +185,7 @@ class iterable
         return *this;
     }
 
-    template <size_t Elements>
+    template <size_t Elements = one>
     inline const std_array<value_t, Elements>& to_array() const NOEXCEPT
     {
         return unsafe_array_cast<value_t, Elements>(begin_);

diff --git a/include/bitcoin/system/hash/sha/algorithm.hpp b/include/bitcoin/system/hash/sha/algorithm.hpp
@@ -281,6 +281,11 @@ class algorithm
     INLINE static void iterate_vector(state_t& state,
         iblocks_t& blocks) NOEXCEPT;
 
+    template <size_t Size>
+    INLINE static void iterate_native(state_t& state,
+        const ablocks_t<Size>& blocks) NOEXCEPT;
+    INLINE static void iterate_native(state_t& state, iblocks_t& blocks) NOEXCEPT;
+
     template <size_t Size>
     INLINE static constexpr void iterate_(state_t& state,
         const ablocks_t<Size>& blocks) NOEXCEPT;
@@ -317,7 +322,8 @@ class algorithm
         const xstate_t<xWord>& xstate) NOEXCEPT;
 
     template <typename xWord, if_extended<xWord> = true>
-    INLINE static void merkle_hash_vector(idigests_t& digests, iblocks_t& blocks) NOEXCEPT;
+    INLINE static void merkle_hash_vector(idigests_t& digests,
+        iblocks_t& blocks) NOEXCEPT;
     INLINE static void merkle_hash_vector(digests_t& digests) NOEXCEPT;
     VCONSTEXPR static void merkle_hash_(digests_t& digests,
         size_t offset=zero) NOEXCEPT;
@@ -330,10 +336,10 @@ class algorithm
         auto x6, auto x7, auto x8) NOEXCEPT;
 
     template<size_t Round, size_t Offset>
-    INLINE static void prepare1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT;
+    INLINE static void prepare_1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT;
 
     template<size_t Round>
-    INLINE static void prepare8(buffer_t& buffer) NOEXCEPT;
+    INLINE static void prepare_8(buffer_t& buffer) NOEXCEPT;
 
     template <typename xWord>
     INLINE static void schedule_sigma(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
@@ -357,45 +363,24 @@ class algorithm
     /// Native SHA optimizations (single blocks).
     /// -----------------------------------------------------------------------
 
-    template<size_t Round>
-    INLINE static void prepare_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
-    static void schedule_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
-
-    template <typename xWord>
-    INLINE static void schedule_native(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
-    INLINE static void schedule_native(buffer_t& buffer) NOEXCEPT;
-
-    template<size_t Round, size_t Lane>
-    INLINE static void round_native(wstate_t<xint128_t>& state,
-        const wbuffer_t<xint128_t>& wk) NOEXCEPT;
-
-    INLINE static void shuffle(wstate_t<xint128_t>& wstate) NOEXCEPT;
-    INLINE static void unshuffle(wstate_t<xint128_t>& wstate) NOEXCEPT;
-    INLINE static void summarize_native(wstate_t<xint128_t>& out,
-        const wstate_t<xint128_t>& in) NOEXCEPT;
+    INLINE static void shuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT;
+    INLINE static void unshuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT;
+    INLINE static void prepare(xint128_t& message0, xint128_t message1) NOEXCEPT;
+    INLINE static void prepare(xint128_t& message0, xint128_t message1,
+        xint128_t& message2) NOEXCEPT;
 
-    template <size_t Lane>
-    static void compress_native(wstate_t<xint128_t>& state,
-        const wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
-
-    template <typename xWord, size_t Lane>
-    INLINE static void compress_native(xstate_t<xWord>& xstate,
-        const xbuffer_t<xWord>& xbuffer) NOEXCEPT;
-
-    template <typename xWord, size_t Lane>
-    INLINE static void compress_native(state_t& state,
-        const xbuffer_t<xWord>& xbuffer) NOEXCEPT;
+    template <size_t Round>
+    INLINE static void round_4(xint128_t& state0, xint128_t& state1,
+        xint128_t message) NOEXCEPT;
 
-    template <size_t Lane>
-    INLINE static void compress_native(state_t& state,
-        const buffer_t& buffer) NOEXCEPT;
+    static void native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT;
 
 public:
     /// Summary public values.
     /// -----------------------------------------------------------------------
     static constexpr auto caching = Cached;
-    static constexpr auto native = (use_shani || use_neon) &&
-        !is_same_size<word_t, uint64_t>;
+    static constexpr auto native = (use_shani || use_neon)
+        && (SHA::strength == 256 || SHA::strength == 160);
     static constexpr auto vector = (use_x128 || use_x256 || use_x512)
         && !(build_x32 && is_same_size<word_t, uint64_t>);
 };

diff --git a/include/bitcoin/system/have.hpp b/include/bitcoin/system/have.hpp
@@ -110,12 +110,13 @@
     #define HAVE_XASSEMBLY
 #endif
 
+/// DISABLED
 /// ARM Neon intrinsics.
 #if defined(HAVE_ARM)
     // -march=armv8-a+crc+crypto [all]
     // -arch arm64 [apple] (also -isysroot to phone sdk)
     #if defined(HAVE_GNUC) || defined(__ARM_NEON) || defined(HAVE_MSC)
-        #define HAVE_NEON
+        ////#define HAVE_NEON
     #endif
 #endif
 

diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
@@ -71,17 +71,6 @@ round(auto a, auto& b, auto c, auto d, auto& e, auto wk) NOEXCEPT
 
     e = /*a =*/ f::add<s>(f::add<s>(f::add<s>(f::rol<5, s>(a), fn(b, c, d)), e), wk);
     b = /*c =*/ f::rol<30, s>(b);
-
-    // SHA-NI
-    // Four rounds (total rounds 80/4).
-    // First round is add(e, w), then sha1nexte(e, w).
-    // fk is round-based enumeration implying f selection and k value.
-    //     e1 = sha1nexte(e0, w);
-    //     abcd = sha1rnds4(abcd, e0, fk);
-    // NEON
-    // f is implied by k in wk.
-    //     e1 = vsha1h(vgetq_lane(abcd, 0);
-    //     vsha1cq(abcd, e0, vaddq(w, k));
 }
 
 TEMPLATE
@@ -97,16 +86,6 @@ round(auto a, auto b, auto c, auto& d, auto e, auto f, auto g, auto& h,
     const auto t = f::add<s>(f::add<s>(f::add<s>(Sigma1(e), choice(e, f, g)), h), wk);
     d = /*e =*/    f::add<s>(d, t);
     h = /*a =*/    f::add<s>(f::add<s>(Sigma0(a), majority(a, b, c)), t);
-
-    // Each call is 2 rounds, s, w and k are 128 (4 words each, s1/s2 is 8 word state).
-    // SHA-NI
-    //     const auto value = add(w, k);
-    //     abcd = sha256rnds2(abcd, efgh, value);
-    //     efgh = sha256rnds2(efgh, abcd, shuffle(value));
-    // NEON
-    //     const auto value = vaddq(w, k);
-    //     abcd = vsha256hq(abcd, efgh, value);
-    //     efgh = vsha256h2q(efgh, abcd, value);
 }
 
 TEMPLATE
@@ -125,10 +104,6 @@ round(auto& state, const auto& wk) NOEXCEPT
             state[(SHA::rounds + 3 - Round) % SHA::state_words],
             state[(SHA::rounds + 4 - Round) % SHA::state_words], // a->e
             extract<word, Lane>(wk[Round]));
-
-        // SHA-NI/NEON
-        // State packs in 128 (one state variable), reduces above to 1 out[].
-        // Input value is 128 (w). Constants (k) statically initialized as 128.
     }
     else
     {
@@ -142,10 +117,6 @@ round(auto& state, const auto& wk) NOEXCEPT
             state[(SHA::rounds + 6 - Round) % SHA::state_words],
             state[(SHA::rounds + 7 - Round) % SHA::state_words], // a->h
             extract<word, Lane>(wk[Round]));
-
-        // SHA-NI/NEON
-        // Each element is 128 (vs. 32), reduces above to 2 out[] (s0/s1).
-        // Input value is 128 (w). Constants (k) statically initialized as 128.
     }
 }
 
@@ -276,11 +247,11 @@ compress(state_t& state, const buffer_t& buffer) NOEXCEPT
     {
         compress_<Lane>(state, buffer);
     }
-    else if constexpr (native)
-    {
-        // Single block shani compression optimization.
-        compress_native<Lane>(state, buffer);
-    }
+    ////else if constexpr (native)
+    ////{
+    ////    // Single block shani compression optimization.
+    ////    compress_native<Lane>(state, buffer);
+    ////}
     ////else if constexpr (vector)
     ////{
     ////    // Compression is not vectorized within a block, however this is

diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp
@@ -228,7 +228,7 @@ iterate_vector(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
 {
     if (blocks.size() >= min_lanes)
     {
-        auto iblocks = iblocks_t{ array_cast<byte_t>(blocks) };
+        iblocks_t iblocks{ array_cast<byte_t>(blocks) };
         iterate_vector(state, iblocks);
     }
     else
@@ -237,6 +237,31 @@ iterate_vector(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
     }
 }
 
+// Native SHA
+// ============================================================================
+// www.intel.com/content/dam/develop/external/us/en/documents/
+// intel-sha-extensions-white-paper-402097.pdf
+
+TEMPLATE
+INLINE void CLASS::
+iterate_native(state_t& state, iblocks_t& blocks) NOEXCEPT
+{
+    native_rounds(state, blocks);
+}
+
+TEMPLATE
+template <size_t Size>
+INLINE void CLASS::
+iterate_native(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
+{
+    iblocks_t iblocks{ array_cast<byte_t>(blocks) };
+    native_rounds(state, iblocks);
+}
+
+// Dispatch and normal forms.
+// ============================================================================
+// protected
+
 TEMPLATE
 template <size_t Size>
 INLINE constexpr void CLASS::
@@ -273,11 +298,9 @@ iterate(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
     {
         iterate_(state, blocks);
     }
-    else if constexpr (native)
+    else if constexpr (native && SHA::strength == 256)
     {
-        // TODO: evaluate 4/8/16 lane message scheduling vs. shani scheduling.
-        // Multiple block shani message schduling and compression optimization.
-        iterate_(state, blocks);
+        iterate_native(state, blocks);
     }
     else if constexpr (vector)
     {
@@ -294,11 +317,9 @@ TEMPLATE
 INLINE void CLASS::
 iterate(state_t& state, iblocks_t& blocks) NOEXCEPT
 {
-    if constexpr (native)
+    if constexpr (native && SHA::strength == 256)
     {
-        // TODO: evaluate 4/8/16 lane message scheduling vs. shani scheduling.
-        // Multiple block shani message schduling and compression optimization.
-        iterate_(state, blocks);
+        iterate_native(state, blocks);
     }
     else if constexpr (vector)
     {

diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp
@@ -50,27 +50,26 @@ template<size_t Round, typename xWord>
 INLINE void CLASS::
 vector_konstant(wbuffer_t<xWord>& wbuffer) NOEXCEPT
 {
-    constexpr auto s = SHA::word_bits;
     constexpr auto lanes = capacity<xWord, word_t>;
     constexpr auto r = Round * lanes;
 
     if constexpr (lanes == 16)
     {
-        wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
+        wbuffer[Round] = add<word_t>(wbuffer[Round], set<xWord>(
             K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3],
             K::get[r + 4], K::get[r + 5], K::get[r + 6], K::get[r + 7],
             K::get[r + 8], K::get[r + 9], K::get[r + 10], K::get[r + 11],
             K::get[r + 12], K::get[r + 13], K::get[r + 14], K::get[r + 15]));
     }
     else if constexpr (lanes == 8)
     {
-        wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
+        wbuffer[Round] = add<word_t>(wbuffer[Round], set<xWord>(
             K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3],
             K::get[r + 4], K::get[r + 5], K::get[r + 6], K::get[r + 7]));
     }
     else if constexpr (lanes == 4)
     {
-        wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
+        wbuffer[Round] = add<word_t>(wbuffer[Round], set<xWord>(
             K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3]));
     }
 }

diff --git a/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp b/include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp
@@ -421,6 +421,10 @@ merkle_hash_vector(digests_t& digests) NOEXCEPT
 // ----------------------------------------------------------------------------
 // public
 
+// TODO: consider eliminating endianness conversions internal to the root
+// computation, instead converting on way in and way out only, and using
+// non-converting input/output (nop) functions.
+
 TEMPLATE
 VCONSTEXPR typename CLASS::digest_t CLASS::
 merkle_root(digests_t&& digests) NOEXCEPT