Skip to content

Commit

Permalink
Merge pull request #1559 from evoskuil/master
Browse files Browse the repository at this point in the history
Replace buffered shani with rotating.
  • Loading branch information
evoskuil authored Dec 5, 2024
2 parents 2df85a0 + 99167b1 commit 5480d11
Show file tree
Hide file tree
Showing 14 changed files with 369 additions and 440 deletions.
4 changes: 2 additions & 2 deletions include/bitcoin/system/data/iterable.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ class iterable
return begin_;
}

template <size_t Elements>
template <size_t Elements = one>
inline iterable& advance() NOEXCEPT
{
// This is safe for overflow, will advance to end.
Expand All @@ -185,7 +185,7 @@ class iterable
return *this;
}

template <size_t Elements>
template <size_t Elements = one>
inline const std_array<value_t, Elements>& to_array() const NOEXCEPT
{
return unsafe_array_cast<value_t, Elements>(begin_);
Expand Down
55 changes: 20 additions & 35 deletions include/bitcoin/system/hash/sha/algorithm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,11 @@ class algorithm
INLINE static void iterate_vector(state_t& state,
iblocks_t& blocks) NOEXCEPT;

template <size_t Size>
INLINE static void iterate_native(state_t& state,
const ablocks_t<Size>& blocks) NOEXCEPT;
INLINE static void iterate_native(state_t& state, iblocks_t& blocks) NOEXCEPT;

template <size_t Size>
INLINE static constexpr void iterate_(state_t& state,
const ablocks_t<Size>& blocks) NOEXCEPT;
Expand Down Expand Up @@ -317,7 +322,8 @@ class algorithm
const xstate_t<xWord>& xstate) NOEXCEPT;

template <typename xWord, if_extended<xWord> = true>
INLINE static void merkle_hash_vector(idigests_t& digests, iblocks_t& blocks) NOEXCEPT;
INLINE static void merkle_hash_vector(idigests_t& digests,
iblocks_t& blocks) NOEXCEPT;
INLINE static void merkle_hash_vector(digests_t& digests) NOEXCEPT;
VCONSTEXPR static void merkle_hash_(digests_t& digests,
size_t offset=zero) NOEXCEPT;
Expand All @@ -330,10 +336,10 @@ class algorithm
auto x6, auto x7, auto x8) NOEXCEPT;

template<size_t Round, size_t Offset>
INLINE static void prepare1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT;
INLINE static void prepare_1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT;

template<size_t Round>
INLINE static void prepare8(buffer_t& buffer) NOEXCEPT;
INLINE static void prepare_8(buffer_t& buffer) NOEXCEPT;

template <typename xWord>
INLINE static void schedule_sigma(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
Expand All @@ -357,45 +363,24 @@ class algorithm
/// Native SHA optimizations (single blocks).
/// -----------------------------------------------------------------------

template<size_t Round>
INLINE static void prepare_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
static void schedule_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;

template <typename xWord>
INLINE static void schedule_native(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
INLINE static void schedule_native(buffer_t& buffer) NOEXCEPT;

template<size_t Round, size_t Lane>
INLINE static void round_native(wstate_t<xint128_t>& state,
const wbuffer_t<xint128_t>& wk) NOEXCEPT;

INLINE static void shuffle(wstate_t<xint128_t>& wstate) NOEXCEPT;
INLINE static void unshuffle(wstate_t<xint128_t>& wstate) NOEXCEPT;
INLINE static void summarize_native(wstate_t<xint128_t>& out,
const wstate_t<xint128_t>& in) NOEXCEPT;
INLINE static void shuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT;
INLINE static void unshuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT;
INLINE static void prepare(xint128_t& message0, xint128_t message1) NOEXCEPT;
INLINE static void prepare(xint128_t& message0, xint128_t message1,
xint128_t& message2) NOEXCEPT;

template <size_t Lane>
static void compress_native(wstate_t<xint128_t>& state,
const wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;

template <typename xWord, size_t Lane>
INLINE static void compress_native(xstate_t<xWord>& xstate,
const xbuffer_t<xWord>& xbuffer) NOEXCEPT;

template <typename xWord, size_t Lane>
INLINE static void compress_native(state_t& state,
const xbuffer_t<xWord>& xbuffer) NOEXCEPT;
template <size_t Round>
INLINE static void round_4(xint128_t& state0, xint128_t& state1,
xint128_t message) NOEXCEPT;

template <size_t Lane>
INLINE static void compress_native(state_t& state,
const buffer_t& buffer) NOEXCEPT;
static void native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT;

public:
/// Summary public values.
/// -----------------------------------------------------------------------
static constexpr auto caching = Cached;
static constexpr auto native = (use_shani || use_neon) &&
!is_same_size<word_t, uint64_t>;
static constexpr auto native = (use_shani || use_neon)
&& (SHA::strength == 256 || SHA::strength == 160);
static constexpr auto vector = (use_x128 || use_x256 || use_x512)
&& !(build_x32 && is_same_size<word_t, uint64_t>);
};
Expand Down
3 changes: 2 additions & 1 deletion include/bitcoin/system/have.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,13 @@
#define HAVE_XASSEMBLY
#endif

/// DISABLED
/// ARM Neon intrinsics.
#if defined(HAVE_ARM)
// -march=armv8-a+crc+crypto [all]
// -arch arm64 [apple] (also -isysroot to phone sdk)
#if defined(HAVE_GNUC) || defined(__ARM_NEON) || defined(HAVE_MSC)
#define HAVE_NEON
////#define HAVE_NEON
#endif
#endif

Expand Down
39 changes: 5 additions & 34 deletions include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -71,17 +71,6 @@ round(auto a, auto& b, auto c, auto d, auto& e, auto wk) NOEXCEPT

e = /*a =*/ f::add<s>(f::add<s>(f::add<s>(f::rol<5, s>(a), fn(b, c, d)), e), wk);
b = /*c =*/ f::rol<30, s>(b);

// SHA-NI
// Four rounds (total rounds 80/4).
// First round is add(e, w), then sha1nexte(e, w).
// fk is round-based enumeration implying f selection and k value.
// e1 = sha1nexte(e0, w);
// abcd = sha1rnds4(abcd, e0, fk);
// NEON
// f is implied by k in wk.
// e1 = vsha1h(vgetq_lane(abcd, 0);
// vsha1cq(abcd, e0, vaddq(w, k));
}

TEMPLATE
Expand All @@ -97,16 +86,6 @@ round(auto a, auto b, auto c, auto& d, auto e, auto f, auto g, auto& h,
const auto t = f::add<s>(f::add<s>(f::add<s>(Sigma1(e), choice(e, f, g)), h), wk);
d = /*e =*/ f::add<s>(d, t);
h = /*a =*/ f::add<s>(f::add<s>(Sigma0(a), majority(a, b, c)), t);

// Each call is 2 rounds, s, w and k are 128 (4 words each, s1/s2 is 8 word state).
// SHA-NI
// const auto value = add(w, k);
// abcd = sha256rnds2(abcd, efgh, value);
// efgh = sha256rnds2(efgh, abcd, shuffle(value));
// NEON
// const auto value = vaddq(w, k);
// abcd = vsha256hq(abcd, efgh, value);
// efgh = vsha256h2q(efgh, abcd, value);
}

TEMPLATE
Expand All @@ -125,10 +104,6 @@ round(auto& state, const auto& wk) NOEXCEPT
state[(SHA::rounds + 3 - Round) % SHA::state_words],
state[(SHA::rounds + 4 - Round) % SHA::state_words], // a->e
extract<word, Lane>(wk[Round]));

// SHA-NI/NEON
// State packs in 128 (one state variable), reduces above to 1 out[].
// Input value is 128 (w). Constants (k) statically initialized as 128.
}
else
{
Expand All @@ -142,10 +117,6 @@ round(auto& state, const auto& wk) NOEXCEPT
state[(SHA::rounds + 6 - Round) % SHA::state_words],
state[(SHA::rounds + 7 - Round) % SHA::state_words], // a->h
extract<word, Lane>(wk[Round]));

// SHA-NI/NEON
// Each element is 128 (vs. 32), reduces above to 2 out[] (s0/s1).
// Input value is 128 (w). Constants (k) statically initialized as 128.
}
}

Expand Down Expand Up @@ -276,11 +247,11 @@ compress(state_t& state, const buffer_t& buffer) NOEXCEPT
{
compress_<Lane>(state, buffer);
}
else if constexpr (native)
{
// Single block shani compression optimization.
compress_native<Lane>(state, buffer);
}
////else if constexpr (native)
////{
//// // Single block shani compression optimization.
//// compress_native<Lane>(state, buffer);
////}
////else if constexpr (vector)
////{
//// // Compression is not vectorized within a block, however this is
Expand Down
39 changes: 30 additions & 9 deletions include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ iterate_vector(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
{
if (blocks.size() >= min_lanes)
{
auto iblocks = iblocks_t{ array_cast<byte_t>(blocks) };
iblocks_t iblocks{ array_cast<byte_t>(blocks) };
iterate_vector(state, iblocks);
}
else
Expand All @@ -237,6 +237,31 @@ iterate_vector(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
}
}

// Native SHA
// ============================================================================
// www.intel.com/content/dam/develop/external/us/en/documents/
// intel-sha-extensions-white-paper-402097.pdf

// Native (SHA intrinsic) iteration over an iterable set of blocks.
// This overload is a thin forwarder: it passes the state and the block
// iterable unchanged to native_rounds(), which performs the actual work.
// state  : hash state, passed by mutable reference (presumably updated in
//          place by native_rounds - confirm against its definition).
// blocks : block iterable, passed by mutable reference (presumably consumed
//          or advanced by native_rounds - confirm against its definition).
TEMPLATE
INLINE void CLASS::
iterate_native(state_t& state, iblocks_t& blocks) NOEXCEPT
{
native_rounds(state, blocks);
}

// Native (SHA intrinsic) iteration over a fixed-size array of blocks.
// Adapts the array to the iterable form expected by native_rounds(): an
// iblocks_t is constructed from array_cast<byte_t>(blocks) and then passed,
// together with the state, to native_rounds().
// state  : hash state, passed by mutable reference (presumably updated in
//          place by native_rounds - confirm against its definition).
// blocks : array of Size blocks, taken by const reference.
// NOTE(review): array_cast/iblocks_t presumably form a non-owning view over
// the caller's array rather than a copy - confirm against their definitions.
TEMPLATE
template <size_t Size>
INLINE void CLASS::
iterate_native(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
{
// A named local is required because native_rounds takes iblocks_t by
// non-const reference.
iblocks_t iblocks{ array_cast<byte_t>(blocks) };
native_rounds(state, iblocks);
}

// Dispatch and normal forms.
// ============================================================================
// protected

TEMPLATE
template <size_t Size>
INLINE constexpr void CLASS::
Expand Down Expand Up @@ -273,11 +298,9 @@ iterate(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
{
iterate_(state, blocks);
}
else if constexpr (native)
else if constexpr (native && SHA::strength == 256)
{
// TODO: evaluate 4/8/16 lane message scheduling vs. shani scheduling.
// Multiple block shani message scheduling and compression optimization.
iterate_(state, blocks);
iterate_native(state, blocks);
}
else if constexpr (vector)
{
Expand All @@ -294,11 +317,9 @@ TEMPLATE
INLINE void CLASS::
iterate(state_t& state, iblocks_t& blocks) NOEXCEPT
{
if constexpr (native)
if constexpr (native && SHA::strength == 256)
{
// TODO: evaluate 4/8/16 lane message scheduling vs. shani scheduling.
// Multiple block shani message scheduling and compression optimization.
iterate_(state, blocks);
iterate_native(state, blocks);
}
else if constexpr (vector)
{
Expand Down
7 changes: 3 additions & 4 deletions include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -50,27 +50,26 @@ template<size_t Round, typename xWord>
INLINE void CLASS::
vector_konstant(wbuffer_t<xWord>& wbuffer) NOEXCEPT
{
constexpr auto s = SHA::word_bits;
constexpr auto lanes = capacity<xWord, word_t>;
constexpr auto r = Round * lanes;

if constexpr (lanes == 16)
{
wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
wbuffer[Round] = add<word_t>(wbuffer[Round], set<xWord>(
K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3],
K::get[r + 4], K::get[r + 5], K::get[r + 6], K::get[r + 7],
K::get[r + 8], K::get[r + 9], K::get[r + 10], K::get[r + 11],
K::get[r + 12], K::get[r + 13], K::get[r + 14], K::get[r + 15]));
}
else if constexpr (lanes == 8)
{
wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
wbuffer[Round] = add<word_t>(wbuffer[Round], set<xWord>(
K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3],
K::get[r + 4], K::get[r + 5], K::get[r + 6], K::get[r + 7]));
}
else if constexpr (lanes == 4)
{
wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
wbuffer[Round] = add<word_t>(wbuffer[Round], set<xWord>(
K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3]));
}
}
Expand Down
4 changes: 4 additions & 0 deletions include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,10 @@ merkle_hash_vector(digests_t& digests) NOEXCEPT
// ----------------------------------------------------------------------------
// public

// TODO: consider eliminating endianness conversions internal to the root
// computation, instead converting on way in and way out only, and using
// non-converting input/output (nop) functions.

TEMPLATE
VCONSTEXPR typename CLASS::digest_t CLASS::
merkle_root(digests_t&& digests) NOEXCEPT
Expand Down
Loading

0 comments on commit 5480d11

Please sign in to comment.