Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace buffered shani with rotating. #1559

Merged
merged 9 commits into from
Dec 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions include/bitcoin/system/data/iterable.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ class iterable
return begin_;
}

template <size_t Elements>
template <size_t Elements = one>
inline iterable& advance() NOEXCEPT
{
// This is safe for overflow, will advance to end.
Expand All @@ -185,7 +185,7 @@ class iterable
return *this;
}

template <size_t Elements>
template <size_t Elements = one>
inline const std_array<value_t, Elements>& to_array() const NOEXCEPT
{
return unsafe_array_cast<value_t, Elements>(begin_);
Expand Down
55 changes: 20 additions & 35 deletions include/bitcoin/system/hash/sha/algorithm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -281,6 +281,11 @@ class algorithm
INLINE static void iterate_vector(state_t& state,
iblocks_t& blocks) NOEXCEPT;

template <size_t Size>
INLINE static void iterate_native(state_t& state,
const ablocks_t<Size>& blocks) NOEXCEPT;
INLINE static void iterate_native(state_t& state, iblocks_t& blocks) NOEXCEPT;

template <size_t Size>
INLINE static constexpr void iterate_(state_t& state,
const ablocks_t<Size>& blocks) NOEXCEPT;
Expand Down Expand Up @@ -317,7 +322,8 @@ class algorithm
const xstate_t<xWord>& xstate) NOEXCEPT;

template <typename xWord, if_extended<xWord> = true>
INLINE static void merkle_hash_vector(idigests_t& digests, iblocks_t& blocks) NOEXCEPT;
INLINE static void merkle_hash_vector(idigests_t& digests,
iblocks_t& blocks) NOEXCEPT;
INLINE static void merkle_hash_vector(digests_t& digests) NOEXCEPT;
VCONSTEXPR static void merkle_hash_(digests_t& digests,
size_t offset=zero) NOEXCEPT;
Expand All @@ -330,10 +336,10 @@ class algorithm
auto x6, auto x7, auto x8) NOEXCEPT;

template<size_t Round, size_t Offset>
INLINE static void prepare1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT;
INLINE static void prepare_1(buffer_t& buffer, const auto& xsigma0) NOEXCEPT;

template<size_t Round>
INLINE static void prepare8(buffer_t& buffer) NOEXCEPT;
INLINE static void prepare_8(buffer_t& buffer) NOEXCEPT;

template <typename xWord>
INLINE static void schedule_sigma(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
Expand All @@ -357,45 +363,24 @@ class algorithm
/// Native SHA optimizations (single blocks).
/// -----------------------------------------------------------------------

template<size_t Round>
INLINE static void prepare_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
static void schedule_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;

template <typename xWord>
INLINE static void schedule_native(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
INLINE static void schedule_native(buffer_t& buffer) NOEXCEPT;

template<size_t Round, size_t Lane>
INLINE static void round_native(wstate_t<xint128_t>& state,
const wbuffer_t<xint128_t>& wk) NOEXCEPT;

INLINE static void shuffle(wstate_t<xint128_t>& wstate) NOEXCEPT;
INLINE static void unshuffle(wstate_t<xint128_t>& wstate) NOEXCEPT;
INLINE static void summarize_native(wstate_t<xint128_t>& out,
const wstate_t<xint128_t>& in) NOEXCEPT;
INLINE static void shuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT;
INLINE static void unshuffle(xint128_t& state0, xint128_t& state1) NOEXCEPT;
INLINE static void prepare(xint128_t& message0, xint128_t message1) NOEXCEPT;
INLINE static void prepare(xint128_t& message0, xint128_t message1,
xint128_t& message2) NOEXCEPT;

template <size_t Lane>
static void compress_native(wstate_t<xint128_t>& state,
const wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;

template <typename xWord, size_t Lane>
INLINE static void compress_native(xstate_t<xWord>& xstate,
const xbuffer_t<xWord>& xbuffer) NOEXCEPT;

template <typename xWord, size_t Lane>
INLINE static void compress_native(state_t& state,
const xbuffer_t<xWord>& xbuffer) NOEXCEPT;
template <size_t Round>
INLINE static void round_4(xint128_t& state0, xint128_t& state1,
xint128_t message) NOEXCEPT;

template <size_t Lane>
INLINE static void compress_native(state_t& state,
const buffer_t& buffer) NOEXCEPT;
static void native_rounds(state_t& state, iblocks_t& blocks) NOEXCEPT;

public:
/// Summary public values.
/// -----------------------------------------------------------------------
static constexpr auto caching = Cached;
static constexpr auto native = (use_shani || use_neon) &&
!is_same_size<word_t, uint64_t>;
static constexpr auto native = (use_shani || use_neon)
&& (SHA::strength == 256 || SHA::strength == 160);
static constexpr auto vector = (use_x128 || use_x256 || use_x512)
&& !(build_x32 && is_same_size<word_t, uint64_t>);
};
Expand Down
3 changes: 2 additions & 1 deletion include/bitcoin/system/have.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,12 +110,13 @@
#define HAVE_XASSEMBLY
#endif

/// DISABLED
/// ARM Neon intrinsics.
#if defined(HAVE_ARM)
// -march=armv8-a+crc+crypto [all]
// -arch arm64 [apple] (also -isysroot to phone sdk)
#if defined(HAVE_GNUC) || defined(__ARM_NEON) || defined(HAVE_MSC)
#define HAVE_NEON
////#define HAVE_NEON
#endif
#endif

Expand Down
39 changes: 5 additions & 34 deletions include/bitcoin/system/impl/hash/sha/algorithm_compress.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -71,17 +71,6 @@ round(auto a, auto& b, auto c, auto d, auto& e, auto wk) NOEXCEPT

e = /*a =*/ f::add<s>(f::add<s>(f::add<s>(f::rol<5, s>(a), fn(b, c, d)), e), wk);
b = /*c =*/ f::rol<30, s>(b);

// SHA-NI
// Four rounds (total rounds 80/4).
// First round is add(e, w), then sha1nexte(e, w).
// fk is round-based enumeration implying f selection and k value.
// e1 = sha1nexte(e0, w);
// abcd = sha1rnds4(abcd, e0, fk);
// NEON
// f is implied by k in wk.
// e1 = vsha1h(vgetq_lane(abcd, 0);
// vsha1cq(abcd, e0, vaddq(w, k));
}

TEMPLATE
Expand All @@ -97,16 +86,6 @@ round(auto a, auto b, auto c, auto& d, auto e, auto f, auto g, auto& h,
const auto t = f::add<s>(f::add<s>(f::add<s>(Sigma1(e), choice(e, f, g)), h), wk);
d = /*e =*/ f::add<s>(d, t);
h = /*a =*/ f::add<s>(f::add<s>(Sigma0(a), majority(a, b, c)), t);

// Each call is 2 rounds, s, w and k are 128 (4 words each, s1/s2 is 8 word state).
// SHA-NI
// const auto value = add(w, k);
// abcd = sha256rnds2(abcd, efgh, value);
// efgh = sha256rnds2(efgh, abcd, shuffle(value));
// NEON
// const auto value = vaddq(w, k);
// abcd = vsha256hq(abcd, efgh, value);
// efgh = vsha256h2q(efgh, abcd, value);
}

TEMPLATE
Expand All @@ -125,10 +104,6 @@ round(auto& state, const auto& wk) NOEXCEPT
state[(SHA::rounds + 3 - Round) % SHA::state_words],
state[(SHA::rounds + 4 - Round) % SHA::state_words], // a->e
extract<word, Lane>(wk[Round]));

// SHA-NI/NEON
// State packs in 128 (one state variable), reduces above to 1 out[].
// Input value is 128 (w). Constants (k) statically initialized as 128.
}
else
{
Expand All @@ -142,10 +117,6 @@ round(auto& state, const auto& wk) NOEXCEPT
state[(SHA::rounds + 6 - Round) % SHA::state_words],
state[(SHA::rounds + 7 - Round) % SHA::state_words], // a->h
extract<word, Lane>(wk[Round]));

// SHA-NI/NEON
// Each element is 128 (vs. 32), reduces above to 2 out[] (s0/s1).
// Input value is 128 (w). Constants (k) statically initialized as 128.
}
}

Expand Down Expand Up @@ -276,11 +247,11 @@ compress(state_t& state, const buffer_t& buffer) NOEXCEPT
{
compress_<Lane>(state, buffer);
}
else if constexpr (native)
{
// Single block shani compression optimization.
compress_native<Lane>(state, buffer);
}
////else if constexpr (native)
////{
//// // Single block shani compression optimization.
//// compress_native<Lane>(state, buffer);
////}
////else if constexpr (vector)
////{
//// // Compression is not vectorized within a block, however this is
Expand Down
39 changes: 30 additions & 9 deletions include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ iterate_vector(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
{
if (blocks.size() >= min_lanes)
{
auto iblocks = iblocks_t{ array_cast<byte_t>(blocks) };
iblocks_t iblocks{ array_cast<byte_t>(blocks) };
iterate_vector(state, iblocks);
}
else
Expand All @@ -237,6 +237,31 @@ iterate_vector(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
}
}

// Native SHA
// ============================================================================
// www.intel.com/content/dam/develop/external/us/en/documents/
// intel-sha-extensions-white-paper-402097.pdf

// Native iteration over an iblocks_t: a thin forwarder that passes the state
// and the blocks, unchanged, to native_rounds().
// In the visible dispatch (iterate), this path is selected only when
// `native && SHA::strength == 256`.
TEMPLATE
INLINE void CLASS::
iterate_native(state_t& state, iblocks_t& blocks) NOEXCEPT
{
native_rounds(state, blocks);
}

// Native iteration over a fixed-size array of blocks: builds an iblocks_t
// from the byte-array cast of `blocks` and passes it to native_rounds().
TEMPLATE
template <size_t Size>
INLINE void CLASS::
iterate_native(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
{
// native_rounds() is declared taking a non-const iblocks_t&, so a named
// local (not a temporary) is required here.
iblocks_t iblocks{ array_cast<byte_t>(blocks) };
native_rounds(state, iblocks);
}

// Dispatch and normal forms.
// ============================================================================
// protected

TEMPLATE
template <size_t Size>
INLINE constexpr void CLASS::
Expand Down Expand Up @@ -273,11 +298,9 @@ iterate(state_t& state, const ablocks_t<Size>& blocks) NOEXCEPT
{
iterate_(state, blocks);
}
else if constexpr (native)
else if constexpr (native && SHA::strength == 256)
{
// TODO: evaluate 4/8/16 lane message scheduling vs. shani scheduling.
// Multiple block shani message schduling and compression optimization.
iterate_(state, blocks);
iterate_native(state, blocks);
}
else if constexpr (vector)
{
Expand All @@ -294,11 +317,9 @@ TEMPLATE
INLINE void CLASS::
iterate(state_t& state, iblocks_t& blocks) NOEXCEPT
{
if constexpr (native)
if constexpr (native && SHA::strength == 256)
{
// TODO: evaluate 4/8/16 lane message scheduling vs. shani scheduling.
// Multiple block shani message schduling and compression optimization.
iterate_(state, blocks);
iterate_native(state, blocks);
}
else if constexpr (vector)
{
Expand Down
7 changes: 3 additions & 4 deletions include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -50,27 +50,26 @@ template<size_t Round, typename xWord>
INLINE void CLASS::
vector_konstant(wbuffer_t<xWord>& wbuffer) NOEXCEPT
{
constexpr auto s = SHA::word_bits;
constexpr auto lanes = capacity<xWord, word_t>;
constexpr auto r = Round * lanes;

if constexpr (lanes == 16)
{
wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
wbuffer[Round] = add<word_t>(wbuffer[Round], set<xWord>(
K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3],
K::get[r + 4], K::get[r + 5], K::get[r + 6], K::get[r + 7],
K::get[r + 8], K::get[r + 9], K::get[r + 10], K::get[r + 11],
K::get[r + 12], K::get[r + 13], K::get[r + 14], K::get[r + 15]));
}
else if constexpr (lanes == 8)
{
wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
wbuffer[Round] = add<word_t>(wbuffer[Round], set<xWord>(
K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3],
K::get[r + 4], K::get[r + 5], K::get[r + 6], K::get[r + 7]));
}
else if constexpr (lanes == 4)
{
wbuffer[Round] = f::add<s>(wbuffer[Round], set<xWord>(
wbuffer[Round] = add<word_t>(wbuffer[Round], set<xWord>(
K::get[r + 0], K::get[r + 1], K::get[r + 2], K::get[r + 3]));
}
}
Expand Down
4 changes: 4 additions & 0 deletions include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,10 @@ merkle_hash_vector(digests_t& digests) NOEXCEPT
// ----------------------------------------------------------------------------
// public

// TODO: consider eliminating endianness conversions internal to the root
// computation, instead converting on the way in and way out only, and using
// non-converting input/output (nop) functions.

TEMPLATE
VCONSTEXPR typename CLASS::digest_t CLASS::
merkle_root(digests_t&& digests) NOEXCEPT
Expand Down
Loading
Loading