Skip to content

Commit

Permalink
Merge pull request #1554 from evoskuil/master
Browse files Browse the repository at this point in the history
Initial implementation of shani message scheduling.
  • Loading branch information
evoskuil authored Nov 25, 2024
2 parents 4d4d3e2 + 1e79de7 commit 0eac043
Show file tree
Hide file tree
Showing 8 changed files with 172 additions and 54 deletions.
2 changes: 1 addition & 1 deletion include/bitcoin/system/hash/rmd/algorithm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
#include <bitcoin/system/hash/algorithm.hpp>
#include <bitcoin/system/math/math.hpp>

// algorithm.hpp file is the common include for rmd.
// algorithm.hpp file is the common include for rmd.
#include <bitcoin/system/hash/rmd/rmd.hpp>
#include <bitcoin/system/hash/rmd/rmd128.hpp>
#include <bitcoin/system/hash/rmd/rmd160.hpp>
Expand Down
19 changes: 12 additions & 7 deletions include/bitcoin/system/hash/sha/algorithm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
#include <bitcoin/system/intrinsics/intrinsics.hpp>
#include <bitcoin/system/math/math.hpp>

// algorithm.hpp file is the common include for sha.
// algorithm.hpp file is the common include for sha.
#include <bitcoin/system/hash/sha/sha.hpp>
#include <bitcoin/system/hash/sha/sha160.hpp>
#include <bitcoin/system/hash/sha/sha256.hpp>
Expand Down Expand Up @@ -330,11 +330,15 @@ class algorithm

/// Native.
/// -----------------------------------------------------------------------
////using cword_t = xint128_t;
////static constexpr auto cratio = sizeof(cword_t) / SHA::word_bytes;
////static constexpr auto crounds = SHA::rounds / cratio;
////using cbuffer_t = std_array<cword_t, crounds>;
////using cstate_t = std_array<xint128_t, two>;
/// Number of word_t lanes in one 128 bit vector, per capacity<> (four for
/// 32 bit words, judging by the native add_k indexing - TODO confirm).
static constexpr auto native_lanes = capacity<xint128_t, word_t>;
/// Number of 128 bit vectors required to hold all SHA::rounds schedule words.
static constexpr auto native_rounds = SHA::rounds / native_lanes;
/// Message schedule buffer viewed as 128 bit vectors.
using cbuffer_t = std_array<xint128_t, native_rounds>;
/// Pair of 128 bit vectors (presumably the packed hash state, it is not
/// referenced by the scheduling code - TODO confirm).
using cstate_t = std_array<xint128_t, two>;

/// Compute schedule vector buffer[Round] and add K to buffer[Round - 4].
template<size_t Round>
INLINE static void prepare(cbuffer_t& buffer) NOEXCEPT;
/// Add K to the last four vectors (16 words) of the schedule buffer.
INLINE static void add_k(cbuffer_t& buffer) NOEXCEPT;
/// Expand the schedule buffer in place (all prepare rounds, then add_k).
static void schedule(cbuffer_t& buffer) NOEXCEPT;

/// Merkle extended buffer scheduling (forwards to non-native schedule_).
template <typename xWord>
INLINE static void schedule_native(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
Expand All @@ -356,7 +360,8 @@ class algorithm
/// Summary public values.
/// -----------------------------------------------------------------------
/// True when the Cached template argument is set.
static constexpr auto caching = Cached;

/// Native (shani/neon) dispatch is limited to words that are not 64 bits
/// wide, as sha512 native is not supported.
static constexpr auto native = (use_shani || use_neon) &&
    !is_same_size<word_t, uint64_t>;

/// Vector dispatch requires one of the x128/x256/x512 extensions, and is
/// excluded for 64 bit words on x32 builds.
static constexpr auto vector = (use_x128 || use_x256 || use_x512) &&
    !(build_x32 && is_same_size<word_t, uint64_t>);
};
Expand Down
2 changes: 2 additions & 0 deletions include/bitcoin/system/impl/hash/sha/algorithm_functions.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ TEMPLATE
INLINE constexpr auto CLASS::
parity(auto x, auto y, auto z) NOEXCEPT
{
    // Normal form, unmodified: (x ^ y) ^ z.
    const auto xy = f::xor_(x, y);
    return f::xor_(xy, z);
}

Expand All @@ -63,6 +64,7 @@ template <unsigned int A, unsigned int B, unsigned int C>
INLINE constexpr auto CLASS::
sigma(auto x) NOEXCEPT
{
    // Normal form, unmodified: (ror<A>(x) ^ ror<B>(x)) ^ shr<C>(x).
    constexpr auto bits = SHA::word_bits;
    const auto rotated = f::xor_(f::ror<A, bits>(x), f::ror<B, bits>(x));
    return f::xor_(rotated, f::shr<C, bits>(x));
}
Expand Down
159 changes: 150 additions & 9 deletions include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -27,30 +27,171 @@
// two state variables. This applies to sha160 and sha256, but sha512 native
// is not supported.

// The base buffer is already populated with proper endianness.
// Input could be optimized using intrinsics (see comments in parse).
// The unextended state vector is already output with proper endianness.
// Output could also be optimized using intrinsics (see comments in parse).

namespace libbitcoin {
namespace system {
namespace sha {

// protected
// ----------------------------------------------------------------------------
TEMPLATE
template<size_t Round>
INLINE void CLASS::
prepare(cbuffer_t& buffer) NOEXCEPT
{
    // Computes schedule vector buffer[Round] from the four preceding vectors
    // and then adds the round constants to buffer[Round - 4], which no later
    // round reads (each round reads only its four immediate predecessors).
    // K-adding is shifted 16 words, with last 16 added after scheduling.

    if constexpr (SHA::strength == 160)
    {
        ////static_assert(false, "sha160 not implemented");
    }
    else if constexpr (use_neon)
    {
        ////static_assert(false, "neon not implemented");
    }
    else
    {
        static_assert(SHA::strength == 256);

        // Vectors below 'first' hold the unexpanded block words and are only
        // inputs. A smaller Round would wrap the unsigned index arithmetic
        // below, and a larger one would write beyond the buffer.
        constexpr auto first = SHA::block_words / native_lanes;
        static_assert(Round >= first && Round < native_rounds);

        // Vector indexes of the four preceding schedule vectors.
        constexpr auto r1 = sub1(Round);
        constexpr auto r2 = sub1(r1);
        constexpr auto r3 = sub1(r2);
        constexpr auto r4 = sub1(r3);

        // Word indexes of the four K constants for vector r4.
        constexpr auto k0 = r4 * native_lanes;
        constexpr auto k1 = add1(k0);
        constexpr auto k2 = add1(k1);
        constexpr auto k3 = add1(k2);

        // The intrinsic wrappers are macros, keep the calls nested as is.
        buffer[Round] = mm_sha256msg2_epu32
        (
            mm_add_epi32
            (
                mm_alignr_epi8
                (
                    buffer[r1], buffer[r2], SHA::word_bytes
                ),
                mm_sha256msg1_epu32
                (
                    buffer[r4], buffer[r3]
                )
            ),
            buffer[r1]
        );

        buffer[r4] = mm_add_epi32
        (
            buffer[r4],
            mm_set_epi32(K::get[k3], K::get[k2], K::get[k1], K::get[k0])
        );
    }
}

TEMPLATE
INLINE void CLASS::
add_k(cbuffer_t& buffer) NOEXCEPT
{
    // Add K to last 16 words (the final four vectors of the buffer).
    // TODO: Consolidated K-adding can be performed in 4/8/16 lanes.

    // k is the first word index and r the first vector index of the tail.
    constexpr auto k = SHA::rounds - SHA::block_words;
    constexpr auto r = k / native_lanes;

    buffer[r + 0] = mm_add_epi32
    (
        buffer[r + 0],
        mm_set_epi32(
            K::get[k + 3], K::get[k + 2],
            K::get[k + 1], K::get[k + 0])
    );

    buffer[r + 1] = mm_add_epi32
    (
        buffer[r + 1],
        mm_set_epi32(
            K::get[k + 7], K::get[k + 6],
            K::get[k + 5], K::get[k + 4])
    );

    buffer[r + 2] = mm_add_epi32
    (
        buffer[r + 2],
        mm_set_epi32(
            K::get[k + 11], K::get[k + 10],
            K::get[k + 9], K::get[k + 8])
    );

    buffer[r + 3] = mm_add_epi32
    (
        buffer[r + 3],
        mm_set_epi32(
            K::get[k + 15], K::get[k + 14],
            K::get[k + 13], K::get[k + 12])
    );
}

TEMPLATE
INLINE void CLASS::
schedule(cbuffer_t& buffer) NOEXCEPT
{
    // Expands the schedule in place. The buffer is already an array of
    // xint128_t, so it is passed through without casting.
    // Each round computes one vector and adds K to the vector four back.
    prepare<4>(buffer);
    prepare<5>(buffer);
    prepare<6>(buffer);
    prepare<7>(buffer);
    prepare<8>(buffer);
    prepare<9>(buffer);
    prepare<10>(buffer);
    prepare<11>(buffer);
    prepare<12>(buffer);
    prepare<13>(buffer);
    prepare<14>(buffer);
    prepare<15>(buffer);

    ////if constexpr (SHA::rounds == 80)
    ////{
    ////    prepare<16>(buffer);
    ////    prepare<17>(buffer);
    ////    prepare<18>(buffer);
    ////    prepare<19>(buffer);
    ////}

    // The last four vectors have not yet had K added.
    add_k(buffer);
}

// schedule
// ----------------------------------------------------------------------------
// protected

TEMPLATE
INLINE void CLASS::
schedule_native(buffer_t& buffer) NOEXCEPT
{
    // The buffer must be scheduled exactly once, by one of the two paths.
    // neon and sha160 not yet implemented, sha512 is not native.
    if constexpr (SHA::strength == 160 || SHA::strength == 512 || use_neon)
    {
        // Non-native scheduling.
        schedule_(buffer);
    }
    else
    {
        // Reinterpret the word buffer as 128 bit vectors for shani.
        schedule(array_cast<xint128_t>(buffer));
    }
}

// Schedules a merkle extended buffer by forwarding to schedule_.
TEMPLATE
template <typename xWord>
INLINE void CLASS::
schedule_native(xbuffer_t<xWord>& xbuffer) NOEXCEPT
{
// Merkle extended buffer is not native dispatched.
schedule_(xbuffer);
}

// compression
// ----------------------------------------------------------------------------
// protected

TEMPLATE
template <typename xWord, size_t Lane>
INLINE void CLASS::
Expand All @@ -75,7 +216,7 @@ template <size_t Lane>
INLINE void CLASS::
compress_native(state_t& state, const buffer_t& buffer) NOEXCEPT
{
// Native compression is not yet implemented, forwards to compress_.
// TODO: Single block compression.
compress_<Lane>(state, buffer);
}

Expand Down
38 changes: 3 additions & 35 deletions include/bitcoin/system/impl/hash/sha/algorithm_schedule.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,7 @@ template<size_t Round>
INLINE constexpr void CLASS::
prepare(auto& buffer) NOEXCEPT
{
// K is added to schedule words because schedule is vectorizable.
// This allows 3/4 of the cost of the K addition to be vectorized.
// K-adding is shifted -16, with last 16 added after scheduling.
// K-adding is shifted 16 words, with last 16 added after scheduling.
constexpr auto s = SHA::word_bits;

if constexpr (SHA::strength == 160)
Expand All @@ -53,24 +51,6 @@ prepare(auto& buffer) NOEXCEPT
f::xor_(buffer[r08], buffer[r03])));

buffer[r16] = f::addc<K::get[r16], s>(buffer[r16]);

// SHA-NI
//
// buffer[Round] = sha1msg2 // xor and rotl1
// (
// xor // not using sha1msg1
// (
// sha1msg1 // xor (specialized)
// (
// buffer[Round - 16],
// buffer[Round - 14]
// ),
// buffer[Round - 8]
// ),
// buffer[Round - 3]
// );
// NEON
// vsha1su1q/vsha1su0q
}
else
{
Expand All @@ -84,18 +64,6 @@ prepare(auto& buffer) NOEXCEPT
f::add<s>(buffer[r07], sigma1(buffer[r02])));

buffer[r16] = f::addc<K::get[r16], s>(buffer[r16]);

// Each word is 128, buffer goes from 64 to 16 words.
// SHA-NI
// buffer[Round] =
// sha256msg1(buffer[Round - 16], buffer[Round - 15]) +
// sha256msg2(buffer[Round - 7], buffer[Round - 2]);
// NEON
// Not sure about these indexes.
// mijailovic.net/2018/06/06/sha256-armv8
// buffer[Round] =
// vsha256su0q(buffer[Round - 13], buffer[Round - 9]) +
// vsha256su1q(buffer[Round - 13], buffer[Round - 5], buffer[Round - 1]);
}
}

Expand All @@ -104,8 +72,9 @@ INLINE constexpr void CLASS::
add_k(auto& buffer) NOEXCEPT
{
// Add K to last 16 words.
// TODO: Consolidated K-adding can be performed in 4/8/16 lanes.
constexpr auto s = SHA::word_bits;
constexpr auto r = SHA::rounds - array_count<words_t>;
constexpr auto r = SHA::rounds - SHA::block_words;
buffer[r + 0] = f::addc<K::get[r + 0], s>(buffer[r + 0]);
buffer[r + 1] = f::addc<K::get[r + 1], s>(buffer[r + 1]);
buffer[r + 2] = f::addc<K::get[r + 2], s>(buffer[r + 2]);
Expand All @@ -128,7 +97,6 @@ TEMPLATE
constexpr void CLASS::
schedule_(auto& buffer) NOEXCEPT
{

prepare<16>(buffer);
prepare<17>(buffer);
prepare<18>(buffer);
Expand Down
2 changes: 2 additions & 0 deletions include/bitcoin/system/intrinsics/xcpu/defines.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ BC_POP_WARNING()
#endif

#if !defined(HAVE_SSE41)
#define mm_alignr_epi8(a, b, c) {}
#define mm_and_si128(a, b) (a)
#define mm_or_si128(a, b) (a)
#define mm_xor_si128(a, b) (a)
Expand Down Expand Up @@ -145,6 +146,7 @@ BC_POP_WARNING()
#define mm_set_epi16(x08, x07, x06, x05, x04, x03, x02, x01)
#define mm_set_epi8(x16, x15, x14, x13, x12, x11, x10, x09, x08, x07, x06, x05, x04, x03, x02, x01)
#else
#define mm_alignr_epi8(a, b, c) _mm_alignr_epi8(a, b, c) // for native sha (128 only)
#define mm_and_si128(a, b) _mm_and_si128(a, b)
#define mm_or_si128(a, b) _mm_or_si128(a, b)
#define mm_xor_si128(a, b) _mm_xor_si128(a, b)
Expand Down
2 changes: 1 addition & 1 deletion test/hash/performance/performance.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,7 +212,7 @@ using hash_selector = iif<Ripemd, rmd_algorithm<Strength>,

// The expected value is parenthesized because == binds tighter than ||,
// otherwise these assertions are vacuously true whenever with_neon is set.
static_assert(hash_selector< 160, true, true, true, false>::native == (with_shani || with_neon));
static_assert(hash_selector< 256, true, true, true, false>::native == (with_shani || with_neon));

// sha512 is never native, regardless of shani/neon availability.
static_assert(!hash_selector<512, true, true, true, false>::native);
static_assert(!hash_selector<160, false, true, true, false>::native);
static_assert(!hash_selector<256, false, true, true, false>::native);
static_assert(!hash_selector<512, false, true, true, false>::native);
Expand Down
2 changes: 1 addition & 1 deletion test/hash/sha/sha512.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
BOOST_AUTO_TEST_SUITE(sha512_tests_)

// Expected dispatch for sha512: vectorized only off x32 builds, and never
// native, as there is no native sha512 implementation.
constexpr auto vector = (with_sse41 || with_avx2 || with_avx512) && !build_x32;
constexpr auto native = /*(with_shani || with_neon)*/ false;

BOOST_AUTO_TEST_CASE(sha512__hash__null_hash__expected)
{
Expand Down

0 comments on commit 0eac043

Please sign in to comment.