Skip to content

Commit

Permalink
Merge pull request #1557 from evoskuil/master
Browse files Browse the repository at this point in the history
Fix performance test vectorization regression.
  • Loading branch information
evoskuil authored Dec 3, 2024
2 parents 572f3b9 + d524274 commit 4a0fb22
Show file tree
Hide file tree
Showing 5 changed files with 112 additions and 86 deletions.
4 changes: 2 additions & 2 deletions include/bitcoin/system/hash/sha/algorithm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ class algorithm

template<size_t Round>
INLINE static void prepare_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
static void schedule(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
static void schedule_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;

template <typename xWord>
INLINE static void schedule_native(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
Expand All @@ -375,7 +375,7 @@ class algorithm
const wstate_t<xint128_t>& in) NOEXCEPT;

template <size_t Lane>
INLINE static void compress_native(wstate_t<xint128_t>& state,
static void compress_native(wstate_t<xint128_t>& state,
const wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;

template <typename xWord, size_t Lane>
Expand Down
28 changes: 14 additions & 14 deletions include/bitcoin/system/impl/hash/sha/algorithm_native.ipp
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,8 @@ prepare_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT
}

TEMPLATE
INLINE void CLASS::
schedule(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT
void CLASS::
schedule_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT
{
prepare_native<4>(wbuffer);
prepare_native<5>(wbuffer);
Expand Down Expand Up @@ -115,7 +115,7 @@ schedule_native(buffer_t& buffer) NOEXCEPT
// neon and sha160 not yet implemented, sha512 is not native.
if constexpr (SHA::strength == 256 && !use_neon)
{
schedule(array_cast<xint128_t>(buffer));
schedule_native(array_cast<xint128_t>(buffer));
}
else
{
Expand Down Expand Up @@ -203,8 +203,8 @@ shuffle(wstate_t<xint128_t>& wstate) NOEXCEPT
// [ABCD][EFGH] -> [FEBA][HGDC] (ordered low to high).
const auto t1 = mm_shuffle_epi32(wstate[0], 0xb1);
const auto t2 = mm_shuffle_epi32(wstate[1], 0x1b);
wstate[0] = mm_alignr_epi8(t1, t2, 8);
wstate[1] = mm_blend_epi16(t2, t1, 15);
wstate[0] = mm_alignr_epi8(t1, t2, 0x08);
wstate[1] = mm_blend_epi16(t2, t1, 0xf0);
}

TEMPLATE
Expand All @@ -215,13 +215,13 @@ unshuffle(wstate_t<xint128_t>& wstate) NOEXCEPT
// [FEBA][HGDC] -> [ABCD][EFGH] (ordered low to high).
const auto t1 = mm_shuffle_epi32(wstate[0], 0x1b);
const auto t2 = mm_shuffle_epi32(wstate[1], 0xb1);
wstate[0] = mm_blend_epi16(t1, t2, 15);
wstate[1] = mm_alignr_epi8(t2, t1, 8);
wstate[0] = mm_blend_epi16(t1, t2, 0xf0);
wstate[1] = mm_alignr_epi8(t2, t1, 0x08);
}

TEMPLATE
template <size_t Lane>
INLINE void CLASS::
void CLASS::
compress_native(wstate_t<xint128_t>& wstate,
const wbuffer_t<xint128_t>& wbuffer) NOEXCEPT
{
Expand Down Expand Up @@ -291,12 +291,12 @@ compress_native(state_t& state, const buffer_t& buffer) NOEXCEPT
// TODO: debug.
// TODO: sha160 state is too small to array cast into two xwords.
// neon and sha160 not yet implemented, sha512 is not native.
////if constexpr (SHA::strength == 256 && !use_neon)
////{
//// compress_native<Lane>(array_cast<xint128_t>(state),
//// array_cast<xint128_t>(buffer));
////}
////else
if constexpr (SHA::strength == 256 && !use_neon)
{
compress_native<Lane>(array_cast<xint128_t>(state),
array_cast<xint128_t>(buffer));
}
else
{
compress_<Lane>(state, buffer);
}
Expand Down
2 changes: 1 addition & 1 deletion src/define.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
// /unicode : define
// /intrinsics : define
// /math : /intrinsics
// /data : /math
// /data : /math /unicode
// /endian : /data
// /words : /data
// /radix : /words
Expand Down
156 changes: 91 additions & 65 deletions test/hash/performance/performance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,121 +74,147 @@ struct v4
// Template arguments shared by the merkle performance test cases in this file:
// mr::c is passed as the second template argument of test_merkle on every run,
// mr::s as the third template argument of the first run only.
// NOTE(review): presumably c is a count and s a size - confirm against
// test_merkle in performance.hpp.
struct mr
{
static constexpr size_t c = 10 * 1024;
static constexpr size_t s = 3;
////static constexpr size_t s = 32;
};

// Runs test_merkle for sha256a_none with mr::c, once per value of the third
// template argument (mr::s, then 1, 2, 3, 4 and the powers of two up to 4096),
// writing each run to std::cout. The results are folded into one flag, so the
// test case passes only if every run yields true.
// NOTE(review): mr::s is 3 in the visible struct mr, so the first run repeats
// the explicit size-3 run below - confirm whether it is still wanted.
BOOST_AUTO_TEST_CASE(performance__sha256a_none__merkle)
{
auto complete = true;
complete = test_merkle<sha256a_none, mr::c, mr::s>(std::cout);
complete &= test_merkle<sha256a_none, mr::c, 1>(std::cout);
complete &= test_merkle<sha256a_none, mr::c, 2>(std::cout);
complete &= test_merkle<sha256a_none, mr::c, 3>(std::cout);
complete &= test_merkle<sha256a_none, mr::c, 4>(std::cout);
complete &= test_merkle<sha256a_none, mr::c, 8>(std::cout);
complete &= test_merkle<sha256a_none, mr::c, 16>(std::cout);
complete &= test_merkle<sha256a_none, mr::c, 32>(std::cout);
complete &= test_merkle<sha256a_none, mr::c, 64>(std::cout);
complete &= test_merkle<sha256a_none, mr::c, 128>(std::cout);
complete &= test_merkle<sha256a_none, mr::c, 256>(std::cout);
complete &= test_merkle<sha256a_none, mr::c, 512>(std::cout);
complete &= test_merkle<sha256a_none, mr::c, 1024>(std::cout);
complete &= test_merkle<sha256a_none, mr::c, 2048>(std::cout);
complete &= test_merkle<sha256a_none, mr::c, 4096>(std::cout);
BOOST_CHECK(complete);
}

// Runs test_merkle for sha256a_vect with mr::c, once per value of the third
// template argument (mr::s, then 1, 2, 3, 4 and the powers of two up to 4096),
// writing each run to std::cout. The results are folded into one flag, so the
// test case passes only if every run yields true.
// NOTE(review): mr::s is 3 in the visible struct mr, so the first run repeats
// the explicit size-3 run below - confirm whether it is still wanted.
BOOST_AUTO_TEST_CASE(performance__sha256a_vect__merkle)
{
auto complete = true;
complete = test_merkle<sha256a_vect, mr::c, mr::s>(std::cout);
complete &= test_merkle<sha256a_vect, mr::c, 1>(std::cout);
complete &= test_merkle<sha256a_vect, mr::c, 2>(std::cout);
complete &= test_merkle<sha256a_vect, mr::c, 3>(std::cout);
complete &= test_merkle<sha256a_vect, mr::c, 4>(std::cout);
complete &= test_merkle<sha256a_vect, mr::c, 8>(std::cout);
complete &= test_merkle<sha256a_vect, mr::c, 16>(std::cout);
complete &= test_merkle<sha256a_vect, mr::c, 32>(std::cout);
complete &= test_merkle<sha256a_vect, mr::c, 64>(std::cout);
complete &= test_merkle<sha256a_vect, mr::c, 128>(std::cout);
complete &= test_merkle<sha256a_vect, mr::c, 256>(std::cout);
complete &= test_merkle<sha256a_vect, mr::c, 512>(std::cout);
complete &= test_merkle<sha256a_vect, mr::c, 1024>(std::cout);
complete &= test_merkle<sha256a_vect, mr::c, 2048>(std::cout);
complete &= test_merkle<sha256a_vect, mr::c, 4096>(std::cout);
BOOST_CHECK(complete);
}

// !using shax (see performance.hpp)

// Runs base::test_hash for base_sha256a once per parameter set (v0 through v4,
// each supplying the c and s template arguments), writing each run to
// std::cout. The test case passes only if every run yields true.
BOOST_AUTO_TEST_CASE(performance__base_sha256a)
{
    // Accumulate with &= rather than =: plain assignment overwrites the flag
    // on each run, so only the v4 result would reach BOOST_CHECK and a failure
    // in v0..v3 would be silently lost.
    auto complete = true;
    complete &= base::test_hash<base_sha256a, v0::c, v0::s>(std::cout);
    complete &= base::test_hash<base_sha256a, v1::c, v1::s>(std::cout);
    complete &= base::test_hash<base_sha256a, v2::c, v2::s>(std::cout);
    complete &= base::test_hash<base_sha256a, v3::c, v3::s>(std::cout);
    complete &= base::test_hash<base_sha256a, v4::c, v4::s>(std::cout);
    BOOST_CHECK(complete);
}

// Runs test_accumulator for sha256a_none once per parameter set (v0 through
// v4, each supplying the c and s template arguments), writing each run to
// std::cout. The test case passes only if every run yields true.
BOOST_AUTO_TEST_CASE(performance__sha256a_none)
{
    // Accumulate with &= rather than =: plain assignment overwrites the flag
    // on each run, so only the v4 result would reach BOOST_CHECK and a failure
    // in v0..v3 would be silently lost.
    auto complete = true;
    complete &= test_accumulator<sha256a_none, v0::c, v0::s>(std::cout);
    complete &= test_accumulator<sha256a_none, v1::c, v1::s>(std::cout);
    complete &= test_accumulator<sha256a_none, v2::c, v2::s>(std::cout);
    complete &= test_accumulator<sha256a_none, v3::c, v3::s>(std::cout);
    complete &= test_accumulator<sha256a_none, v4::c, v4::s>(std::cout);
    BOOST_CHECK(complete);
}

// Runs test_accumulator for sha256a_vect once per parameter set (v0 through
// v4, each supplying the c and s template arguments), writing each run to
// std::cout. The test case passes only if every run yields true.
BOOST_AUTO_TEST_CASE(performance__sha256a_vect)
{
    // Accumulate with &= rather than =: plain assignment overwrites the flag
    // on each run, so only the v4 result would reach BOOST_CHECK and a failure
    // in v0..v3 would be silently lost.
    auto complete = true;
    complete &= test_accumulator<sha256a_vect, v0::c, v0::s>(std::cout);
    complete &= test_accumulator<sha256a_vect, v1::c, v1::s>(std::cout);
    complete &= test_accumulator<sha256a_vect, v2::c, v2::s>(std::cout);
    complete &= test_accumulator<sha256a_vect, v3::c, v3::s>(std::cout);
    complete &= test_accumulator<sha256a_vect, v4::c, v4::s>(std::cout);
    BOOST_CHECK(complete);
}
////BOOST_AUTO_TEST_CASE(performance__base_sha256a)
////{
//// auto complete = true;
//// complete &= base::test_hash<base_sha256a, v0::c, v0::s>(std::cout);
//// complete &= base::test_hash<base_sha256a, v1::c, v1::s>(std::cout);
//// complete &= base::test_hash<base_sha256a, v2::c, v2::s>(std::cout);
//// complete &= base::test_hash<base_sha256a, v3::c, v3::s>(std::cout);
//// complete &= base::test_hash<base_sha256a, v4::c, v4::s>(std::cout);
//// BOOST_CHECK(complete);
////}
////
////BOOST_AUTO_TEST_CASE(performance__sha256a_none)
////{
//// auto complete = true;
//// complete &= test_accumulator<sha256a_none, v0::c, v0::s>(std::cout);
//// complete &= test_accumulator<sha256a_none, v1::c, v1::s>(std::cout);
//// complete &= test_accumulator<sha256a_none, v2::c, v2::s>(std::cout);
//// complete &= test_accumulator<sha256a_none, v3::c, v3::s>(std::cout);
//// complete &= test_accumulator<sha256a_none, v4::c, v4::s>(std::cout);
//// BOOST_CHECK(complete);
////}
////
////BOOST_AUTO_TEST_CASE(performance__sha256a_vect)
////{
//// auto complete = true;
//// complete &= test_accumulator<sha256a_vect, v0::c, v0::s>(std::cout);
//// complete &= test_accumulator<sha256a_vect, v1::c, v1::s>(std::cout);
//// complete &= test_accumulator<sha256a_vect, v2::c, v2::s>(std::cout);
//// complete &= test_accumulator<sha256a_vect, v3::c, v3::s>(std::cout);
//// complete &= test_accumulator<sha256a_vect, v4::c, v4::s>(std::cout);
//// BOOST_CHECK(complete);
////}

////BOOST_AUTO_TEST_CASE(performance__rmd160__baseline)
////{
//// auto complete = true;
//// complete = base::test_hash<base_rmd160a, v0::c, v0::s>(std::cout);
//// complete = base::test_hash<base_rmd160a, v1::c, v1::s>(std::cout);
//// complete = base::test_hash<base_rmd160a, v2::c, v2::s>(std::cout);
//// complete = base::test_hash<base_rmd160a, v3::c, v3::s>(std::cout);
//// complete = base::test_hash<base_rmd160a, v4::c, v4::s>(std::cout);
//// complete &= base::test_hash<base_rmd160a, v0::c, v0::s>(std::cout);
//// complete &= base::test_hash<base_rmd160a, v1::c, v1::s>(std::cout);
//// complete &= base::test_hash<base_rmd160a, v2::c, v2::s>(std::cout);
//// complete &= base::test_hash<base_rmd160a, v3::c, v3::s>(std::cout);
//// complete &= base::test_hash<base_rmd160a, v4::c, v4::s>(std::cout);
//// BOOST_CHECK(complete);
////}
////
////BOOST_AUTO_TEST_CASE(performance__rmd160__algorithm)
////{
//// auto complete = true;
//// complete = test_accumulator<rmd160a, v0::c, v0::s>(std::cout);
//// complete = test_accumulator<rmd160a, v1::c, v1::s>(std::cout);
//// complete = test_accumulator<rmd160a, v2::c, v2::s>(std::cout);
//// complete = test_accumulator<rmd160a, v3::c, v3::s>(std::cout);
//// complete = test_accumulator<rmd160a, v4::c, v4::s>(std::cout);
//// complete &= test_accumulator<rmd160a, v0::c, v0::s>(std::cout);
//// complete &= test_accumulator<rmd160a, v1::c, v1::s>(std::cout);
//// complete &= test_accumulator<rmd160a, v2::c, v2::s>(std::cout);
//// complete &= test_accumulator<rmd160a, v3::c, v3::s>(std::cout);
//// complete &= test_accumulator<rmd160a, v4::c, v4::s>(std::cout);
//// BOOST_CHECK(complete);
////}

////BOOST_AUTO_TEST_CASE(performance__sha256__accumulator)
////{
//// auto complete = true;
//// complete = test_accumulator<sha256a, v0::c, v0::s>(std::cout);
//// complete = test_accumulator<sha256a, v1::c, v1::s>(std::cout);
//// complete = test_accumulator<sha256a, v2::c, v2::s>(std::cout);
//// complete = test_accumulator<sha256a, v3::c, v3::s>(std::cout);
//// complete = test_accumulator<sha256a, v4::c, v4::s>(std::cout);
//// complete &= test_accumulator<sha256a, v0::c, v0::s>(std::cout);
//// complete &= test_accumulator<sha256a, v1::c, v1::s>(std::cout);
//// complete &= test_accumulator<sha256a, v2::c, v2::s>(std::cout);
//// complete &= test_accumulator<sha256a, v3::c, v3::s>(std::cout);
//// complete &= test_accumulator<sha256a, v4::c, v4::s>(std::cout);
//// BOOST_CHECK(complete);
////}
////
////BOOST_AUTO_TEST_CASE(performance__rmd160__accumulator)
////{
//// auto complete = true;
//// complete = test_accumulator<rmd160a, v0::c, v0::s>(std::cout);
//// complete = test_accumulator<rmd160a, v1::c, v1::s>(std::cout);
//// complete = test_accumulator<rmd160a, v2::c, v2::s>(std::cout);
//// complete = test_accumulator<rmd160a, v3::c, v3::s>(std::cout);
//// complete = test_accumulator<rmd160a, v4::c, v4::s>(std::cout);
//// complete &= test_accumulator<rmd160a, v0::c, v0::s>(std::cout);
//// complete &= test_accumulator<rmd160a, v1::c, v1::s>(std::cout);
//// complete &= test_accumulator<rmd160a, v2::c, v2::s>(std::cout);
//// complete &= test_accumulator<rmd160a, v3::c, v3::s>(std::cout);
//// complete &= test_accumulator<rmd160a, v4::c, v4::s>(std::cout);
//// BOOST_CHECK(complete);
////}

////BOOST_AUTO_TEST_CASE(performance__sha256c_cached__accumulator)
////{
//// auto complete = true;
//// complete = test_accumulator<sha256c_cached, v0::c, v0::s>(std::cout);
//// complete = test_accumulator<sha256c_cached, v1::c, v1::s>(std::cout);
//// complete = test_accumulator<sha256c_cached, v2::c, v2::s>(std::cout);
//// complete = test_accumulator<sha256c_cached, v3::c, v3::s>(std::cout);
//// complete = test_accumulator<sha256c_cached, v4::c, v4::s>(std::cout);
//// complete &= test_accumulator<sha256c_cached, v0::c, v0::s>(std::cout);
//// complete &= test_accumulator<sha256c_cached, v1::c, v1::s>(std::cout);
//// complete &= test_accumulator<sha256c_cached, v2::c, v2::s>(std::cout);
//// complete &= test_accumulator<sha256c_cached, v3::c, v3::s>(std::cout);
//// complete &= test_accumulator<sha256c_cached, v4::c, v4::s>(std::cout);
//// BOOST_CHECK(complete);
////}
////
////BOOST_AUTO_TEST_CASE(performance__sha256c_uncached__accumulator)
////{
//// auto complete = true;
//// complete = test_accumulator<sha256c_uncached, v0::c, v0::s>(std::cout);
//// complete = test_accumulator<sha256c_uncached, v1::c, v1::s>(std::cout);
//// complete = test_accumulator<sha256c_uncached, v2::c, v2::s>(std::cout);
//// complete = test_accumulator<sha256c_uncached, v3::c, v3::s>(std::cout);
//// complete = test_accumulator<sha256c_uncached, v4::c, v4::s>(std::cout);
//// complete &= test_accumulator<sha256c_uncached, v0::c, v0::s>(std::cout);
//// complete &= test_accumulator<sha256c_uncached, v1::c, v1::s>(std::cout);
//// complete &= test_accumulator<sha256c_uncached, v2::c, v2::s>(std::cout);
//// complete &= test_accumulator<sha256c_uncached, v3::c, v3::s>(std::cout);
//// complete &= test_accumulator<sha256c_uncached, v4::c, v4::s>(std::cout);
//// BOOST_CHECK(complete);
////}

Expand Down
8 changes: 4 additions & 4 deletions test/hash/performance/performance.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ constexpr auto cycles_per_byte(float seconds, float ghz) noexcept
struct parameters
{
static constexpr size_t strength{}; // algorithm strength (160/256/512|128/160).
static constexpr bool native{}; // intrinsic sha (ignored for rmd).
static constexpr bool vector{}; // algorithm vectorization.
static constexpr bool native{}; // intrinsic sha (ignored for rmd).
static constexpr bool vector{}; // algorithm vectorization.
static constexpr bool cached{}; // scheduled pad caching.
static constexpr bool chunked{}; // false for array data.
static constexpr bool ripemd{}; // false for sha algorithm.
Expand Down Expand Up @@ -99,7 +99,7 @@ void output(std::ostream& out, uint64_t time, float ghz, bool csv) noexcept
<< delimiter
<< "bytes_per_round_: " << serialize(Size)
<< delimiter
<< "Native______: " << serialize(P::native)
<< "native__________: " << serialize(P::native)
<< delimiter
<< "vectorized______: " << serialize(P::vector)
<< delimiter
Expand Down Expand Up @@ -356,7 +356,7 @@ struct sha256_parameters : parameters
{
static constexpr size_t strength{ 256 };
static constexpr bool native{ Native };
static constexpr bool vectorized{ Vector };
static constexpr bool vector{ Vector };
static constexpr bool cached{ Cached };
static constexpr bool chunked{ Chunked };
static constexpr bool ripemd{};
Expand Down

0 comments on commit 4a0fb22

Please sign in to comment.