Skip to content

Commit

Permalink
Merge pull request #1555 from evoskuil/master
Browse files Browse the repository at this point in the history
Refactor sha algorithm, fix perf test drift, comments.
  • Loading branch information
evoskuil authored Nov 26, 2024
2 parents 0eac043 + 70af8b1 commit 572f3b9
Show file tree
Hide file tree
Showing 19 changed files with 590 additions and 2,979 deletions.
7 changes: 1 addition & 6 deletions Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -90,12 +90,6 @@ src_libbitcoin_system_la_SOURCES = \
src/hash/accumulator.cpp \
src/hash/checksum.cpp \
src/hash/siphash.cpp \
src/hash/vectorization/sha256_1_native.cpp \
src/hash/vectorization/sha256_2_shani.cpp \
src/hash/vectorization/sha256_4_neon.cpp \
src/hash/vectorization/sha256_4_sse4.cpp \
src/hash/vectorization/sha256_4_sse41.cpp \
src/hash/vectorization/sha256_8_avx2.cpp \
src/math/math.cpp \
src/radix/base_10.cpp \
src/radix/base_2048.cpp \
Expand Down Expand Up @@ -622,6 +616,7 @@ include_bitcoin_system_impl_hash_sha_HEADERS = \
include/bitcoin/system/impl/hash/sha/algorithm_double.ipp \
include/bitcoin/system/impl/hash/sha/algorithm_functions.ipp \
include/bitcoin/system/impl/hash/sha/algorithm_iterate.ipp \
include/bitcoin/system/impl/hash/sha/algorithm_konstant.ipp \
include/bitcoin/system/impl/hash/sha/algorithm_merkle.ipp \
include/bitcoin/system/impl/hash/sha/algorithm_native.ipp \
include/bitcoin/system/impl/hash/sha/algorithm_padding.ipp \
Expand Down
6 changes: 0 additions & 6 deletions builds/cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -529,12 +529,6 @@ add_library( ${CANONICAL_LIB_NAME}
"../../src/hash/accumulator.cpp"
"../../src/hash/checksum.cpp"
"../../src/hash/siphash.cpp"
"../../src/hash/vectorization/sha256_1_native.cpp"
"../../src/hash/vectorization/sha256_2_shani.cpp"
"../../src/hash/vectorization/sha256_4_neon.cpp"
"../../src/hash/vectorization/sha256_4_sse4.cpp"
"../../src/hash/vectorization/sha256_4_sse41.cpp"
"../../src/hash/vectorization/sha256_8_avx2.cpp"
"../../src/math/math.cpp"
"../../src/radix/base_10.cpp"
"../../src/radix/base_2048.cpp"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -155,12 +155,6 @@
<ClCompile Include="..\..\..\..\src\hash\accumulator.cpp" />
<ClCompile Include="..\..\..\..\src\hash\checksum.cpp" />
<ClCompile Include="..\..\..\..\src\hash\siphash.cpp" />
<ClCompile Include="..\..\..\..\src\hash\vectorization\sha256_1_native.cpp" />
<ClCompile Include="..\..\..\..\src\hash\vectorization\sha256_2_shani.cpp" />
<ClCompile Include="..\..\..\..\src\hash\vectorization\sha256_4_neon.cpp" />
<ClCompile Include="..\..\..\..\src\hash\vectorization\sha256_4_sse4.cpp" />
<ClCompile Include="..\..\..\..\src\hash\vectorization\sha256_4_sse41.cpp" />
<ClCompile Include="..\..\..\..\src\hash\vectorization\sha256_8_avx2.cpp" />
<ClCompile Include="..\..\..\..\src\math\math.cpp" />
<ClCompile Include="..\..\..\..\src\radix\base_10.cpp" />
<ClCompile Include="..\..\..\..\src\radix\base_2048.cpp" />
Expand Down Expand Up @@ -548,6 +542,7 @@
<None Include="..\..\..\..\include\bitcoin\system\impl\hash\sha\algorithm_double.ipp" />
<None Include="..\..\..\..\include\bitcoin\system\impl\hash\sha\algorithm_functions.ipp" />
<None Include="..\..\..\..\include\bitcoin\system\impl\hash\sha\algorithm_iterate.ipp" />
<None Include="..\..\..\..\include\bitcoin\system\impl\hash\sha\algorithm_konstant.ipp" />
<None Include="..\..\..\..\include\bitcoin\system\impl\hash\sha\algorithm_merkle.ipp" />
<None Include="..\..\..\..\include\bitcoin\system\impl\hash\sha\algorithm_native.ipp" />
<None Include="..\..\..\..\include\bitcoin\system\impl\hash\sha\algorithm_padding.ipp" />
Expand Down
140 changes: 61 additions & 79 deletions builds/msvc/vs2022/libbitcoin-system/libbitcoin-system.vcxproj.filters

Large diffs are not rendered by default.

61 changes: 48 additions & 13 deletions include/bitcoin/system/hash/sha/algorithm.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,8 @@ class algorithm
/// Intrinsics types.
/// -----------------------------------------------------------------------

/// Extended integer capacity for uint32_t/uint64_t is 2/4/8/16 only.
/// Expanded types are a multiple of buffer/state for Lanes concurrent blocks.
/// Multiple blocks are "striped" across the expanded buffer in xWords.
template <size_t Lanes, bool_if<is_valid_lanes<Lanes>> = true>
using xblock_t = std_array<words_t, Lanes>;

Expand All @@ -157,6 +158,17 @@ class algorithm
template <typename xWord, if_extended<xWord> = true>
using xchunk_t = std_array<xWord, SHA::state_words>;

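/// Wide types are a cast of buffer_t/state_t to xWord elements, for single
/// block concurrency. The total size is unchanged (sizeof(buffer_t) /
/// sizeof(xWord) elements). This is not multi-block or block striping, just
/// larger words.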
template <typename xWord, if_extended<xWord> = true>
using wbuffer_t = std_array<xWord, sizeof(buffer_t) / sizeof(xWord)>;

template <typename xWord, if_extended<xWord> = true>
using wstate_t = std_array<xWord, sizeof(state_t) / sizeof(xWord)>;

/// Other types.
/// -----------------------------------------------------------------------

using uint = unsigned int;
using idigests_t = mutable_iterable<digest_t>;
using pad_t = std_array<word_t, subtract(SHA::block_words,
Expand Down Expand Up @@ -210,7 +222,6 @@ class algorithm

template <size_t Round>
INLINE static constexpr void prepare(auto& buffer) NOEXCEPT;
INLINE static constexpr void add_k(auto& buffer) NOEXCEPT;
static constexpr void schedule_(auto& buffer) NOEXCEPT;
static constexpr void schedule(buffer_t& buffer) NOEXCEPT;

Expand Down Expand Up @@ -242,7 +253,7 @@ class algorithm

static constexpr void reinput(auto& buffer, const auto& state) NOEXCEPT;

/// Iteration.
/// Iteration (message scheduling vectorized for multiple blocks).
/// -----------------------------------------------------------------------

template <size_t Word, size_t Lanes>
Expand Down Expand Up @@ -280,7 +291,7 @@ class algorithm
const ablocks_t<Size>& blocks) NOEXCEPT;
INLINE static void iterate(state_t& state, iblocks_t& blocks) NOEXCEPT;

/// Merkle hashing.
/// Merkle hashing (fully vectorized for multiple blocks).
/// -----------------------------------------------------------------------

template <typename xWord>
Expand Down Expand Up @@ -311,7 +322,7 @@ class algorithm
VCONSTEXPR static void merkle_hash_(digests_t& digests,
size_t offset=zero) NOEXCEPT;

/// sigma0 vectorization.
/// sigma0 vectorization (single blocks).
/// -----------------------------------------------------------------------

template <typename xWord, if_extended<xWord> = true>
Expand All @@ -328,22 +339,45 @@ class algorithm
INLINE static void schedule_sigma(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
INLINE static void schedule_sigma(buffer_t& buffer) NOEXCEPT;

/// Native.
/// [K]onstant vectorization (single and multiple blocks).
/// -----------------------------------------------------------------------

template <size_t Round>
INLINE static constexpr void konstant(auto& buffer) NOEXCEPT;

template<size_t Round, typename xWord>
INLINE static void vector_konstant(wbuffer_t<xWord>& wbuffer) NOEXCEPT;
INLINE static void vector_konstant(buffer_t& buffer) NOEXCEPT;

template <typename xWord>
static constexpr void konstant(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
static constexpr void konstant(buffer_t& buffer) NOEXCEPT;
static constexpr void konstant_(auto& buffer) NOEXCEPT;

/// Native SHA optimizations (single blocks).
/// -----------------------------------------------------------------------
static constexpr auto native_lanes = capacity<xint128_t, word_t>;
static constexpr auto native_rounds = SHA::rounds / native_lanes;
using cbuffer_t = std_array<xint128_t, native_rounds>;
using cstate_t = std_array<xint128_t, two>;

template<size_t Round>
INLINE static void prepare(cbuffer_t& buffer) NOEXCEPT;
INLINE static void add_k(cbuffer_t& buffer) NOEXCEPT;
static void schedule(cbuffer_t& buffer) NOEXCEPT;
INLINE static void prepare_native(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;
static void schedule(wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;

template <typename xWord>
INLINE static void schedule_native(xbuffer_t<xWord>& xbuffer) NOEXCEPT;
INLINE static void schedule_native(buffer_t& buffer) NOEXCEPT;

template<size_t Round, size_t Lane>
INLINE static void round_native(wstate_t<xint128_t>& state,
const wbuffer_t<xint128_t>& wk) NOEXCEPT;

INLINE static void shuffle(wstate_t<xint128_t>& wstate) NOEXCEPT;
INLINE static void unshuffle(wstate_t<xint128_t>& wstate) NOEXCEPT;
INLINE static void summarize_native(wstate_t<xint128_t>& out,
const wstate_t<xint128_t>& in) NOEXCEPT;

template <size_t Lane>
INLINE static void compress_native(wstate_t<xint128_t>& state,
const wbuffer_t<xint128_t>& wbuffer) NOEXCEPT;

template <typename xWord, size_t Lane>
INLINE static void compress_native(xstate_t<xWord>& xstate,
const xbuffer_t<xWord>& xbuffer) NOEXCEPT;
Expand Down Expand Up @@ -381,6 +415,7 @@ BC_PUSH_WARNING(NO_POINTER_ARITHMETIC)
BC_PUSH_WARNING(NO_ARRAY_INDEXING)

#include <bitcoin/system/impl/hash/sha/algorithm_compress.ipp>
#include <bitcoin/system/impl/hash/sha/algorithm_konstant.ipp>
#include <bitcoin/system/impl/hash/sha/algorithm_double.ipp>
#include <bitcoin/system/impl/hash/sha/algorithm_functions.ipp>
#include <bitcoin/system/impl/hash/sha/algorithm_iterate.ipp>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,6 @@ template <size_t Lane>
constexpr void CLASS::
compress_(auto& state, const auto& buffer) NOEXCEPT
{
// SHA-NI/256: 64/4 = 16 quad rounds, 8/4 = 2 state elements.
// This is a copy (state type varies due to vectorization).
const auto start = state;

Expand Down
Loading

0 comments on commit 572f3b9

Please sign in to comment.