diff --git a/tests/performance/benchmark.cpp b/tests/performance/benchmark.cpp
new file mode 100644
index 00000000..fa5fca19
--- /dev/null
+++ b/tests/performance/benchmark.cpp
@@ -0,0 +1,329 @@
+//RandomX performance test for x86
+//https://github.com/tevador/RandomX
+//License: GPL v3
+
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <iostream>
+#include <sstream>
+
+#if defined(_WIN32) || defined(__MINGW32__) || defined(__CYGWIN__) || defined(__CYGWIN32__)
+	#define WINDOWS
+	#include <io.h>
+	#include <fcntl.h>
+#endif
+
+#if defined(__GNUC__) && defined(__x86_64__)
+	#include <x86intrin.h>
+	typedef unsigned __int128 uint128_t;
+	typedef __int128 int128_t;
+	//upper 64 bits of the unsigned 128-bit product a * b
+	static inline uint64_t umulhi64(uint64_t a, uint64_t b) {
+		return ((uint128_t)a * b) >> 64;
+	}
+	//upper 64 bits of the signed 128-bit product a * b
+	static inline uint64_t imulhi64(int64_t a, int64_t b) {
+		return ((int128_t)a * b) >> 64;
+	}
+	#define ror64 __rorq
+	#define rol64 __rolq
+	#define forceinline inline
+#elif defined(_MSC_VER) && defined(_M_X64)
+	#include <intrin.h>
+	#include <stdlib.h>
+	#define umulhi64 __umulh
+	//upper 64 bits of the signed 128-bit product a * b
+	static inline uint64_t imulhi64(int64_t a, int64_t b) {
+		int64_t hi;
+		_mul128(a, b, &hi);
+		return hi;
+	}
+	#define ror64 _rotr64
+	#define rol64 _rotl64
+	#define forceinline __forceinline
+#else
+	#error "Unsupported platform"
+#endif
+
+//one scratchpad/register cell that can be accessed as a double or as a signed/unsigned 64-bit or 32-bit integer
+typedef union {
+	double f64;
+	int64_t i64;
+	uint64_t u64;
+	int32_t i32;
+	uint32_t u32;
+} convertible_t;
+
+//All benchmarked operations share one signature so that BENCHMARK can call any of them:
+//'a' and 'b' are the operands and the result is written to 'c'.
+//Operations that need a single operand do not read 'b'.
+forceinline void NOOP(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = a.u64;
+}
+
+forceinline void FNOOP(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.f64 = (double)a.i64;
+}
+
+forceinline void ADD_64(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = a.u64 + b.u64;
+}
+
+forceinline void ADD_32(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = a.u32 + b.u32;
+}
+
+forceinline void SUB_64(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = a.u64 - b.u64;
+}
+
+forceinline void SUB_32(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = a.u32 - b.u32;
+}
+
+forceinline void MUL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = a.u64 * b.u64;
+}
+
+forceinline void MULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = umulhi64(a.u64, b.u64);
+}
+
+forceinline void MUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = (uint64_t)a.u32 * b.u32;
+}
+
+forceinline void IMUL_32(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.i64 = (int64_t)a.i32 * b.i32;
+}
+
+forceinline void IMULH_64(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.i64 = imulhi64(a.i64, b.i64);
+}
+
+forceinline void DIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = a.u64 / (b.u32 != 0 ? b.u32 : 1U);
+}
+
+forceinline void IDIV_64(convertible_t& a, convertible_t& b, convertible_t& c) {
+	const int64_t divisor = (b.i32 != 0 ? b.i32 : 1);
+	//INT64_MIN / -1 is not representable: it is undefined behavior and traps on x86, so return the wrapped value
+	if (divisor == -1 && a.i64 == INT64_MIN)
+		c.i64 = INT64_MIN;
+	else
+		c.i64 = a.i64 / divisor;
+}
+
+forceinline void AND_64(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = a.u64 & b.u64;
+}
+
+forceinline void AND_32(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = a.u32 & b.u32;
+}
+
+forceinline void OR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = a.u64 | b.u64;
+}
+
+forceinline void OR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = a.u32 | b.u32;
+}
+
+forceinline void XOR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = a.u64 ^ b.u64;
+}
+
+forceinline void XOR_32(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = a.u32 ^ b.u32;
+}
+
+forceinline void SHL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = a.u64 << (b.u64 & 63);
+}
+
+forceinline void SHR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = a.u64 >> (b.u64 & 63);
+}
+
+forceinline void SAR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.i64 = a.i64 >> (b.u64 & 63);
+}
+
+forceinline void ROL_64(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = rol64(a.u64, (b.u64 & 63));
+}
+
+forceinline void ROR_64(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.u64 = ror64(a.u64, (b.u64 & 63));
+}
+
+forceinline void FADD(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.f64 = (double)a.i64 + (double)b.i64;
+}
+
+forceinline void FSUB(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.f64 = (double)a.i64 - (double)b.i64;
+}
+
+forceinline void FMUL(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.f64 = (double)a.i64 * (double)b.i64;
+}
+
+forceinline void FDIV(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.f64 = (double)a.i64 / (double)b.i64;
+}
+
+forceinline void FSQRT(convertible_t& a, convertible_t& b, convertible_t& c) {
+	double d = fabs((double)a.i64);
+	//_mm_load_sd reads exactly one double; _mm_load_pd would read 16 bytes from this 8-byte local and needs 16-byte alignment
+	c.f64 = _mm_cvtsd_f64(_mm_sqrt_sd(_mm_setzero_pd(), _mm_load_sd(&d)));
+}
+
+//MXCSR value stored by init_FPU() with the rounding-mode bits cleared; FROUND ORs a rounding mode into it
+static uint32_t mxcsr;
+
+forceinline void FROUND(convertible_t& a, convertible_t& b, convertible_t& c) {
+	c.f64 = (double)a.i64;
+	//the two lowest bits of 'a' are shifted into the MXCSR rounding-control field selected by _MM_ROUND_MASK
+	_mm_setcsr(mxcsr | ((uint32_t)(a.u64 << 13) & _MM_ROUND_MASK));
+}
+
+//Turns on flush-to-zero, clears the rounding-mode bits of MXCSR and remembers the resulting value in 'mxcsr'
+inline void init_FPU() {
+	mxcsr = (_mm_getcsr() | _MM_FLUSH_ZERO_ON) & ~_MM_ROUND_MASK;
+	_mm_setcsr(mxcsr);
+}
+
+//Extracts a T from 'buffer' with operator>>; on failure prints a message to stdout and returns false
+template<typename T>
+bool tryParse(char* buffer, T& out) {
+	std::istringstream ss(buffer);
+	if (!(ss >> out)) {
+		std::cout << "Invalid value '" << buffer << "'" << std::endl;
+		return false;
+	}
+	return true;
+}
+
+//#define ITERATIONS 10000000
+#define SCRATCHPAD_SIZE (16 * 1024)
+#define SCRATCHPAD_LENGTH (SCRATCHPAD_SIZE / sizeof(convertible_t))
+#define SCRATCHPAD_MASK (SCRATCHPAD_SIZE / sizeof(convertible_t) - 1)
+#define SCRATCHPAD_16K(x) scratchpad[(x) & SCRATCHPAD_MASK]
+
+//Resets the scratchpad from 'input', times 'iterations' rounds of 8 FUNC calls (each result is XORed with a register)
+//and prints one table row: operation name, elapsed seconds and the sum of all scratchpad words.
+//Expects scratchpad, input, iterations, r0-r7, tstart and tend in the enclosing scope. TYPE is not used by the macro body.
+#define BENCHMARK(FUNC,TYPE) do { \
+		memcpy((void*)scratchpad, input, SCRATCHPAD_SIZE); \
+		tstart = std::chrono::high_resolution_clock::now(); \
+		for (uint64_t i = 0; i < iterations; ++i) { \
+			FUNC(SCRATCHPAD_16K(i + 8 + 0), r0, SCRATCHPAD_16K(i + 0)); \
+			SCRATCHPAD_16K(i + 0).u64 ^= r7.u64;\
+			FUNC(SCRATCHPAD_16K(i + 8 + 1), r1, SCRATCHPAD_16K(i + 1)); \
+			SCRATCHPAD_16K(i + 1).u64 ^= r6.u64;\
+			FUNC(SCRATCHPAD_16K(i + 8 + 2), r2, SCRATCHPAD_16K(i + 2)); \
+			SCRATCHPAD_16K(i + 2).u64 ^= r5.u64;\
+			FUNC(SCRATCHPAD_16K(i + 8 + 3), r3, SCRATCHPAD_16K(i + 3)); \
+			SCRATCHPAD_16K(i + 3).u64 ^= r4.u64;\
+			FUNC(SCRATCHPAD_16K(i + 8 + 4), r4, SCRATCHPAD_16K(i + 4)); \
+			SCRATCHPAD_16K(i + 4).u64 ^= r3.u64;\
+			FUNC(SCRATCHPAD_16K(i + 8 + 5), r5, SCRATCHPAD_16K(i + 5)); \
+			SCRATCHPAD_16K(i + 5).u64 ^= r2.u64;\
+			FUNC(SCRATCHPAD_16K(i + 8 + 6), r6, SCRATCHPAD_16K(i + 6)); \
+			SCRATCHPAD_16K(i + 6).u64 ^= r1.u64;\
+			FUNC(SCRATCHPAD_16K(i + 8 + 7), r7, SCRATCHPAD_16K(i + 7)); \
+			SCRATCHPAD_16K(i + 7).u64 ^= r0.u64;\
+		} \
+		tend = std::chrono::high_resolution_clock::now(); \
+		uint64_t acum = 0; \
+		for (size_t i = 0; i < SCRATCHPAD_LENGTH; ++i) \
+			acum += scratchpad[i].u64; \
+		std::cout << "| " << #FUNC << " | " << std::chrono::duration<double>(tend - tstart).count() << " | " << acum << " |" << std::endl; \
+	} while(false)
+
+
+//Takes an optional iteration count as the first argument and reads SCRATCHPAD_SIZE bytes of input data from STDIN
+int main(int argc, char** argv) {
+	uint64_t iterations;
+	if (argc > 1) {
+		if (!tryParse(argv[1], iterations))
+			return 1;
+	}
+	else {
+		iterations = 100000000;
+	}
+#ifdef WINDOWS
+	//the input is raw bytes, so STDIN must not be read in text mode
+	_setmode(_fileno(stdin), O_BINARY);
+#endif
+	convertible_t input[SCRATCHPAD_LENGTH];
+
+	std::cout << "Reading " << sizeof(input) << " bytes from STDIN..." << std::endl;
+	std::cin.read((char*)input, sizeof(input));
+
+	if (!std::cin) {
+		std::cerr << "Insufficient input" << std::endl;
+		return 1;
+	}
+
+	convertible_t scratchpad[SCRATCHPAD_LENGTH];
+	convertible_t r0, r1, r2, r3, r4, r5, r6, r7;
+
+	//the first 8 input words are also used as the second operand ("register") of every operation
+	r0.u64 = input[0].u64;
+	r1.u64 = input[1].u64;
+	r2.u64 = input[2].u64;
+	r3.u64 = input[3].u64;
+	r4.u64 = input[4].u64;
+	r5.u64 = input[5].u64;
+	r6.u64 = input[6].u64;
+	r7.u64 = input[7].u64;
+
+	std::chrono::high_resolution_clock::time_point tstart, tend;
+
+	std::cout << iterations << " iterations:" << std::endl << std::endl;
+
+	std::cout << "| operation | time [s] | (result) |" << std::endl;
+	std::cout << "|-----------|----------|----------|" << std::endl;
+
+	BENCHMARK(NOOP, u64);
+	BENCHMARK(ADD_64, u64);
+	BENCHMARK(ADD_32, u64);
+	BENCHMARK(SUB_64, u64);
+	BENCHMARK(SUB_32, u64);
+	BENCHMARK(MUL_64, u64);
+	BENCHMARK(MULH_64, u64);
+	BENCHMARK(MUL_32, u64);
+	BENCHMARK(IMUL_32, u64);
+	BENCHMARK(IMULH_64, u64);
+	BENCHMARK(DIV_64, u64);
+	BENCHMARK(IDIV_64, u64);
+	BENCHMARK(AND_64, u64);
+	BENCHMARK(AND_32, u64);
+	BENCHMARK(OR_64, u64);
+	BENCHMARK(OR_32, u64);
+	BENCHMARK(XOR_64, u64);
+	BENCHMARK(XOR_32, u64);
+	BENCHMARK(SHL_64, u64);
+	BENCHMARK(SHR_64, u64);
+	BENCHMARK(SAR_64, u64);
+	BENCHMARK(ROR_64, u64);
+	BENCHMARK(ROL_64, u64);
+
+	init_FPU();
+
+	BENCHMARK(FNOOP, f64);
+	BENCHMARK(FADD, f64);
+	BENCHMARK(FSUB, f64);
+	BENCHMARK(FMUL, f64);
+	BENCHMARK(FDIV, f64);
+	BENCHMARK(FSQRT, f64);
+	BENCHMARK(FROUND, f64);
+
+	return 0;
+}
diff --git a/tests/performance/test1.data b/tests/performance/test1.data
new file mode 100644
index 00000000..2eb8fe53
Binary files /dev/null and b/tests/performance/test1.data differ
diff --git a/tests/performance/test2.data b/tests/performance/test2.data
new file mode 100644
index 00000000..11c8b487
Binary files /dev/null and b/tests/performance/test2.data differ
diff --git a/tests/rx2c.py b/tests/rx2c.py index 6571f754..331c25bd 100644 --- a/tests/rx2c.py +++
b/tests/rx2c.py @@ -126,12 +126,12 @@ def writeEpilog(file): file.write("\tend:\n") file.write("\t\tclockEnd = clock();\n") for i in range(8): - file.write('\t\tprintf("r{0} = %-36lu f{0} = %g\\n", r{0}, f{0});\n'.format(i)) + file.write('\t\tprintf("r{0} = %-36" PRIu64 " f{0} = %g\\n", r{0}, f{0});\n'.format(i)) file.write(("\t\tuint64_t spadsum = 0;\n" "\t\tfor(int i = 0; i < SCRATCHPAD_LENGTH; ++i) {\n" "\t\t spadsum += scratchpad[i].u64;\n" "\t\t}\n" - '\t\tprintf("scratchpad sum = %lu\\n", spadsum);\n' + '\t\tprintf("scratchpad sum = %" PRIu64 "\\n", spadsum);\n' '\t\tprintf("runtime: %f\\n", (clockEnd - clockStart) / (double)CLOCKS_PER_SEC);\n' "#ifdef RAM\n" "\t\t_mm_free((void*)mmu.buffer);\n" @@ -632,8 +632,8 @@ def writeMain(file): " register uint64_t r0, r1, r2, r3, r4, r5, r6, r7;\n" " register double f0, f1, f2, f3, f4, f5, f6, f7;\n" " register uint64_t ic, sp;\n" - " convertible_t scratchpad[SCRATCHPAD_LENGTH] __attribute__ ((aligned (16)));\n" " stack_t stack[STACK_LENGTH];\n" + " convertible_t scratchpad[SCRATCHPAD_LENGTH] __attribute__ ((aligned (16)));\n" " mmu_t mmu;\n" " uint32_t mxcsr;\n" )) @@ -646,6 +646,7 @@ def writeProlog(file): "#include \n" "#include \n" "#include \n" + "#include \n" "typedef uint32_t addr_t;\n" "typedef unsigned __int128 uint128_t;\n" "typedef __int128 int128_t;\n" @@ -669,14 +670,14 @@ def writeProlog(file): " const char* buffer;\n" "#endif\n" "} mmu_t;\n" - "#define DRAM_SIZE (1UL << 32)\n" + "#define DRAM_SIZE (1ULL << 32)\n" "#define SCRATCHPAD_SIZE (256 * 1024)\n" "#define SCRATCHPAD_LENGTH (SCRATCHPAD_SIZE / sizeof(convertible_t))\n" "#define SCRATCHPAD_MASK14 (16 * 1024 / sizeof(convertible_t) - 1)\n" "#define SCRATCHPAD_MASK18 (SCRATCHPAD_LENGTH - 1)\n" "#define SCRATCHPAD_16K(x) scratchpad[(x) & SCRATCHPAD_MASK14]\n" "#define SCRATCHPAD_256K(x) scratchpad[(x) & SCRATCHPAD_MASK18]\n" - "#define STACK_LENGTH (32 * 1024)\n" + "#define STACK_LENGTH (128 * 1024)\n" "#ifdef RAM\n" "#define DRAM_READ(mmu) 
(convertible_t)*(uint64_t*)((mmu)->buffer + (mmu)->m0)\n" "#define PREFETCH(mmu) _mm_prefetch(((mmu)->buffer + (mmu)->m1), _MM_HINT_T0)\n"