From d8c52b4d03abb820550d1eaadfafa703d54b298f Mon Sep 17 00:00:00 2001
From: Potuz
Date: Mon, 13 May 2024 21:56:16 +0300
Subject: [PATCH] Fix Mac

---
 .github/workflows/test.yml |  38 ++++++++
 README.md                  |   2 +-
 src/Makefile               |   4 +
 src/hashtree.c             |   6 ++
 src/sha256_armv8_crypto.S  | 104 ++++++++++++---------
 src/sha256_armv8_neon_x1.S |  32 +++++--
 src/sha256_armv8_neon_x4.S |  38 ++++++--
 src/test.c                 |  10 +-
 src/ubench.h               | 181 +++++++++++++++++++++++++------------
 9 files changed, 299 insertions(+), 116 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 63cf44a..ac2ef09 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -12,6 +12,44 @@ jobs:
       - name: Run tests
         run: ./src/test
 
+  windows:
+    name: windows
+    runs-on: windows-latest
+    steps:
+      - uses: actions/checkout@v4
+
+      - name: Install MinGW
+        run: |
+          choco install mingw
+          echo "C:\tools\mingw64\bin" >> $GITHUB_PATH
+
+      - name: Set CC environment variable
+        run: |
+          echo "CC=gcc" >> $GITHUB_ENV
+
+      - name: Verify gcc
+        run: |
+          gcc --version
+        shell: bash
+
+      - name: Build
+        run: CC=gcc make all
+        shell: bash
+
+      - name: Run tests
+        run: .\src\test.exe
+        shell: bash
+
+  macos:
+    name: macos
+    runs-on: macos-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Build
+        run: make all
+      - name: Run tests
+        run: ./src/test
+
   rust-bindings:
     runs-on: ubuntu-latest
     name: (${{ matrix.target }})

diff --git a/README.md b/README.md
index 151215e..cdc7918 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ There are no dependencies besides the standard `C` header `stdint.h`. Benchmarks
 have a dependency on `libm`. Tests and benchmarks on x86-64 an extra dependency
 on `cpuid.h` is needed. An optional dependency on openssl allows to test and
 benchmark against openssl. The only build-time dependency is a GCC and
-GNU assembler compatible compiler like `gcc` and `gas`.
+GNU assembler compatible compiler like `gcc` and `gas`. On Mac OS X with newer Apple Silicon processors the library can be built with the default clang compiler.
 
 ## Compilation
 - Start by cloning the repository

diff --git a/src/Makefile b/src/Makefile
index d3ac075..2efcdb2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -67,7 +67,11 @@ $(libname): $(objx86) hashtree.pc
 	$(AR) rcs $(libname) $(objx86)
 endif
 
+ifeq ($(WIN),1)
+all: $(libname) test
+else
 all: $(libname) test bench
+endif
 
 test: hashtree.h acutest.h test.c $(libname)
 	$(CC) $(CFLAGS) $(LDFLAGS) -o test test.c $(testlibs)

diff --git a/src/hashtree.c b/src/hashtree.c
index 97769a7..06eca9c 100644
--- a/src/hashtree.c
+++ b/src/hashtree.c
@@ -29,9 +29,11 @@ SOFTWARE.
 #include <cpuid.h>
 #endif
 #ifdef __aarch64__
+#ifndef __APPLE__
 #include <sys/auxv.h>
 #include <asm/hwcap.h>
 #endif
+#endif
 
 static void init_and_hash(unsigned char *output, const unsigned char *input,
                           uint64_t count);
@@ -65,6 +67,9 @@ static hashtree_hash_fcn hashtree_detect() {
     return (hashtree_hash_fcn)0;
 #endif
 #ifdef __aarch64__
+#ifdef __APPLE__
+    return &hashtree_sha256_sha_x1;
+#else
     long hwcaps = getauxval(AT_HWCAP);
     if (hwcaps & HWCAP_SHA2) {
         return &hashtree_sha256_sha_x1;
@@ -76,6 +81,7 @@ static hashtree_hash_fcn hashtree_detect() {
 
     return (hashtree_hash_fcn)0;
 #endif
+#endif
 }
 
 int hashtree_init(hashtree_hash_fcn override) {

diff --git a/src/sha256_armv8_crypto.S b/src/sha256_armv8_crypto.S
index 6de7288..f9ede54 100644
--- a/src/sha256_armv8_crypto.S
+++ b/src/sha256_armv8_crypto.S
@@ -55,35 +55,55 @@ padding .req x5
 
 .macro hashupdate WORD
-    sha256h q2, q3, WORD\().4s
-    sha256h2 q3, q8, WORD\().4s
+    sha256h q2, q3, \WORD
+    sha256h2 q3, q8, \WORD
     mov v8.16b, v2.16b
 .endm
 
 .macro schedule A, B, C, D, E, WORD
-    add \WORD\().4s, \B\().4s, \A\().4s
-    sha256su0 \B\().4s, \C\().4s
-    sha256su1 \E\().4s, \C\().4s, \D\().4s
+    add \WORD, \B, \A
+    sha256su0 \B, \C
+    sha256su1 \E, \C, \D
     hashupdate \WORD
 .endm
 
+#ifdef __APPLE__
+.global _hashtree_sha256_sha_x1
+#else
 .global hashtree_sha256_sha_x1
+#endif
+#ifndef __APPLE__
 .type hashtree_sha256_sha_x1,%function
+#endif
 .align 5
+#ifdef __APPLE__
+_hashtree_sha256_sha_x1:
+#else
 hashtree_sha256_sha_x1:
+#endif
     // Set up stack, need to save the clobbered registers d8-d11
     sub sp, sp, #32
     stp d8, d9, [sp]
+
+#ifdef __APPLE__
+    adrp digest, .LDIGEST@PAGE
+    add digest, digest, #:lo12:.LDIGEST@PAGEOFF
+    adrp k256, .LK256@PAGE
+    add k256, k256, #:lo12:.LK256@PAGEOFF
+#else
     adrp digest, .LDIGEST
     add digest, digest, #:lo12:.LDIGEST
-    adrp k256, .LK256
     add k256, k256, #:lo12:.LK256
-
+#endif
     stp d10, d11, [sp, #16]
+#ifdef __APPLE__
+    adrp padding, .LPADDING@PAGE
+    add padding, padding, #:lo12:.LPADDING@PAGEOFF
+#else
     adrp padding, .LPADDING
    add padding, padding, #:lo12:.LPADDING
-
+#endif
     add last, output, count, lsl #5
 
     ld1 {v0.4s, v1.4s}, [digest]
@@ -113,29 +133,29 @@ hashtree_sha256_sha_x1:
 
     add v9.4s, v4.4s, v16.4s
     sha256su0 v4.4s, v5.4s
-    hashupdate v9
-
-    schedule v17, v5, v6, v7, v4, v9
-    schedule v18, v6, v7, v4, v5, v9
-    schedule v19, v7, v4, v5, v6, v9
-    schedule v20, v4, v5, v6, v7, v9
-    schedule v21, v5, v6, v7, v4, v9
-    schedule v22, v6, v7, v4, v5, v9
-    schedule v23, v7, v4, v5, v6, v9
-    schedule v24, v4, v5, v6, v7, v9
-    schedule v25, v5, v6, v7, v4, v9
-    schedule v26, v6, v7, v4, v5, v9
-    schedule v27, v7, v4, v5, v6, v9
+    hashupdate v9.4s
+
+    schedule v17.4s, v5.4s, v6.4s, v7.4s, v4.4s, v9.4s
+    schedule v18.4s, v6.4s, v7.4s, v4.4s, v5.4s, v9.4s
+    schedule v19.4s, v7.4s, v4.4s, v5.4s, v6.4s, v9.4s
+    schedule v20.4s, v4.4s, v5.4s, v6.4s, v7.4s, v9.4s
+    schedule v21.4s, v5.4s, v6.4s, v7.4s, v4.4s, v9.4s
+    schedule v22.4s, v6.4s, v7.4s, v4.4s, v5.4s, v9.4s
+    schedule v23.4s, v7.4s, v4.4s, v5.4s, v6.4s, v9.4s
+    schedule v24.4s, v4.4s, v5.4s, v6.4s, v7.4s, v9.4s
+    schedule v25.4s, v5.4s, v6.4s, v7.4s, v4.4s, v9.4s
+    schedule v26.4s, v6.4s, v7.4s, v4.4s, v5.4s, v9.4s
+    schedule v27.4s, v7.4s, v4.4s, v5.4s, v6.4s, v9.4s
 
     add v9.4s, v4.4s, v28.4s
-    hashupdate v9
+    hashupdate v9.4s
     sha256su1 v7.4s, v5.4s, v6.4s
     add v9.4s, v5.4s, v29.4s
-    hashupdate v9
+    hashupdate v9.4s
     add v9.4s, v6.4s, v30.4s
-    hashupdate v9
+    hashupdate v9.4s
     add v9.4s, v7.4s, v31.4s
-    hashupdate v9
+    hashupdate v9.4s
 
     // Add initial digest and back it up
     add v2.4s, v0.4s, v2.4s
@@ -153,22 +173,22 @@ hashtree_sha256_sha_x1:
     ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [padding]
     sub padding, padding, #192
 
-    hashupdate v16
-    hashupdate v17
-    hashupdate v18
-    hashupdate v19
-    hashupdate v20
-    hashupdate v21
-    hashupdate v22
-    hashupdate v23
-    hashupdate v24
-    hashupdate v25
-    hashupdate v26
-    hashupdate v27
-    hashupdate v28
-    hashupdate v29
-    hashupdate v30
-    hashupdate v31
+    hashupdate v16.4s
+    hashupdate v17.4s
+    hashupdate v18.4s
+    hashupdate v19.4s
+    hashupdate v20.4s
+    hashupdate v21.4s
+    hashupdate v22.4s
+    hashupdate v23.4s
+    hashupdate v24.4s
+    hashupdate v25.4s
+    hashupdate v26.4s
+    hashupdate v27.4s
+    hashupdate v28.4s
+    hashupdate v29.4s
+    hashupdate v30.4s
+    hashupdate v31.4s
 
     // Add backed up digest
     add v2.4s, v10.4s, v2.4s
@@ -185,7 +205,7 @@ hashtree_sha256_sha_x1:
     ldp d10, d11, [sp], #16
     ret
 
-.section .rodata
+.section .rodata, "a"
 .align 4
 .LDIGEST:
 .word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,\

diff --git a/src/sha256_armv8_neon_x1.S b/src/sha256_armv8_neon_x1.S
index dcdde4e..1507d3a 100644
--- a/src/sha256_armv8_neon_x1.S
+++ b/src/sha256_armv8_neon_x1.S
@@ -248,7 +248,7 @@ T5 .req w22
 # it reads pre-scheduled words from ptr + offset.
 ##################################################################################
 .macro one_round A, B, C, D, E, F, G, H, ptr, offset
-    ldr T3, [\ptr, #\offset]
+    ldr T3, [\ptr, \offset]
     ror T1, \E, #6
     ror T2, \A, #2
     ror T4, \A, #13
@@ -299,21 +299,35 @@ T5 .req w22
 # is writable.
 #
 ########################################################################################################
+
+#ifdef __APPLE__
+.global _hashtree_sha256_neon_x1
+#else
 .global hashtree_sha256_neon_x1
 .type hashtree_sha256_neon_x1,%function
+#endif
 .align 4
+#ifdef __APPLE__
+_hashtree_sha256_neon_x1:
+#else
 hashtree_sha256_neon_x1:
+#endif
     sub sp, sp, #64
     stp digest,k256, [sp, #48]
     movi VZ.4s, #0
     stp padding, x22, [sp, #32]
-    adrp digest, .LDIGEST
+#ifdef __APPLE__
+    adrp digest, .LDIGEST@PAGE
+    add digest, digest, .LDIGEST@PAGEOFF
+    adrp padding, .LPADDING@PAGE
+    add padding, padding, .LPADDING@PAGEOFF
+#else
+    adrp digest, .LDIGEST
     add digest, digest, #:lo12:.LDIGEST
-    adrp padding, .LPADDING
-    add padding, padding, #:lo12:.LPADDING
-
+    add padding, padding, #:lo12:.LPADDING
+#endif
     add last, output, count, lsl #5
 
 .Lhash_1_block_loop:
@@ -322,9 +336,13 @@ hashtree_sha256_neon_x1:
     beq .Larmv8_neon_x1_finish
 
     ld1 {VR0.4s, VR1.4s, VR2.4s, VR3.4s}, [input], #64
+#ifdef __APPLE__
+    adrp k256, .LK256@PAGE
+    add k256, k256, #:lo12:.LK256@PAGEOFF
+#else
     adrp k256, .LK256
     add k256, k256, #:lo12:.LK256
-
+#endif
     # change endianness
     rev32 VR0.16b, VR0.16b
     rev32 VR1.16b, VR1.16b
@@ -423,7 +441,7 @@ hashtree_sha256_neon_x1:
     add sp, sp, #64
     ret
 
-.section .rodata
+.section .rodata, "a"
 .align 4
 .LDIGEST:
 .word 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,\

diff --git a/src/sha256_armv8_neon_x4.S b/src/sha256_armv8_neon_x4.S
index 81ccea0..00fad05 100644
--- a/src/sha256_armv8_neon_x4.S
+++ b/src/sha256_armv8_neon_x4.S
@@ -277,23 +277,44 @@ TQ7 .req q22
     round_padding \F, \G, \H, \A, \B, \C, \D, \E
 .endm
 
+#ifdef __APPLE__
+.global _hashtree_sha256_neon_x4
+#else
 .global hashtree_sha256_neon_x4
+#endif
+#ifdef __APPLE__
+//.type hashtree_sha256_neon_x4,%function
+#else
 .type hashtree_sha256_neon_x4,%function
+#endif
+
 .align 5
+#ifdef __APPLE__
+_hashtree_sha256_neon_x4:
+#else
 hashtree_sha256_neon_x4:
+#endif
     sub sp, sp, #1024
-    adrp k256, .LK256x4
-    add k256, k256, #:lo12:.LK256x4
+#ifdef __APPLE__
+    adrp k256,.LK256x4@GOTPAGE
+    ldr k256, [k256, .LK256x4@GOTPAGEOFF]
+    adrp padding, .LPADDINGx4@GOTPAGE
+    ldr padding, [padding, .LPADDINGx4@GOTPAGEOFF]
+    adrp digest, .LDIGESTx4L@GOTPAGE
+    ldr digest, [digest, .LDIGESTx4L@GOTPAGEOFF]
+    adrp digest2, .LDIGESTx4H@GOTPAGE
+    ldr digest2, [digest2, .LDIGESTx4H@GOTPAGEOFF]
+#else
+    adrp k256,.LK256x4
+    add k256, k256, #:lo12:.LK256x4
     adrp padding, .LPADDINGx4
     add padding, padding, #:lo12:.LPADDINGx4
-    adrp digest, .LDIGESTx4L
     add digest, digest, #:lo12:.LDIGESTx4L
-    adrp digest2, .LDIGESTx4H
     add digest2, digest2, #:lo12:.LDIGESTx4H
-
+#endif
     mov post64, #64
     mov post32, #32
     mov postminus80, #-80
@@ -397,9 +418,12 @@ hashtree_sha256_neon_x4:
     b .Larmv8_neon_x4_loop
 .Lsha256_armv8_x4_epilog:
     add sp, sp, #1024
+#ifdef __APPLE__
+    b _hashtree_sha256_neon_x1
+#else
     b hashtree_sha256_neon_x1
-
-.section .rodata
+#endif
+.section .rodata,"a"
 .align 4
 .LDIGESTx4L:
 .word 0x6a09e667, 0x6a09e667, 0x6a09e667, 0x6a09e667,\

diff --git a/src/test.c b/src/test.c
index 98ed510..09ea45c 100644
--- a/src/test.c
+++ b/src/test.c
@@ -26,9 +26,12 @@
 #include <cpuid.h>
 #endif
 #ifdef __aarch64__
+#ifdef __APPLE__
+#else
 #include <sys/auxv.h>
 #include <asm/hwcap.h>
 #endif
+#endif
 
 #include "acutest.h"
 #include "hashtree.h"
@@ -589,8 +592,10 @@ void test_armv8_neon_x4() {
 }
 
 void test_hash_armv8_crypto_multiple_blocks() {
+#ifndef __APPLE__
     long hwcaps = getauxval(AT_HWCAP);
     if (hwcaps & HWCAP_SHA2) {
+#endif
         unsigned char digest[128];
 
         hashtree_sha256_sha_x1(digest, test_16_block, 4);
@@ -602,15 +607,16 @@ void test_hash_armv8_crypto_multiple_blocks() {
         TEST_CHECK(digests_equal(digest, test_4_digests, sizeof(digest)));
         TEST_DUMP("Expected: ", test_4_digests, sizeof(test_4_digests));
         TEST_DUMP("Produced: ", digest, sizeof(digest));
+#ifndef __APPLE__
     } else {
         acutest_colored_printf_(ACUTEST_COLOR_GREEN_INTENSIVE_,
                                 "[ CPU does not support Sha2 instructions ]\n");
     }
-
+#endif
 }
 #endif
 
 int override_called;
-void test_override(unsigned char*, const unsigned char*, uint64_t) {
+void test_override(unsigned char* digest, const unsigned char* chunks, uint64_t count) {
     override_called += 1;
 }

diff --git a/src/ubench.h b/src/ubench.h
index b778cf1..fb83ab2 100644
--- a/src/ubench.h
+++ b/src/ubench.h
@@ -48,22 +48,17 @@
 #pragma warning(disable : 4711)
 
 /*
-  Disable warning about replacing undefined preprocessor macro '__cplusplus' with
-  0 emitted from microsofts own headers.
-  See: https://developercommunity.visualstudio.com/t/issue-in-corecrth-header-results-in-an-undefined-m/433021
+  Disable warning about replacing undefined preprocessor macro '__cplusplus'
+  with 0 emitted from microsofts own headers. See:
+  https://developercommunity.visualstudio.com/t/issue-in-corecrth-header-results-in-an-undefined-m/433021
 */
 #pragma warning(disable : 4668)
 
+#if _MSC_VER > 1930
 /*
-  Disabled warning about dangerous use of section.
-  section '.CRT$XCU' is reserved for C++ dynamic initialization. Manually
-  creating the section will interfere with C++ dynamic initialization and may lead to undefined behavior
+  Disable warning about 'const' variable is not used.
 */
-#if defined(_MSC_FULL_VER)
-#if _MSC_FULL_VER >= 192930100 // this warning was introduced in Visual Studio 2019 version 16.11
-#pragma warning(disable : 5247)
-#pragma warning(disable : 5248)
-#endif
+#pragma warning(disable : 5264)
 #endif
 
 #pragma warning(push, 1)
@@ -81,6 +76,12 @@
 #define UBENCH_NULL 0
 #endif
 
+#if defined(__TINYC__)
+#define UBENCH_ATTRIBUTE(a) __attribute((a))
+#else
+#define UBENCH_ATTRIBUTE(a) __attribute__((a))
+#endif
+
 #if defined(_MSC_VER) && (_MSC_VER < 1920)
 typedef __int64 ubench_int64_t;
 typedef unsigned __int64 ubench_uint64_t;
@@ -104,17 +105,19 @@ typedef uint64_t ubench_uint64_t;
 typedef union {
   struct {
     unsigned long LowPart;
-    long HighPart;
+    long HighPart;
   } DUMMYSTRUCTNAME;
   struct {
     unsigned long LowPart;
-    long HighPart;
+    long HighPart;
   } u;
   ubench_int64_t QuadPart;
 } ubench_large_integer;
 
-UBENCH_C_FUNC __declspec(dllimport) int __stdcall QueryPerformanceCounter(ubench_large_integer *);
-UBENCH_C_FUNC __declspec(dllimport) int __stdcall QueryPerformanceFrequency(ubench_large_integer *);
+UBENCH_C_FUNC __declspec(dllimport) int __stdcall QueryPerformanceCounter(
+    ubench_large_integer *);
+UBENCH_C_FUNC __declspec(dllimport) int __stdcall QueryPerformanceFrequency(
+    ubench_large_integer *);
 
 #elif defined(__linux__)
 /*
@@ -138,7 +141,7 @@ UBENCH_C_FUNC __declspec(dllimport) int __stdcall QueryPerformanceFrequency(uben
 #endif
 
 #elif defined(__APPLE__)
-#include <mach/mach_time.h>
+#include <time.h>
 #endif
 
 #if defined(__cplusplus)
@@ -169,9 +172,42 @@ UBENCH_C_FUNC __declspec(dllimport) int __stdcall QueryPerformanceFrequency(uben
 #define UBENCH_PRIu64 PRIu64
 #endif
 
-#if defined(_MSC_VER)
+#if defined(__cplusplus)
+#define UBENCH_INLINE inline
+#elif defined(_MSC_VER)
 #define UBENCH_INLINE __forceinline
+#else
+#define UBENCH_INLINE inline
+#endif
+
+#if defined(_MSC_VER)
 #define UBENCH_NOINLINE __declspec(noinline)
+#else
+#define UBENCH_NOINLINE UBENCH_ATTRIBUTE(noinline)
+#endif
+
+#if defined(__cplusplus)
+
+#if defined(__clang__)
+#define UBENCH_INITIALIZER_BEGIN_DISABLE_WARNINGS \
+  _Pragma("clang diagnostic push") \
+  _Pragma("clang diagnostic ignored \"-Wglobal-constructors\"")
+
+#define UBENCH_INITIALIZER_END_DISABLE_WARNINGS _Pragma("clang diagnostic pop")
+#else
+#define UBENCH_INITIALIZER_BEGIN_DISABLE_WARNINGS
+#define UBENCH_INITIALIZER_END_DISABLE_WARNINGS
+#endif
+
+#define UBENCH_INITIALIZER(f) \
+  struct f##_cpp_struct { \
+    f##_cpp_struct(); \
+  }; \
+  UBENCH_INITIALIZER_BEGIN_DISABLE_WARNINGS static f##_cpp_struct \
+      f##_cpp_global UBENCH_INITIALIZER_END_DISABLE_WARNINGS; \
+  f##_cpp_struct::f##_cpp_struct()
+
+#elif defined(_MSC_VER)
 
 #if defined(_WIN64)
 #define UBENCH_SYMBOL_PREFIX
@@ -216,11 +252,8 @@ UBENCH_C_FUNC __declspec(dllimport) int __stdcall QueryPerformanceFrequency(uben
 #endif
 #endif
 
-#define UBENCH_INLINE inline
-#define UBENCH_NOINLINE __attribute__((noinline))
-
 #define UBENCH_INITIALIZER(f) \
-  static void f(void) __attribute__((constructor)); \
+  static void f(void) UBENCH_ATTRIBUTE(constructor); \
   static void f(void)
 
 #endif
@@ -251,7 +284,7 @@ UBENCH_C_FUNC __declspec(dllimport) int __stdcall QueryPerformanceFrequency(uben
 #endif
 
 static UBENCH_INLINE ubench_int64_t ubench_ns(void) {
-#ifdef _MSC_VER
+#if defined(_MSC_VER)
   ubench_large_integer counter;
  ubench_large_integer frequency;
  QueryPerformanceCounter(&counter);
@@ -269,17 +302,17 @@ static UBENCH_INLINE ubench_int64_t ubench_ns(void) {
   return UBENCH_CAST(ubench_int64_t, ts.tv_sec) * 1000 * 1000 * 1000 +
          ts.tv_nsec;
 #elif __APPLE__
-  return UBENCH_CAST(ubench_int64_t, mach_absolute_time());
+  return UBENCH_CAST(ubench_int64_t, clock_gettime_nsec_np(CLOCK_UPTIME_RAW));
 #endif
 }
 
 struct ubench_run_state_s {
-  ubench_int64_t* ns;
-  ubench_int64_t size;
-  ubench_int64_t sample;
+  ubench_int64_t *ns;
+  ubench_int64_t size;
+  ubench_int64_t sample;
 };
 
-typedef void (*ubench_benchmark_t)(struct ubench_run_state_s* ubs);
+typedef void (*ubench_benchmark_t)(struct ubench_run_state_s *ubs);
 
 struct ubench_benchmark_state_s {
   ubench_benchmark_t func;
@@ -299,7 +332,7 @@ UBENCH_EXTERN struct ubench_state_s ubench_state;
 #if defined(_MSC_VER)
 #define UBENCH_UNUSED
 #else
-#define UBENCH_UNUSED __attribute__((unused))
+#define UBENCH_UNUSED UBENCH_ATTRIBUTE(unused)
 #endif
 
 #ifdef __clang__
@@ -332,22 +365,35 @@ UBENCH_EXTERN struct ubench_state_s ubench_state;
 #pragma clang diagnostic pop
 #endif
 
-static UBENCH_INLINE int ubench_do_benchmark(struct ubench_run_state_s* ubs)
-{
-  ubench_int64_t curr_sample = ubs->sample++;
-  ubs->ns[curr_sample] = ubench_ns();
-  return curr_sample < ubs->size ? 1 : 0;
-}
+#if defined(__clang__)
+#if __has_warning("-Wunsafe-buffer-usage")
+#define UBENCH_SURPRESS_WARNINGS_BEGIN \
+  _Pragma("clang diagnostic push") \
+  _Pragma("clang diagnostic ignored \"-Wunsafe-buffer-usage\"")
+#define UBENCH_SURPRESS_WARNINGS_END _Pragma("clang diagnostic pop")
+#else
+#define UBENCH_SURPRESS_WARNINGS_BEGIN
+#define UBENCH_SURPRESS_WARNINGS_END
+#endif
+#elif defined(__GNUC__) && __GNUC__ >= 8 && defined(__cplusplus)
+#define UBENCH_SURPRESS_WARNINGS_BEGIN \
+  _Pragma("GCC diagnostic push") \
+  _Pragma("GCC diagnostic ignored \"-Wclass-memaccess\"")
+#define UBENCH_SURPRESS_WARNINGS_END _Pragma("GCC diagnostic pop")
+#else
+#define UBENCH_SURPRESS_WARNINGS_BEGIN
+#define UBENCH_SURPRESS_WARNINGS_END
+#endif
 
-#define UBENCH_DO_BENCHMARK() \
-  while(ubench_do_benchmark(ubench_run_state) > 0)
+#define UBENCH_DO_BENCHMARK() while (ubench_do_benchmark(ubench_run_state) > 0)
 
 #define UBENCH_EX(SET, NAME) \
+  UBENCH_SURPRESS_WARNINGS_BEGIN \
   UBENCH_EXTERN struct ubench_state_s ubench_state; \
-  static void ubench_##SET##_##NAME(struct ubench_run_state_s* ubs); \
+  static void ubench_##SET##_##NAME(struct ubench_run_state_s *ubs); \
   UBENCH_INITIALIZER(ubench_register_##SET##_##NAME) { \
     const size_t index = ubench_state.benchmarks_length++; \
-    const char *name_part = #SET "." #NAME; \
+    const char name_part[] = #SET "." #NAME; \
     const size_t name_size = strlen(name_part) + 1; \
     char *name = UBENCH_PTR_CAST(char *, malloc(name_size)); \
     ubench_state.benchmarks = UBENCH_PTR_CAST( \
@@ -359,14 +405,13 @@ static UBENCH_INLINE int ubench_do_benchmark(struct ubench_run_state_s* ubs)
     ubench_state.benchmarks[index].name = name; \
     UBENCH_SNPRINTF(name, name_size, "%s", name_part); \
   } \
-  void ubench_##SET##_##NAME(struct ubench_run_state_s* ubench_run_state)
+  UBENCH_SURPRESS_WARNINGS_END \
+  void ubench_##SET##_##NAME(struct ubench_run_state_s *ubench_run_state)
 
 #define UBENCH(SET, NAME) \
   static void ubench_run_##SET##_##NAME(void); \
   UBENCH_EX(SET, NAME) { \
-    UBENCH_DO_BENCHMARK() { \
-      ubench_run_##SET##_##NAME(); \
-    } \
+    UBENCH_DO_BENCHMARK() { ubench_run_##SET##_##NAME(); } \
   } \
   void ubench_run_##SET##_##NAME(void)
 
@@ -377,12 +422,14 @@ static UBENCH_INLINE int ubench_do_benchmark(struct ubench_run_state_s* ubs)
   static void ubench_f_teardown_##FIXTURE(struct FIXTURE *ubench_fixture)
 
 #define UBENCH_EX_F(FIXTURE, NAME) \
+  UBENCH_SURPRESS_WARNINGS_BEGIN \
   UBENCH_EXTERN struct ubench_state_s ubench_state; \
   static void ubench_f_setup_##FIXTURE(struct FIXTURE *); \
   static void ubench_f_teardown_##FIXTURE(struct FIXTURE *); \
   static void ubench_run_ex_##FIXTURE##_##NAME(struct FIXTURE *, \
-                                               struct ubench_run_state_s*); \
-  static void ubench_f_##FIXTURE##_##NAME(struct ubench_run_state_s* ubench_run_state) { \
+                                               struct ubench_run_state_s *); \
+  static void ubench_f_##FIXTURE##_##NAME( \
+      struct ubench_run_state_s *ubench_run_state) { \
     struct FIXTURE fixture; \
     memset(&fixture, 0, sizeof(fixture)); \
     ubench_f_setup_##FIXTURE(&fixture); \
@@ -391,7 +438,7 @@ static UBENCH_INLINE int ubench_do_benchmark(struct ubench_run_state_s* ubs)
   } \
   UBENCH_INITIALIZER(ubench_register_##FIXTURE##_##NAME) { \
     const size_t index = ubench_state.benchmarks_length++; \
-    const char *name_part = #FIXTURE "." #NAME; \
+    const char name_part[] = #FIXTURE "." #NAME; \
     const size_t name_size = strlen(name_part) + 1; \
     char *name = UBENCH_PTR_CAST(char *, malloc(name_size)); \
     ubench_state.benchmarks = UBENCH_PTR_CAST( \
@@ -403,18 +450,33 @@ static UBENCH_INLINE int ubench_do_benchmark(struct ubench_run_state_s* ubs)
     ubench_state.benchmarks[index].name = name; \
     UBENCH_SNPRINTF(name, name_size, "%s", name_part); \
   } \
-  void ubench_run_ex_##FIXTURE##_##NAME(struct FIXTURE *ubench_fixture, \
-                                        struct ubench_run_state_s* ubench_run_state)
+  UBENCH_SURPRESS_WARNINGS_END \
+  void ubench_run_ex_##FIXTURE##_##NAME( \
+      struct FIXTURE *ubench_fixture, \
+      struct ubench_run_state_s *ubench_run_state)
 
 #define UBENCH_F(FIXTURE, NAME) \
   static void ubench_run_##FIXTURE##_##NAME(struct FIXTURE *); \
   UBENCH_EX_F(FIXTURE, NAME) { \
-    UBENCH_DO_BENCHMARK() { \
-      ubench_run_##FIXTURE##_##NAME(ubench_fixture); \
-    } \
+    UBENCH_DO_BENCHMARK() { ubench_run_##FIXTURE##_##NAME(ubench_fixture); } \
   } \
   void ubench_run_##FIXTURE##_##NAME(struct FIXTURE *ubench_fixture)
 
+#ifdef __clang__
+#pragma clang diagnostic push
+
+#if __has_warning("-Wunsafe-buffer-usage")
+#pragma clang diagnostic ignored "-Wunsafe-buffer-usage"
+#endif
+#endif
+
+static UBENCH_INLINE int
+ubench_do_benchmark(struct ubench_run_state_s *const ubs) {
+  const ubench_int64_t curr_sample = ubs->sample++;
+  ubs->ns[curr_sample] = ubench_ns();
+  return curr_sample < ubs->size ? 1 : 0;
+}
+
 static UBENCH_INLINE int ubench_should_filter(const char *filter,
                                               const char *benchmark);
 int ubench_should_filter(const char *filter, const char *benchmark) {
@@ -613,8 +675,9 @@ int ubench_main(int argc, const char *const argv[]) {
   ubench_int64_t iterations = 10;
   const ubench_int64_t max_iterations = UBENCH_MAX_ITERATIONS;
   const ubench_int64_t min_iterations = UBENCH_MIN_ITERATIONS;
-  /* Add one extra timestamp slot, as we save times between runs and time after exiting the last one */
-  ubench_int64_t ns[UBENCH_MAX_ITERATIONS+1];
+  /* Add one extra timestamp slot, as we save times between runs and time
+   * after exiting the last one */
+  ubench_int64_t ns[UBENCH_MAX_ITERATIONS + 1];
 
 #undef UBENCH_MAX_ITERATIONS
 #undef UBENCH_MIN_ITERATIONS
@@ -625,8 +688,8 @@ int ubench_main(int argc, const char *const argv[]) {
     printf("%s[ RUN ]%s %s\n", colours[GREEN], colours[RESET],
            ubench_state.benchmarks[index].name);
 
-    ubs.ns = ns;
-    ubs.size = 1;
+    ubs.ns = ns;
+    ubs.size = 1;
     ubs.sample = 0;
 
     /* Time once to work out the base number of iterations to use. */
@@ -646,7 +709,7 @@ int ubench_main(int argc, const char *const argv[]) {
     iterations = iterations > max_iterations ? max_iterations : iterations;
 
     ubs.sample = 0;
-    ubs.size = iterations;
+    ubs.size = iterations;
     ubench_state.benchmarks[index].func(&ubs);
 
     /* Calculate benchmark run-times */
@@ -767,6 +830,10 @@ int ubench_main(int argc, const char *const argv[]) {
   return UBENCH_CAST(int, failed);
 }
 
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+
 UBENCH_C_FUNC UBENCH_NOINLINE void ubench_do_nothing(void *const);
 
 #define UBENCH_DO_NOTHING(x) ubench_do_nothing(x)
@@ -784,13 +851,13 @@ UBENCH_C_FUNC void _ReadWriteBarrier(void);
   void ubench_do_nothing(void *ptr) { \
     _Pragma("clang diagnostic push") \
     _Pragma("clang diagnostic ignored \"-Wlanguage-extension-token\""); \
-    asm volatile("" : : "r,m"(ptr) : "memory"); \
+    asm volatile("" : : "r"(ptr), "m"(ptr) : "memory"); \
    _Pragma("clang diagnostic pop"); \
   }
 #else
 #define UBENCH_DECLARE_DO_NOTHING() \
   void ubench_do_nothing(void *ptr) { \
-    asm volatile("" : : "r,m"(ptr) : "memory"); \
+    asm volatile("" : : "r"(ptr), "m"(ptr) : "memory"); \
   }
 #endif
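
Note on using the new macOS path (this addendum is not part of the patch): every Apple arm64 CPU implements the SHA-2 crypto extensions, which is why hashtree_detect() above returns hashtree_sha256_sha_x1 directly instead of probing getauxval()/HWCAP_SHA2, neither of which exists on macOS. The sketch below is a minimal, hypothetical caller built only against symbols that appear in this patch (hashtree_init from src/hashtree.c and hashtree_sha256_sha_x1 as called in src/test.c, both declared in hashtree.h); treating a zero return from hashtree_init as "no backend found" is an assumption, not something the patch states.

    /* sketch.c -- hypothetical example; link against the built libhashtree.a */
    #include <stdio.h>
    #include <string.h>

    #include "hashtree.h"

    int main(void) {
        unsigned char block[64];   /* one 64-byte input block -> one 32-byte digest */
        unsigned char digest[32];
        memset(block, 0, sizeof block);

        /* Let the library pick a backend; on Apple Silicon this patch makes it
         * select the ARMv8 crypto-extension kernel unconditionally.
         * (Assumption: a zero return means no usable backend was found.) */
        if (hashtree_init(0) == 0) {
            fprintf(stderr, "hashtree: no SHA-256 backend available\n");
            return 1;
        }

        /* Call the crypto-extension kernel directly, the way src/test.c does:
         * (output, input, count of 64-byte blocks). */
        hashtree_sha256_sha_x1(digest, block, 1);

        for (int i = 0; i < 32; i++)
            printf("%02x", digest[i]);
        printf("\n");
        return 0;
    }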