From 33c1bfbecd94dd79be2cd7f167d30e3090204ffd Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Thu, 21 Mar 2024 21:17:05 -0700 Subject: [PATCH] Better vectorization and crc64. Cleaned up cmake and added better runtime cpu detection (#1083) Co-authored-by: Alfred G <28123637+alfred2g@users.noreply.github.com> Co-authored-by: Alfred Gedeon --- CMakeLists.txt | 14 ++-- bin/system_info/print_system_info.c | 14 ++++ cmake/AwsFeatureTests.cmake | 13 +++ cmake/AwsSIMD.cmake | 97 +++++++++++++++++------ include/aws/common/config.h.in | 7 ++ include/aws/common/cpuid.h | 2 + source/arch/arm/{asm => auxv}/cpuid.c | 8 +- source/arch/arm/darwin/cpuid.c | 40 ++++++++++ source/arch/arm/{msvc => windows}/cpuid.c | 13 ++- source/arch/intel/cpuid.c | 4 +- 10 files changed, 178 insertions(+), 34 deletions(-) rename source/arch/arm/{asm => auxv}/cpuid.c (86%) create mode 100644 source/arch/arm/darwin/cpuid.c rename source/arch/arm/{msvc => windows}/cpuid.c (57%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7e5c5cd9f..6d6e615a3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -152,13 +152,17 @@ if (USE_CPU_EXTENSIONS) ) endif() elseif (AWS_ARCH_ARM64 OR AWS_ARCH_ARM32) - if (MSVC) + if (WINDOWS) file(GLOB AWS_COMMON_ARCH_SRC - "source/arch/arm/msvc/*.c" + "source/arch/arm/windows/*.c" ) - elseif (AWS_HAVE_AUXV) + elseif(APPLE) + file(GLOB AWS_COMMON_ARCH_SRC + "source/arch/arm/darwin/*.c" + ) + else() file(GLOB AWS_COMMON_ARCH_SRC - "source/arch/arm/asm/*.c" + "source/arch/arm/auxv/*.c" ) endif() endif() @@ -221,7 +225,7 @@ target_compile_definitions(${PROJECT_NAME} PRIVATE -DCJSON_HIDE_SYMBOLS) if (AWS_HAVE_AVX2_INTRINSICS) target_compile_definitions(${PROJECT_NAME} PRIVATE -DUSE_SIMD_ENCODING) - simd_add_source_avx(${PROJECT_NAME} "source/arch/intel/encoding_avx2.c") + simd_append_source_and_features(${PROJECT_NAME} "source/arch/intel/encoding_avx2.c" ${AWS_AVX2_FLAG}) message(STATUS "Building SIMD base64 decoder") endif() diff --git a/bin/system_info/print_system_info.c b/bin/system_info/print_system_info.c index c29877086..aac1b570f 100644 --- a/bin/system_info/print_system_info.c +++ b/bin/system_info/print_system_info.c @@ -3,6 +3,7 @@ #include #include #include +#include int main(void) { struct aws_allocator *allocator = aws_default_allocator(); @@ -39,6 +40,19 @@ int main(void) { fprintf(stdout, " 'numa architecture': 'false'\n"); } + fprintf(stdout, " 'cpu_capabilities': {\n"); + fprintf(stdout, " 'arm_crc': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC) ? "true" : "false"); + fprintf(stdout, " 'arm_pmull': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_PMULL) ? "true" : "false"); + fprintf(stdout, " 'arm_crypto': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRYPTO) ? "true" : "false"); + fprintf(stdout, " 'amd_sse4_1': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_1) ? "true" : "false"); + fprintf(stdout, " 'amd_sse4_2': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2) ? "true" : "false"); + fprintf(stdout, " 'amd_clmul': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL) ? "true" : "false"); + fprintf(stdout, " 'amd_vpclmulqdq': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ) ? "true" : "false"); + fprintf(stdout, " 'amd_avx2': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_AVX2) ? "true" : "false"); + fprintf(stdout, " 'amd_avx512': %s,\n", aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) ? "true" : "false"); + fprintf(stdout, " 'amd_bmi2': %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_BMI2) ? "true" : "false"); + fprintf(stdout, " }\n"); + fprintf(stdout, "}\n"); aws_system_environment_release(env); aws_logger_clean_up(&logger); diff --git a/cmake/AwsFeatureTests.cmake b/cmake/AwsFeatureTests.cmake index 60a548cfd..813a3c9f8 100644 --- a/cmake/AwsFeatureTests.cmake +++ b/cmake/AwsFeatureTests.cmake @@ -17,6 +17,10 @@ if(MINGW) set(USE_CPU_EXTENSIONS OFF) endif() +if (USE_CPU_EXTENSIONS) + set(AWS_USE_CPU_EXTENSIONS ON) +endif() + if(NOT CMAKE_CROSSCOMPILING) check_c_source_runs(" #include @@ -54,6 +58,15 @@ check_c_source_compiles(" } " AWS_ARCH_INTEL) +check_c_source_compiles(" + int main() { +#if !(defined(__x86_64__) || defined(_M_X64)) +# error \"not intel\" +#endif + return 0; + } +" AWS_ARCH_INTEL_X64) + check_c_source_compiles(" int main() { #if !(defined(__aarch64__) || defined(_M_ARM64)) diff --git a/cmake/AwsSIMD.cmake b/cmake/AwsSIMD.cmake index 6ba9f236e..65fce96c7 100644 --- a/cmake/AwsSIMD.cmake +++ b/cmake/AwsSIMD.cmake @@ -4,35 +4,39 @@ include(CheckCCompilerFlag) include(CheckIncludeFile) +if (MSVC) + set(AWS_AVX2_FLAG "/arch:AVX2") + set(AWS_AVX512_FLAG "/arch:AVX512") + set(AWS_AVX512vL_FLAG "") + set(AWS_CLMUL_FLAG "") + set(AWS_SSE4_2_FLAG "") + set(AWS_ARMv8_1_FLAG "/arch:arm8.1") + set(WERROR_FLAG "") +else() + set(AWS_AVX2_FLAG "-mavx -mavx2") + set(AWS_AVX512_FLAG "-mavx512f -mvpclmulqdq") + set(AWS_AVX512vL_FLAG "-mavx512vl") + set(AWS_CLMUL_FLAG "-mpclmul") + set(AWS_SSE4_2_FLAG "-msse4.2") + set(AWS_ARMv8_1_FLAG "-march=armv8-a+crc+crypto -mtune=neoverse-v1") + set(WERROR_FLAG "-Werror") +endif() + if (USE_CPU_EXTENSIONS) - if (MSVC) - check_c_compiler_flag("/arch:AVX2" HAVE_M_AVX2_FLAG) - if (HAVE_M_AVX2_FLAG) - set(AVX_CFLAGS "/arch:AVX2") - endif() - else() - check_c_compiler_flag(-mavx2 HAVE_M_AVX2_FLAG) - if (HAVE_M_AVX2_FLAG) - set(AVX_CFLAGS "-mavx -mavx2") - endif() + set(AVX_CFLAGS ${AWS_SSE4_2_FLAG}) + + check_c_compiler_flag(${AWS_AVX2_FLAG} HAVE_M_AVX2_FLAG) + if (HAVE_M_AVX2_FLAG) + set(AVX_CFLAGS "${AWS_AVX2_FLAG} ${AVX_CFLAGS}") endif() - if (MSVC) - check_c_compiler_flag("/arch:AVX512" HAVE_M_AVX512_FLAG) - if (HAVE_M_AVX512_FLAG) - # docs imply AVX512 brings in AVX2. And it will compile, but it will break at runtime on - # instructions such as _mm256_load_si256(). Leave it on. - set(AVX_CFLAGS "/arch:AVX512 /arch:AVX2") - endif() - else() - check_c_compiler_flag("-mavx512f -mvpclmulqdq" HAVE_M_AVX512_FLAG) - if (HAVE_M_AVX512_FLAG) - set(AVX_CFLAGS "-mavx512f -mvpclmulqdq -mpclmul -mavx -mavx2 -msse4.2") - endif() + check_c_compiler_flag("${AWS_AVX512_FLAG} ${AWS_CLMUL_FLAG}" HAVE_M_AVX512_FLAG) + if (HAVE_M_AVX512_FLAG) + set(AVX_CFLAGS "${AWS_AVX512_FLAG} ${AWS_CLMUL_FLAG} ${AVX_CFLAGS}") endif() set(old_flags "${CMAKE_REQUIRED_FLAGS}") - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${AVX_CFLAGS}") + set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${AVX_CFLAGS} ${WERROR_FLAG}") check_c_source_compiles(" #include @@ -68,7 +72,35 @@ if (USE_CPU_EXTENSIONS) return (int)_mm256_extract_epi64(vec, 2); }" AWS_HAVE_MM256_EXTRACT_EPI64) + check_c_source_compiles(" + #include + #include + int main() { + __m128i a = _mm_setzero_si128(); + __m128i b = _mm_setzero_si128(); + __m128i result = _mm_clmulepi64_si128(a, b, 0x00); + (void)result; + return 0; + }" AWS_HAVE_CLMUL) + + set(CMAKE_REQUIRED_FLAGS "${old_flags} ${AWS_ARMv8_1_FLAG} ${WERROR_FLAG}") + check_c_source_compiles(" + #include + int main() { + int crc = __crc32d(0, 1); + return 0; + }" AWS_HAVE_ARM32_CRC) + + check_c_source_compiles(" + #include + int main() { + _Atomic int var = 0; + atomic_fetch_add_explicit(&var, 1, memory_order_relaxed); + return 0; + }" AWS_HAVE_ARMv8_1) + set(CMAKE_REQUIRED_FLAGS "${old_flags}") + endif() # USE_CPU_EXTENSIONS # The part where the definition is added to the compiler flags has been moved to config.h.in @@ -80,6 +112,23 @@ endif() # USE_CPU_EXTENSIONS function(simd_add_source_avx target) foreach(file ${ARGN}) target_sources(${target} PRIVATE ${file}) - set_source_files_properties(${file} PROPERTIES COMPILE_FLAGS "${AVX_CFLAGS}") + set_source_files_properties(${file} PROPERTIES COMPILE_FLAGS " ${AVX_CFLAGS}") endforeach() endfunction(simd_add_source_avx) + +# The part where the definition is added to the compiler flags has been moved to config.h.in +# see git history for more details. + +# Adds compiler flags to the source and adds the source to target. +# Unfortunately the flags have to be passed as strings. Predefined flags are +# at the top of this file. +# Usage: simd_append_source_and_features(target file1.c ${AWS_AVX512_FLAG} ${AWS_AVX2_FLAG} ...) +function(simd_append_source_and_features target file) + set(CC_FLAGS "") + foreach(flag ${ARGN}) + set(CC_FLAGS "${CC_FLAGS} ${flag}") + endforeach() + + target_sources(${target} PRIVATE ${file}) + set_source_files_properties(${file} PROPERTIES COMPILE_FLAGS " ${CC_FLAGS}") +endfunction(simd_append_source_and_features) diff --git a/include/aws/common/config.h.in b/include/aws/common/config.h.in index d3dff3af2..381d99c99 100644 --- a/include/aws/common/config.h.in +++ b/include/aws/common/config.h.in @@ -22,5 +22,12 @@ #cmakedefine AWS_HAVE_AVX2_INTRINSICS #cmakedefine AWS_HAVE_AVX512_INTRINSICS #cmakedefine AWS_HAVE_MM256_EXTRACT_EPI64 +#cmakedefine AWS_HAVE_CLMUL +#cmakedefine AWS_HAVE_ARM32_CRC +#cmakedefine AWS_HAVE_ARMv8_1 +#cmakedefine AWS_ARCH_ARM64 +#cmakedefine AWS_ARCH_INTEL +#cmakedefine AWS_ARCH_INTEL_X64 +#cmakedefine AWS_USE_CPU_EXTENSIONS #endif diff --git a/include/aws/common/cpuid.h b/include/aws/common/cpuid.h index 84024a36b..9ab7d5059 100644 --- a/include/aws/common/cpuid.h +++ b/include/aws/common/cpuid.h @@ -18,6 +18,8 @@ enum aws_cpu_feature_name { AWS_CPU_FEATURE_ARM_CRC, AWS_CPU_FEATURE_BMI2, AWS_CPU_FEATURE_VPCLMULQDQ, + AWS_CPU_FEATURE_ARM_PMULL, + AWS_CPU_FEATURE_ARM_CRYPTO, AWS_CPU_FEATURE_COUNT, }; diff --git a/source/arch/arm/asm/cpuid.c b/source/arch/arm/auxv/cpuid.c similarity index 86% rename from source/arch/arm/asm/cpuid.c rename to source/arch/arm/auxv/cpuid.c index 6a306df98..10499da73 100644 --- a/source/arch/arm/asm/cpuid.c +++ b/source/arch/arm/auxv/cpuid.c @@ -29,7 +29,9 @@ struct cap_bits { # if (defined(__aarch64__)) struct cap_bits s_check_cap[AWS_CPU_FEATURE_COUNT] = { - [AWS_CPU_FEATURE_ARM_CRC] = {0, 1 << 7 /* HWCAP_CRC */}, + [AWS_CPU_FEATURE_ARM_CRC] = {0, 1 << 7 /* HWCAP_CRC32 */}, + [AWS_CPU_FEATURE_ARM_PMULL] = {0, 1 << 4 /* HWCAP_PMULL */}, + [AWS_CPU_FEATURE_ARM_CRYPTO] = {0, 1 << 3 /* HWCAP_AES */}, }; # else struct cap_bits s_check_cap[AWS_CPU_FEATURE_COUNT] = { @@ -67,6 +69,10 @@ bool aws_cpu_has_feature(enum aws_cpu_feature_name feature_name) { switch (feature_name) { case AWS_CPU_FEATURE_ARM_CRC: +# if (defined(__aarch64__)) + case AWS_CPU_FEATURE_ARM_PMULL: + case AWS_CPU_FEATURE_ARM_CRYPTO: +# endif // (defined(__aarch64__)) return s_hwcap[s_check_cap[feature_name].cap] & s_check_cap[feature_name].bit; default: return false; diff --git a/source/arch/arm/darwin/cpuid.c b/source/arch/arm/darwin/cpuid.c new file mode 100644 index 000000000..7552d4220 --- /dev/null +++ b/source/arch/arm/darwin/cpuid.c @@ -0,0 +1,40 @@ +/* + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"). + * You may not use this file except in compliance with the License. + * A copy of the License is located at + * + * http://aws.amazon.com/apache2.0 + * + * or in the "license" file accompanying this file. This file is distributed + * on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either + * express or implied. See the License for the specific language governing + * permissions and limitations under the License. + */ + +#include + +#include + +bool aws_cpu_has_feature(enum aws_cpu_feature_name feature_name) { + int64_t ret = 0; + size_t size = sizeof(ret); + + switch (feature_name) { + case AWS_CPU_FEATURE_ARM_PMULL: + if (sysctlbyname("hw.optional.arm.FEAT_PMULL", &ret, &size, NULL, 0) != -1) { + return ret == 1; + } + case AWS_CPU_FEATURE_ARM_CRC: + if (sysctlbyname("hw.optional.armv8_crc32", &ret, &size, NULL, 0) != -1) { + return ret == 1; + } + case AWS_CPU_FEATURE_ARM_CRYPTO: + if (sysctlbyname("hw.optional.arm.FEAT_AES", &ret, &size, NULL, 0) != -1) { + return ret == 1; + } + default: + return false; + } +} diff --git a/source/arch/arm/msvc/cpuid.c b/source/arch/arm/windows/cpuid.c similarity index 57% rename from source/arch/arm/msvc/cpuid.c rename to source/arch/arm/windows/cpuid.c index c10c5d15c..b7da0053f 100644 --- a/source/arch/arm/msvc/cpuid.c +++ b/source/arch/arm/windows/cpuid.c @@ -13,9 +13,18 @@ * permissions and limitations under the License. */ +#include #include -#include bool aws_cpu_has_feature(enum aws_cpu_feature_name feature_name) { - return false; + switch (feature_name) { + case AWS_CPU_FEATURE_ARM_CRC: + return IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE) != 0; + // this is the best we've got on windows as they don't separate PMULL and AES from each other. + case AWS_CPU_FEATURE_ARM_PMULL: + case AWS_CPU_FEATURE_ARM_CRYPTO: + return IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) != 0; + default: + return false; + } } diff --git a/source/arch/intel/cpuid.c b/source/arch/intel/cpuid.c index 44fdff078..465fccd17 100644 --- a/source/arch/intel/cpuid.c +++ b/source/arch/intel/cpuid.c @@ -116,8 +116,8 @@ static bool s_has_bmi2(void) { static bool s_has_vpclmulqdq(void) { uint32_t abcd[4]; /* Check VPCLMULQDQ: - * CPUID.(EAX=07H, ECX=0H):ECX.VPCLMULQDQ[bit 20]==1 */ - uint32_t vpclmulqdq_mask = (1 << 20); + * CPUID.(EAX=07H, ECX=0H):ECX.VPCLMULQDQ[bit 10]==1 */ + uint32_t vpclmulqdq_mask = (1 << 10); aws_run_cpuid(7, 0, abcd); if ((abcd[2] & vpclmulqdq_mask) != vpclmulqdq_mask) { return false;