Skip to content

Commit

Permalink
AVX512 and VPCLMULQDQ based CRC-32 and CRC-32C
Browse files Browse the repository at this point in the history
This implementation is based on crc32_refl_by16_vclmul_avx512
in https://github.com/intel/intel-ipsec-mb/ with some optimizations.

Some of the code is based on awslabs#72.
  • Loading branch information
dr-m committed May 11, 2024
1 parent 0884586 commit 092f12d
Show file tree
Hide file tree
Showing 7 changed files with 581 additions and 43 deletions.
53 changes: 45 additions & 8 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ string(REPLACE ";" "${AWS_MODULE_DIR};" AWS_MODULE_PATH "${CMAKE_PREFIX_PATH}${A
# Append that generated list to the module search path
list(APPEND CMAKE_MODULE_PATH ${AWS_MODULE_PATH})

include(AwsSIMD)
include(AwsCFlags)
include(AwsCheckHeaders)
include(AwsSharedLibSetup)
Expand Down Expand Up @@ -58,17 +59,48 @@ file(GLOB AWS_ARCH_SRC
)

if (USE_CPU_EXTENSIONS)
if(AWS_ARCH_INTEL)
# First, check if inline assembly is available. Inline assembly can also be supported by MSVC if the compiler in use is Clang.
if(AWS_HAVE_GCC_INLINE_ASM)
file(GLOB AWS_ARCH_SRC
"source/intel/asm/*.c"
if (AWS_ARCH_INTEL)
file (GLOB AWS_ARCH_INTEL_SRC
"source/intel/*.c"
)

if (AWS_HAVE_AVX512_INTRINSICS)
if (MSVC)
file(GLOB AWS_ARCH_INTRIN_SRC
"source/intel/intrin/*.c"
"source/intel/visualc/*.c"
)
elseif (MSVC)
file(GLOB AWS_ARCH_SRC
else()
file(GLOB AWS_ARCH_INTRIN_SRC
"source/intel/intrin/*.c"
)
endif()
else()
if (MSVC)
file(GLOB AWS_ARCH_INTRIN_SRC
"source/intel/visualc/*.c"
)
endif()
endif()

source_group("Source Files\\intel" FILES ${AWS_ARCH_INTEL_SRC})
source_group("Source Files\\intel\\intrin" FILES ${AWS_ARCH_INTRIN_SRC})

if (AWS_HAVE_GCC_INLINE_ASM)
file(GLOB AWS_ARCH_ASM_SRC
"source/intel/asm/*.c"
)

file(GLOB AWS_ARCH_SRC
${AWS_ARCH_INTEL_SRC}
${AWS_ARCH_INTRIN_SRC}
${AWS_ARCH_ASM_SRC}
)
else()
file(GLOB AWS_ARCH_SRC
${AWS_ARCH_INTEL_SRC}
${AWS_ARCH_INTRIN_SRC}
)
source_group("Source Files\\intel\\visualc" FILES ${AWS_ARCH_SRC})
endif()
endif()

Expand Down Expand Up @@ -114,6 +146,7 @@ file(GLOB CHECKSUMS_COMBINED_SRC


add_library(${PROJECT_NAME} ${CHECKSUMS_COMBINED_HEADERS} ${CHECKSUMS_COMBINED_SRC})

aws_set_common_properties(${PROJECT_NAME})
aws_prepare_symbol_visibility_args(${PROJECT_NAME} "AWS_CHECKSUMS")
aws_check_headers(${PROJECT_NAME} ${AWS_CHECKSUMS_HEADERS})
Expand All @@ -123,6 +156,10 @@ aws_add_sanitizers(${PROJECT_NAME})
# We are not ABI stable yet
set_target_properties(${PROJECT_NAME} PROPERTIES VERSION 1.0.0)

# When CPU extensions are enabled on an Intel target, pass the collected
# architecture-specific sources (AWS_ARCH_SRC) to simd_add_source_avx for this
# library target.
# NOTE(review): simd_add_source_avx is presumably provided by the AwsSIMD module
# included above and presumably applies the SIMD compile flags to these sources
# -- confirm against that module.
if (USE_CPU_EXTENSIONS AND AWS_ARCH_INTEL)
simd_add_source_avx(${PROJECT_NAME} ${AWS_ARCH_SRC})
endif()

target_include_directories(${PROJECT_NAME} PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>)
Expand Down
26 changes: 26 additions & 0 deletions include/aws/checksums/private/intel/crc32c_compiler_shims.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/**
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0.
 */

#ifndef AWS_CHECKSUMS_PRIVATE_INTEL_CRC32C_COMPILER_SHIMS_H
#define AWS_CHECKSUMS_PRIVATE_INTEL_CRC32C_COMPILER_SHIMS_H

#include <aws/checksums/private/crc_priv.h>

#include <aws/common/config.h>
#include <nmmintrin.h>

/*
 * Select the widest CRC32 intrinsic for the target:
 *  - 64-bit targets consume the input in uint64_t slices via _mm_crc32_u64,
 *  - everything else consumes it in uint32_t slices via _mm_crc32_u32.
 * defined() is used so that macros that are absent on a given compiler do not
 * trigger -Wundef diagnostics.
 */
#if defined(_WIN64) || defined(__x86_64__) || defined(__ppc64__)
typedef uint64_t *slice_ptr_type;
typedef uint64_t slice_ptr_int_type;
#    define crc_intrin_fn _mm_crc32_u64
#else
typedef uint32_t *slice_ptr_type;
typedef uint32_t slice_ptr_int_type;
#    define crc_intrin_fn _mm_crc32_u32
#endif

#ifdef AWS_HAVE_AVX512_INTRINSICS
/* AVX512/VPCLMULQDQ based implementations; only declared when the build detected AVX512 intrinsics. */
uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc);
uint32_t aws_checksums_crc32_avx512(const uint8_t *input, int length, uint32_t crc);
#endif

/* CRC32c helper that the hardware entry point hands the remaining buffer and running crc to. */
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t crc);

#endif /* AWS_CHECKSUMS_PRIVATE_INTEL_CRC32C_COMPILER_SHIMS_H */
22 changes: 9 additions & 13 deletions source/intel/asm/crc32c_sse42_asm.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* SPDX-License-Identifier: Apache-2.0.
*/

#include <aws/checksums/private/crc_priv.h>
#include <aws/checksums/private/intel/crc32c_compiler_shims.h>

#include <aws/common/cpuid.h>

Expand Down Expand Up @@ -283,7 +283,7 @@ static bool detected_clmul = false;
* Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent
* call.
*/
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) {

if (AWS_UNLIKELY(!detection_performed)) {
detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL);
Expand All @@ -293,7 +293,8 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev
detection_performed = true;
}

uint32_t crc = ~previousCrc32;
/* this is called by a higher-level shim and previousCRC32 is already ~ */
uint32_t crc = previousCrc32;

/* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
if (AWS_UNLIKELY(length < 8)) {
Expand Down Expand Up @@ -358,22 +359,17 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev

return ~crc;
}
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
return aws_checksums_crc32_sw(input, length, previousCrc32);
}

# if defined(__clang__)
# pragma clang diagnostic pop
# endif

#else
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
return aws_checksums_crc32_sw(input, length, previousCrc32);
}

uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
return aws_checksums_crc32c_sw(input, length, previousCrc32);
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) {
    /*
     * This routine is one stage of a larger computation, so the incoming crc must not be
     * bit flipped again. The software implementation, however, doubles as a standalone
     * entry point and performs the flip itself. Pre-flip the value here so that the flip
     * done inside the software implementation restores it.
     */
    const uint32_t pre_flipped = ~previousCrc32;

    return aws_checksums_crc32c_sw(input, length, pre_flipped);
}

#endif
/* clang-format on */
127 changes: 127 additions & 0 deletions source/intel/crc_hw.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
/**
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.
*/
#include <aws/checksums/private/intel/crc32c_compiler_shims.h>
#ifdef _MSC_VER
# include <intrin.h>
#else
# include <cpuid.h>
#endif

static bool detection_performed;
static bool detected_sse42;
static bool detected_clmul;
#ifdef AWS_HAVE_AVX512_INTRINSICS
static bool detected_vpclmulqdq;
#endif

/*
 * Queries CPUID and caches the result in the file-level detected_* flags:
 *  - leaf 1, ECX bit 20 -> detected_sse42
 *  - leaf 1, ECX bit 1  -> detected_clmul
 *  - leaf 7 subleaf 0 (only when AWS_HAVE_AVX512_INTRINSICS is defined):
 *    ECX bit 10 (VPCLMULQDQ) together with EBX bits 16/17/30/31
 *    (AVX512F/DQ/BW/VL) -> detected_vpclmulqdq
 * Finally sets detection_performed so callers can skip the probe next time.
 *
 * NOTE(review): only CPUID feature bits are examined here; nothing in this
 * function checks OSXSAVE/XGETBV for OS-enabled AVX-512 register state --
 * confirm whether that is guaranteed elsewhere.
 */
static void aws_checksums_hw_detect(void)
{
#ifdef _MSC_VER
    int regs[4];
    __cpuid(regs, 1);
    uint32_t ecx = (uint32_t)regs[2];
#else
    uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
    __cpuid(1, eax, ebx, ecx, edx);
#endif
    /* Compare against zero explicitly so the result is a clean boolean regardless of how bool is defined. */
    detected_sse42 = (ecx & (1U << 20)) != 0;
    detected_clmul = (ecx & (1U << 1)) != 0;

#ifdef AWS_HAVE_AVX512_INTRINSICS
#    ifdef _MSC_VER
    __cpuidex(regs, 7, 0);
    uint32_t ebx = (uint32_t)regs[1];
    ecx = (uint32_t)regs[2];
#    else
    __cpuid_count(7, 0, eax, ebx, ecx, edx);
#    endif
    /* All four AVX512 subsets must be reported in addition to VPCLMULQDQ. */
    const uint32_t avx512_required =
        (1U << 16) /*AVX512F*/ | (1U << 17) /*AVX512DQ*/ | (1U << 30) /*AVX512BW*/ | (1U << 31) /*AVX512VL*/;
    detected_vpclmulqdq =
        (ecx & (1U << 10) /*VPCLMULQDQ*/) != 0 && (ebx & avx512_required) == avx512_required;
#endif

    /* Simply setting the flag true to skip HW detection next time
       Not using memory barriers since the worst that can
       happen is a fallback to the non HW accelerated code. */
    detection_performed = true;
}

/*
* Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and
* PCLMULQDQ machine instructions (if present).
* Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction.
* Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent
* call.
*/
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {

if (AWS_UNLIKELY(!detection_performed)) {
aws_checksums_hw_detect();
}

#ifdef AWS_HAVE_AVX512_INTRINSICS
if (detected_vpclmulqdq) {
return aws_checksums_crc32c_avx512(inputr, length, crc);
}
#endif

/* this is the entry point. We should only do the bit flip once. It should not be done for the subfunctions and
* branches.*/
uint32_t crc = ~previousCrc32;

/* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
if (length < (int)sizeof(slice_ptr_int_type)) {
while (length-- > 0) {
crc = (uint32_t)_mm_crc32_u8(crc, *input++);
}
return ~crc;
}

/* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */
int input_alignment = (uintptr_t)(input)&0x7;

/* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */
int leading = (8 - input_alignment) & 0x7;

/* reduce the length by the leading unaligned bytes we are about to process */
length -= leading;

/* spin through the leading unaligned input bytes (if any) one-by-one */
while (leading-- > 0) {
crc = (uint32_t)_mm_crc32_u8(crc, *input++);
}

if (detected_sse42 && detected_clmul) {
return aws_checksums_crc32c_sse42(input, length, crc);
}

/* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */
while (length >= (int)sizeof(slice_ptr_int_type)) {
crc = (uint32_t)crc_intrin_fn(crc, *input);
input += sizeof(slice_ptr_int_type);
length -= (int)sizeof(slice_ptr_int_type);
}

/* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */
while (length-- > 0) {
crc = (uint32_t)_mm_crc32_u8(crc, *input);
input++;
}

return ~crc;
}

/*
 * Computes the CRC32 of the specified data buffer.
 * When AVX512 intrinsics were compiled in and VPCLMULQDQ + AVX512 were detected at runtime, the buffer is handed
 * to aws_checksums_crc32_avx512; in every other case the software implementation is used.
 *
 * input:          bytes to checksum.
 * length:         number of bytes at input.
 * previousCrc32:  running CRC from a previous call, forwarded unchanged to the selected implementation.
 * Returns the updated CRC32.
 */
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
#ifdef AWS_HAVE_AVX512_INTRINSICS
    if (AWS_UNLIKELY(!detection_performed)) {
        aws_checksums_hw_detect();
    }

    if (detected_vpclmulqdq) {
        /* NOTE(review): forwarded with the same convention as the software fallback below; confirm
         * aws_checksums_crc32_avx512 expects the un-inverted running CRC. */
        return aws_checksums_crc32_avx512(input, length, previousCrc32);
    }
#endif
    return aws_checksums_crc32_sw(input, length, previousCrc32);
}
Loading

0 comments on commit 092f12d

Please sign in to comment.