Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AVX512 and VPCLMULQDQ based CRC-32 and CRC-32C #90

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 44 additions & 8 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -58,17 +58,48 @@ file(GLOB AWS_ARCH_SRC
)

if (USE_CPU_EXTENSIONS)
if(AWS_ARCH_INTEL)
# First, check if inline assembly is available. Inline assembly can also be supported by MSVC if the compiler in use is Clang.
if(AWS_HAVE_GCC_INLINE_ASM)
file(GLOB AWS_ARCH_SRC
"source/intel/asm/*.c"
if (AWS_ARCH_INTEL)
file (GLOB AWS_ARCH_INTEL_SRC
"source/intel/*.c"
)

if (AWS_HAVE_AVX512_INTRINSICS)
if (MSVC)
file(GLOB AWS_ARCH_INTRIN_SRC
"source/intel/intrin/*.c"
"source/intel/visualc/*.c"
)
elseif (MSVC)
file(GLOB AWS_ARCH_SRC
else()
file(GLOB AWS_ARCH_INTRIN_SRC
"source/intel/intrin/*.c"
)
endif()
else()
if (MSVC)
file(GLOB AWS_ARCH_INTRIN_SRC
"source/intel/visualc/*.c"
)
endif()
endif()

source_group("Source Files\\intel" FILES ${AWS_ARCH_INTEL_SRC})
source_group("Source Files\\intel\\intrin" FILES ${AWS_ARCH_INTRIN_SRC})

if (AWS_HAVE_GCC_INLINE_ASM)
file(GLOB AWS_ARCH_ASM_SRC
"source/intel/asm/*.c"
)

file(GLOB AWS_ARCH_SRC
${AWS_ARCH_INTEL_SRC}
${AWS_ARCH_INTRIN_SRC}
${AWS_ARCH_ASM_SRC}
)
else()
file(GLOB AWS_ARCH_SRC
${AWS_ARCH_INTEL_SRC}
${AWS_ARCH_INTRIN_SRC}
)
source_group("Source Files\\intel\\visualc" FILES ${AWS_ARCH_SRC})
endif()
endif()

Expand Down Expand Up @@ -114,6 +145,7 @@ file(GLOB CHECKSUMS_COMBINED_SRC


add_library(${PROJECT_NAME} ${CHECKSUMS_COMBINED_HEADERS} ${CHECKSUMS_COMBINED_SRC})

aws_set_common_properties(${PROJECT_NAME})
aws_prepare_symbol_visibility_args(${PROJECT_NAME} "AWS_CHECKSUMS")
aws_check_headers(${PROJECT_NAME} ${AWS_CHECKSUMS_HEADERS})
Expand All @@ -123,6 +155,10 @@ aws_add_sanitizers(${PROJECT_NAME})
# We are not ABI stable yet
set_target_properties(${PROJECT_NAME} PROPERTIES VERSION 1.0.0)

# source/intel/crc_hw.c uses the SSE4.2 CRC32 intrinsics (_mm_crc32_u8 and friends).
# GCC and Clang (including clang-cl) only accept those intrinsics when SSE4.2 code
# generation is enabled for the translation unit, so the flag is scoped to that one file.
# Microsoft's cl.exe has no -msse4.2 switch and exposes the intrinsics unconditionally,
# so the flag is skipped there to avoid an "unknown option" diagnostic.
if (USE_CPU_EXTENSIONS AND AWS_ARCH_INTEL)
    if ((NOT MSVC) OR (CMAKE_C_COMPILER_ID MATCHES "Clang"))
        set_source_files_properties(source/intel/crc_hw.c PROPERTIES COMPILE_FLAGS -msse4.2)
    endif()
endif()

target_include_directories(${PROJECT_NAME} PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>)
Expand Down
13 changes: 11 additions & 2 deletions include/aws/checksums/private/crc_priv.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,20 @@ AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_sw(const uint8_t *input, int leng
/* Computes the Castagnoli CRC32c (iSCSI) using a (slow) reference implementation. */
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_sw(const uint8_t *input, int length, uint32_t previousCrc32c);

/* Computes CRC32 (Ethernet, gzip, et. al.) using crc instructions. */
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_hw(const uint8_t *data, int length, uint32_t previousCrc32);

/* Computes CRC32 (Ethernet, gzip, et. al.) using AVX512 and VPCLMULQDQ. */
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_avx512(const uint8_t *data, int length, uint32_t previousCrc32);

/* Computes the Castagnoli CRC32c (iSCSI). */
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_hw(const uint8_t *data, int length, uint32_t previousCrc32);

/* Computes CRC32 (Ethernet, gzip, et. al.) using crc instructions. */
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_hw(const uint8_t *data, int length, uint32_t previousCrc32);
/* Computes the Castagnoli CRC32c (iSCSI) using 128-bit PCLMULQDQ. */
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_clmul(const uint8_t *data, int length, uint32_t previousCrc32);

/* Computes the Castagnoli CRC32c (iSCSI) using AVX512 and VPCLMULQDQ. */
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_avx512(const uint8_t *data, int length, uint32_t previousCrc32);

#ifdef __cplusplus
}
Expand Down
26 changes: 26 additions & 0 deletions include/aws/checksums/private/intel/crc32c_compiler_shims.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
/**
 * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
 * SPDX-License-Identifier: Apache-2.0.
 */

#ifndef AWS_CHECKSUMS_PRIVATE_INTEL_CRC32C_COMPILER_SHIMS_H
#define AWS_CHECKSUMS_PRIVATE_INTEL_CRC32C_COMPILER_SHIMS_H

/*
 * Private helpers shared by the Intel CRC32c implementations.
 *
 * The include guard matters here: this header defines typedefs, and repeating a typedef
 * is an error before C11, so including the header twice in one translation unit must be a no-op.
 */

#include <aws/checksums/private/crc_priv.h>

#include <aws/common/config.h>
#include <nmmintrin.h>

/*
 * Word-size selection for the CRC32 instruction:
 *   slice_ptr_int_type - the integer type consumed per CRC step,
 *   slice_ptr_type     - pointer to that type,
 *   crc_intrin_fn      - the intrinsic that folds one such word into the running CRC.
 * 64-bit targets use 8-byte words (_mm_crc32_u64); everything else uses 4-byte words (_mm_crc32_u32).
 */
#if defined _WIN64 || defined __x86_64__
typedef uint64_t *slice_ptr_type;
typedef uint64_t slice_ptr_int_type;
#    define crc_intrin_fn _mm_crc32_u64
#else
typedef uint32_t *slice_ptr_type;
typedef uint32_t slice_ptr_int_type;
#    define crc_intrin_fn _mm_crc32_u32
#endif

#ifdef AWS_HAVE_AVX512_INTRINSICS
/* AVX512 based variants; only declared when the build detected AVX512 intrinsic support. */
uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc);
uint32_t aws_checksums_crc32_avx512(const uint8_t *input, int length, uint32_t crc);
#endif

/* Lower-level CRC32c routine that the higher-level shims call with a running crc value. */
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t crc);

#endif /* AWS_CHECKSUMS_PRIVATE_INTEL_CRC32C_COMPILER_SHIMS_H */
35 changes: 29 additions & 6 deletions source/crc.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,45 @@ static uint32_t (*s_crc32_fn_ptr)(const uint8_t *input, int length, uint32_t pre

uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previousCrc32) {
if (AWS_UNLIKELY(!s_crc32_fn_ptr)) {
if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) {
#ifdef AWS_HAVE_ARM32_CRC
if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC))
s_crc32_fn_ptr = aws_checksums_crc32_hw;
} else {
#elif defined AWS_HAVE_AVX512_INTRINSICS
if (aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) &&
aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ))
s_crc32_fn_ptr = aws_checksums_crc32_avx512;
#else
if (0) {}
#endif
else
s_crc32_fn_ptr = aws_checksums_crc32_sw;
}
}
return s_crc32_fn_ptr(input, length, previousCrc32);
}

uint32_t aws_checksums_crc32c(const uint8_t *input, int length, uint32_t previousCrc32) {
if (AWS_UNLIKELY(!s_crc32c_fn_ptr)) {
if (aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2) || aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) {
#ifdef AWS_HAVE_ARM32_CRC
if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC))
s_crc32c_fn_ptr = aws_checksums_crc32c_hw;
} else {
s_crc32c_fn_ptr = aws_checksums_crc32c_sw;
#else
# ifdef AWS_HAVE_AVX512_INTRINSICS
if (aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) &&
aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ))
s_crc32c_fn_ptr = aws_checksums_crc32c_avx512;
else
# endif
if (aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2)) {
# ifdef AWS_HAVE_CLMUL
if (aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL))
s_crc32c_fn_ptr = aws_checksums_crc32c_clmul;
else
# endif
s_crc32c_fn_ptr = aws_checksums_crc32c_hw;
}
#endif
else
s_crc32c_fn_ptr = aws_checksums_crc32c_sw;
}
return s_crc32c_fn_ptr(input, length, previousCrc32);
}
22 changes: 9 additions & 13 deletions source/intel/asm/crc32c_sse42_asm.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
* SPDX-License-Identifier: Apache-2.0.
*/

#include <aws/checksums/private/crc_priv.h>
#include <aws/checksums/private/intel/crc32c_compiler_shims.h>

#include <aws/common/cpuid.h>

Expand Down Expand Up @@ -283,7 +283,7 @@ static bool detected_clmul = false;
* Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent
* call.
*/
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) {

if (AWS_UNLIKELY(!detection_performed)) {
detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL);
Expand All @@ -293,7 +293,8 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev
detection_performed = true;
}

uint32_t crc = ~previousCrc32;
/* this is called by a higher-level shim and previousCRC32 is already ~ */
uint32_t crc = previousCrc32;

/* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
if (AWS_UNLIKELY(length < 8)) {
Expand Down Expand Up @@ -358,22 +359,17 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev

return ~crc;
}
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
return aws_checksums_crc32_sw(input, length, previousCrc32);
}

# if defined(__clang__)
# pragma clang diagnostic pop
# endif

#else
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
return aws_checksums_crc32_sw(input, length, previousCrc32);
}

uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
return aws_checksums_crc32c_sw(input, length, previousCrc32);
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) {
    /*
     * Fallback used when the inline-assembly implementation is not compiled in.
     *
     * Callers hand us a crc that is already bit-inverted, because this routine is one step of a larger
     * computation. The software implementation is also a standalone entry point and performs that
     * inversion itself, so undo the caller's inversion first and let the software routine redo it.
     */
    const uint32_t uninverted_crc = ~previousCrc32;
    return aws_checksums_crc32c_sw(input, length, uninverted_crc);
}

#endif
/* clang-format on */
92 changes: 92 additions & 0 deletions source/intel/crc_hw.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/**
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.
*/
#include <aws/checksums/private/intel/crc32c_compiler_shims.h>
#include <aws/common/macros.h>

/*
 * Folds `length` bytes of `input` into `crc` one byte at a time with the CRC32B instruction and
 * returns the bit-inverted result. `crc` is used as-is, so the caller supplies the already-inverted
 * running value. A zero or negative `length` processes nothing and simply returns ~crc.
 */
static uint32_t aws_checksums_crc32c_hw_small(const uint8_t *input, int length, uint32_t crc) {
    int offset;
    for (offset = 0; offset < length; ++offset) {
        crc = (uint32_t)_mm_crc32_u8(crc, input[offset]);
    }
    return ~crc;
}

/*
 * Consumes the bytes in front of the first 8-byte-aligned address of *input, one at a time, with the
 * CRC32B instruction so that the caller can continue with aligned word loads.
 *
 *   input  - in/out: advanced past every byte consumed here.
 *   length - in/out: reduced by the number of bytes consumed here.
 *   crc    - running CRC value; used as-is, no bit inversion is applied in this helper.
 *
 * Returns the updated running CRC value.
 *
 * At most *length bytes are consumed. Up to 7 bytes can precede the aligned address, while callers only
 * guarantee length >= sizeof(slice_ptr_int_type), which is 4 on 32-bit builds. Without the clamp, a
 * short misaligned buffer would be read past its end and *length would become negative.
 */
static uint32_t aws_checksums_crc32c_hw_unaligned(const uint8_t **input, int *length, uint32_t crc) {
    /* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */
    int input_alignment = (int)((uintptr_t)(*input) & 0x7);

    /* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */
    int leading = (8 - input_alignment) & 0x7;

    /* Never walk past the end of the buffer: consume no more than the bytes that are actually available */
    if (leading > *length) {
        leading = (*length > 0) ? *length : 0;
    }

    /* reduce the length by the leading unaligned bytes we are about to process */
    *length -= leading;

    /* spin through the leading unaligned input bytes (if any) one-by-one */
    while (leading-- > 0) {
        crc = (uint32_t)_mm_crc32_u8(crc, *(*input)++);
    }

    return crc;
}

/*
 * Castagnoli CRC32c (iSCSI) of a data buffer using the Intel CRC32 instruction.
 *
 * The bulk of the buffer is consumed one slice_ptr_int_type word at a time through crc_intrin_fn;
 * bytes in front of the first aligned word and bytes left over after the last full word are consumed
 * with the single-byte CRC32B instruction.
 *
 * Pass 0 as previousCrc32 to start a new checksum, or the value returned by an earlier call to continue
 * a running checksum.
 */
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
    const int word_bytes = (int)sizeof(slice_ptr_int_type);

    /* Entry point: the bit flip happens exactly once here, never inside the helpers' inputs. */
    uint32_t running = ~previousCrc32;

    /* Only buffers holding at least one full word go through the alignment + word-at-a-time path. */
    if (length >= word_bytes) {
        running = aws_checksums_crc32c_hw_unaligned(&input, &length, running);

        /* Fold in the remaining (aligned) full words. */
        for (; length >= word_bytes; length -= word_bytes, input += word_bytes) {
            running = (uint32_t)crc_intrin_fn(running, *(const slice_ptr_int_type *)input);
        }
    }

    /* Whatever is left (the whole input for small buffers) goes byte-by-byte; the helper also applies
     * the final bit flip. */
    return aws_checksums_crc32c_hw_small(input, length, running);
}

/*
 * Castagnoli CRC32c (iSCSI) of a data buffer for CPUs that also offer PCLMULQDQ.
 *
 * Buffers shorter than one slice_ptr_int_type word are handled byte-by-byte. Otherwise the leading
 * unaligned bytes are consumed with the CRC32B instruction and the rest of the buffer is handed to
 * aws_checksums_crc32c_sse42 together with the already-inverted running crc.
 *
 * Pass 0 as previousCrc32 to start a new checksum, or the value returned by an earlier call to continue
 * a running checksum.
 */
uint32_t aws_checksums_crc32c_clmul(const uint8_t *input, int length, uint32_t previousCrc32) {
    /* Entry point: the bit flip is done once here and must not be repeated by the routines below. */
    uint32_t running = ~previousCrc32;

    if (length >= (int)sizeof(slice_ptr_int_type)) {
        /* Get the pointer aligned first, then let the bulk routine finish the job. */
        running = aws_checksums_crc32c_hw_unaligned(&input, &length, running);
        return aws_checksums_crc32c_sse42(input, length, running);
    }

    /* Small input: skip the alignment work and go one byte at a time. */
    return aws_checksums_crc32c_hw_small(input, length, running);
}

/*
 * CRC32 (Ethernet, gzip) entry point for the Intel build. This file provides no instruction-based
 * variant for this polynomial, so the call is forwarded unchanged to the software implementation.
 */
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
    const uint32_t software_crc = aws_checksums_crc32_sw(input, length, previousCrc32);
    return software_crc;
}
Loading