forked from awslabs/aws-checksums
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
AVX512 and VPCLMULQDQ based CRC-32 and CRC-32C
This implementation is based on crc32_refl_by16_vclmul_avx512 in https://github.com/intel/intel-ipsec-mb/ with some optimizations. Changes to CMakeLists.txt and source/intel/asm/crc32c_sse42_asm.c are based on awslabs#72. This also fixes a bug in aws_checksums_crc32c_hw() when 128-bit pclmul is not available: crc_intrin_fn was being invoked on bytes instead of 32-bit or 64-bit words. The aws-checksums-tests suite was extended to cover all SIMD implementations. Note: The availability of the Intel CRC-32C instructions is checked as part of testing AWS_CPU_FEATURE_SSE_4_2. Both ISA extensions were introduced in the Intel Nehalem microarchitecture. To compile this, https://github.com/awslabs/aws-c-common must be installed and CMAKE_MODULE_PATH must point to it, e.g.: cmake -DCMAKE_MODULE_PATH=/usr/local/lib/cmake. AWS_CPU_FEATURE_AVX512 currently only checks for AVX512F and not the other features that this implementation depends on: AVX512VL, AVX512BW, AVX512DQ. According to https://en.wikipedia.org/wiki/AVX-512#CPUs_with_AVX-512, no CPUs currently exist that support VPCLMULQDQ without also supporting all of those AVX512 features. The architecture target evex512 was introduced as mandatory in GCC 14 and clang 18 as part of introducing the AVX10.1-512 target, which is essentially a new name for a number of AVX512 features. Older compilers do not recognize this target, but they do emit EVEX-encoded instructions.
- Loading branch information
Showing
9 changed files
with
609 additions
and
51 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
26 changes: 26 additions & 0 deletions
26
include/aws/checksums/private/intel/crc32c_compiler_shims.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
/** | ||
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
* SPDX-License-Identifier: Apache-2.0. | ||
*/ | ||
|
||
#include <aws/checksums/private/crc_priv.h> | ||
|
||
#include <aws/common/config.h> | ||
#include <nmmintrin.h> | ||
|
||
/*
 * Selects the widest CRC32C step available for the target:
 *   - slice_ptr_int_type: the integer word fed to one CRC32 instruction,
 *   - slice_ptr_type:     pointer to such a word,
 *   - crc_intrin_fn:      the intrinsic that folds one such word into the CRC.
 * 64-bit x86 targets use 64-bit words; every other target uses 32-bit words.
 */
#if defined(_WIN64) || defined(__x86_64__)
typedef uint64_t slice_ptr_int_type;
typedef uint64_t *slice_ptr_type;
#    define crc_intrin_fn _mm_crc32_u64
#else
typedef uint32_t slice_ptr_int_type;
typedef uint32_t *slice_ptr_type;
#    define crc_intrin_fn _mm_crc32_u32
#endif
|
||
/*
 * AVX-512 / VPCLMULQDQ based CRC-32 and CRC-32C entry points.
 * Only declared when the build system reports AVX-512 intrinsics support.
 * Each takes the input buffer, its length in bytes and the running CRC value.
 */
#if defined(AWS_HAVE_AVX512_INTRINSICS)
uint32_t aws_checksums_crc32_avx512(const uint8_t *input, int length, uint32_t crc);
uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc);
#endif /* AWS_HAVE_AVX512_INTRINSICS */
|
||
/*
 * SSE4.2 based CRC-32C kernel (implemented in another translation unit).
 * input:  buffer to checksum.
 * length: number of bytes available at input.
 * crc:    running CRC value supplied by the caller.
 * NOTE(review): aws_checksums_crc32c_clmul passes an already bit-inverted crc and returns this
 * function's result unchanged, so the final inversion presumably happens here - confirm against
 * the implementation.
 */
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t crc);
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
/** | ||
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
* SPDX-License-Identifier: Apache-2.0. | ||
*/ | ||
#include <aws/checksums/private/intel/crc32c_compiler_shims.h>
#include <aws/common/macros.h>

#include <string.h>
|
||
/*
 * Byte-at-a-time CRC32C for short inputs.
 * input:  buffer to checksum.
 * length: number of bytes to process; zero or negative processes nothing.
 * crc:    running CRC state, already bit-inverted by the caller.
 * Returns the finished checksum, i.e. the bitwise complement of the updated state.
 */
static uint32_t aws_checksums_crc32c_hw_small(const uint8_t *input, int length, uint32_t crc) {
    for (int offset = 0; offset < length; ++offset) {
        crc = (uint32_t)_mm_crc32_u8(crc, input[offset]);
    }
    return ~crc;
}
|
||
/*
 * Consumes the leading bytes of *input one at a time, folding each into crc with the CRC32B
 * instruction, until *input sits on an 8-byte boundary or the buffer is exhausted.
 *
 * input:  in/out cursor; advanced past every byte consumed here.
 * length: in/out count of bytes remaining; reduced by the number of bytes consumed here.
 * crc:    running CRC state (already bit-inverted by the entry point).
 *
 * Returns the updated running CRC state; no final inversion is applied.
 *
 * Never consumes more than *length bytes. The callers only guarantee
 * *length >= sizeof(slice_ptr_int_type), which is 4 on 32-bit builds, while up to 7 bytes may be
 * needed to reach 8-byte alignment; without the clamp below that case reads past the end of the
 * buffer and leaves *length negative.
 */
static uint32_t aws_checksums_crc32c_hw_unaligned(const uint8_t **input, int *length, uint32_t crc) {
    /* Offset of the cursor inside its 8-byte block (the low 3 address bits). */
    int input_alignment = (int)((uintptr_t)(*input) & 0x7);

    /* Bytes needed to reach the next 8-byte boundary: 0 when already aligned, otherwise 1-7. */
    int leading = (8 - input_alignment) & 0x7;

    /* Clamp to what the caller actually has so we never read beyond the buffer. */
    if (leading > *length) {
        leading = *length > 0 ? *length : 0;
    }

    /* Account for the bytes we are about to consume. */
    *length -= leading;

    while (leading-- > 0) {
        crc = (uint32_t)_mm_crc32_u8(crc, *(*input)++);
    }

    return crc;
}
|
||
/* | ||
* Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) instructions. | ||
* Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction. | ||
* Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent | ||
* call. | ||
*/ | ||
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { | ||
|
||
/* this is the entry point. We should only do the bit flip once. It should not be done for the subfunctions and | ||
* branches.*/ | ||
uint32_t crc = ~previousCrc32; | ||
|
||
/* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ | ||
if (length < (int)sizeof(slice_ptr_int_type)) { | ||
return aws_checksums_crc32c_hw_small(input, length, crc); | ||
} | ||
|
||
crc = aws_checksums_crc32c_hw_unaligned(&input, &length, crc); | ||
/* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */ | ||
while (length >= (int)sizeof(slice_ptr_int_type)) { | ||
crc = (uint32_t)crc_intrin_fn(crc, *(const slice_ptr_int_type*) input); | ||
input += sizeof(slice_ptr_int_type); | ||
length -= (int)sizeof(slice_ptr_int_type); | ||
} | ||
|
||
/* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */ | ||
while (length-- > 0) { | ||
crc = (uint32_t)_mm_crc32_u8(crc, *input); | ||
input++; | ||
} | ||
|
||
return ~crc; | ||
} | ||
|
||
/* | ||
* Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and | ||
* PCLMULQDQ machine instructions (if present). | ||
* Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction. | ||
* Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent | ||
* call. | ||
*/ | ||
uint32_t aws_checksums_crc32c_clmul(const uint8_t *input, int length, uint32_t previousCrc32) { | ||
|
||
/* this is the entry point. We should only do the bit flip once. It should not be done for the subfunctions and | ||
* branches.*/ | ||
uint32_t crc = ~previousCrc32; | ||
|
||
/* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ | ||
if (length < (int)sizeof(slice_ptr_int_type)) { | ||
return aws_checksums_crc32c_hw_small(input, length, crc); | ||
} | ||
|
||
crc = aws_checksums_crc32c_hw_unaligned(&input, &length, crc); | ||
|
||
return aws_checksums_crc32c_sse42(input, length, crc); | ||
} | ||
|
||
/*
 * CRC-32 entry point for the "hardware" dispatch slot.
 * This function has no dedicated hardware path of its own: it forwards all arguments unchanged to
 * aws_checksums_crc32_sw and returns that result.
 */
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
    const uint32_t software_result = aws_checksums_crc32_sw(input, length, previousCrc32);
    return software_result;
}
Oops, something went wrong.