forked from awslabs/aws-checksums
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
AVX512 and VPCLMULQDQ based CRC-32 and CRC-32C
This implementation is based on crc32_refl_by16_vclmul_avx512 in https://github.com/intel/intel-ipsec-mb/ with some optimizations. Some of the code is based on awslabs#72.
- Loading branch information
Showing
7 changed files
with
581 additions
and
43 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
26 changes: 26 additions & 0 deletions
26
include/aws/checksums/private/intel/crc32c_compiler_shims.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
/** | ||
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
* SPDX-License-Identifier: Apache-2.0. | ||
*/ | ||
|
||
#include <aws/checksums/private/crc_priv.h> | ||
|
||
#include <aws/common/config.h> | ||
#include <nmmintrin.h> | ||
|
||
/*
 * Select the word size used when feeding data to the CRC32 instruction:
 * 64-bit words (_mm_crc32_u64) on 64-bit targets, 32-bit words
 * (_mm_crc32_u32) everywhere else.
 *
 * The PowerPC test uses the compiler-predefined spelling __ppc64__ (two
 * trailing underscores); the single-underscore spelling is never defined.
 */
#if defined(_WIN64) || defined(__x86_64__) || defined(__ppc64__)
typedef uint64_t *slice_ptr_type;
typedef uint64_t slice_ptr_int_type;
#    define crc_intrin_fn _mm_crc32_u64
#else
typedef uint32_t *slice_ptr_type;
typedef uint32_t slice_ptr_int_type;
#    define crc_intrin_fn _mm_crc32_u32
#endif
|
||
/* CRC-32C back end; always declared by this header. */
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t crc);

/* AVX512 back ends; declared only when the build defines AWS_HAVE_AVX512_INTRINSICS. */
#ifdef AWS_HAVE_AVX512_INTRINSICS
uint32_t aws_checksums_crc32_avx512(const uint8_t *input, int length, uint32_t crc);
uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc);
#endif
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
/** | ||
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. | ||
* SPDX-License-Identifier: Apache-2.0. | ||
*/ | ||
#include <aws/checksums/private/intel/crc32c_compiler_shims.h> | ||
#ifdef _MSC_VER | ||
# include <intrin.h> | ||
#else | ||
# include <cpuid.h> | ||
#endif | ||
|
||
/* Becomes true once aws_checksums_hw_detect() has filled in the flags below. */
static bool detection_performed = false;
/* CPUID leaf 1, ECX bit 20. */
static bool detected_sse42 = false;
/* CPUID leaf 1, ECX bit 1. */
static bool detected_clmul = false;
#ifdef AWS_HAVE_AVX512_INTRINSICS
/* VPCLMULQDQ together with AVX512F, AVX512DQ, AVX512BW and AVX512VL (CPUID leaf 7). */
static bool detected_vpclmulqdq = false;
#endif
|
||
static void aws_checksums_hw_detect(void) | ||
{ | ||
#ifdef _MSC_VER | ||
int regs[4]; | ||
__cpuid(regs, 1); | ||
uint32_t ecx = regs[2]; | ||
#else | ||
uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0; | ||
__cpuid(1, reax, rebx, recx, redx); | ||
#endif | ||
detected_sse42 = ecx & 1U << 20; | ||
detected_clmul = ecx & 1U << 1; | ||
|
||
#ifdef AWS_HAVE_AVX512_INTRINSICS | ||
# ifdef _MSC_VER | ||
__cpuidex(regs, 7, 0); | ||
uint32_t ebx = regs[1]; | ||
ecx = regs[2]; | ||
# else | ||
__cpuid_count(7, 0, eax, ebx, ecx, edx); | ||
# endif | ||
detected_vpclmulqdq = ecx & 1U<<10/*VPCLMULQDQ*/ && | ||
!(~ebx & ((1U<<16/*AVX512F*/ | 1U<<17/*AVX512DQ*/ | | ||
1U<<30/*AVX512BW*/ | 1U<<31/*AVX512VL*/))); | ||
#endif | ||
|
||
/* Simply setting the flag true to skip HW detection next time | ||
Not using memory barriers since the worst that can | ||
happen is a fallback to the non HW accelerated code. */ | ||
detection_performed = true; | ||
} | ||
|
||
/* | ||
* Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and | ||
* PCLMULQDQ machine instructions (if present). | ||
* Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction. | ||
* Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent | ||
* call. | ||
*/ | ||
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { | ||
|
||
if (AWS_UNLIKELY(!detection_performed)) { | ||
aws_checksums_hw_detect(); | ||
} | ||
|
||
#ifdef AWS_HAVE_AVX512_INTRINSICS | ||
if (detected_vpclmulqdq) { | ||
return aws_checksums_crc32c_avx512(inputr, length, crc); | ||
} | ||
#endif | ||
|
||
/* this is the entry point. We should only do the bit flip once. It should not be done for the subfunctions and | ||
* branches.*/ | ||
uint32_t crc = ~previousCrc32; | ||
|
||
/* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ | ||
if (length < (int)sizeof(slice_ptr_int_type)) { | ||
while (length-- > 0) { | ||
crc = (uint32_t)_mm_crc32_u8(crc, *input++); | ||
} | ||
return ~crc; | ||
} | ||
|
||
/* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */ | ||
int input_alignment = (uintptr_t)(input)&0x7; | ||
|
||
/* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */ | ||
int leading = (8 - input_alignment) & 0x7; | ||
|
||
/* reduce the length by the leading unaligned bytes we are about to process */ | ||
length -= leading; | ||
|
||
/* spin through the leading unaligned input bytes (if any) one-by-one */ | ||
while (leading-- > 0) { | ||
crc = (uint32_t)_mm_crc32_u8(crc, *input++); | ||
} | ||
|
||
if (detected_sse42 && detected_clmul) { | ||
return aws_checksums_crc32c_sse42(input, length, crc); | ||
} | ||
|
||
/* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */ | ||
while (length >= (int)sizeof(slice_ptr_int_type)) { | ||
crc = (uint32_t)crc_intrin_fn(crc, *input); | ||
input += sizeof(slice_ptr_int_type); | ||
length -= (int)sizeof(slice_ptr_int_type); | ||
} | ||
|
||
/* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */ | ||
while (length-- > 0) { | ||
crc = (uint32_t)_mm_crc32_u8(crc, *input); | ||
input++; | ||
} | ||
|
||
return ~crc; | ||
} | ||
|
||
/*
 * Computes the CRC-32 of the given buffer.
 * When built with AWS_HAVE_AVX512_INTRINSICS and the CPU reports the required features, the work is delegated to
 * aws_checksums_crc32_avx512; otherwise the software implementation aws_checksums_crc32_sw is used.
 *
 * input:         bytes to checksum
 * length:        number of bytes at input
 * previousCrc32: running CRC from an earlier call, forwarded unchanged to whichever implementation is selected
 *
 * Returns the value produced by the selected implementation.
 */
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
#ifdef AWS_HAVE_AVX512_INTRINSICS
    if (AWS_UNLIKELY(!detection_performed)) {
        aws_checksums_hw_detect();
    }

    if (detected_vpclmulqdq) {
        /* NOTE(review): forwards the same un-modified CRC that the software fallback receives;
         * confirm that aws_checksums_crc32_avx512 expects it in that form. */
        return aws_checksums_crc32_avx512(input, length, previousCrc32);
    }
#endif
    return aws_checksums_crc32_sw(input, length, previousCrc32);
}
Oops, something went wrong.