From 8733d2f98632ec7a3d45349e544b809c5daff029 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?=
Date: Mon, 8 Jul 2024 10:43:07 +0300
Subject: [PATCH] AVX512 and VPCLMULQDQ based CRC-32 and CRC-32C

This implementation is based on crc32_refl_by16_vclmul_avx512 in
https://github.com/intel/intel-ipsec-mb/ with some optimizations.
Changes to CMakeLists.txt and source/intel/asm/crc32c_sse42_asm.c are
based on #72.

This also fixes a bug in aws_checksums_crc32c_hw() when 128-bit pclmul
is not available: crc_intrin_fn was being invoked on bytes instead of
32-bit or 64-bit words. The aws-checksums-tests program was extended to
cover all SIMD implementations.

Note: The availability of the Intel CRC-32C instructions is checked as
part of testing AWS_CPU_FEATURE_SSE_4_2. Both ISA extensions were
introduced in the Intel Nehalem microarchitecture.

To compile this, https://github.com/awslabs/aws-c-common must be
installed and CMAKE_MODULE_PATH must point to it, e.g.:
cmake -DCMAKE_MODULE_PATH=/usr/local/lib/cmake.

AWS_CPU_FEATURE_AVX512 currently only checks for AVX512F, not the other
features that this implementation depends on: AVX512VL, AVX512BW, and
AVX512DQ. According to https://en.wikipedia.org/wiki/AVX-512#CPUs_with_AVX-512
no CPUs currently exist that support VPCLMULQDQ without also supporting
all of those AVX512 features.

The architecture target evex512 was introduced (and made mandatory) in
GCC 14 and clang 18 as part of adding the AVX10.1-512 target, which is
essentially a new name for a set of AVX512 features. Older compilers do
not recognize this target, but they do emit EVEX-encoded instructions.
---
 CMakeLists.txt                              |  52 ++-
 include/aws/checksums/private/crc_priv.h    |  13 +-
 .../private/intel/crc32c_compiler_shims.h   |  26 ++
 source/crc.c                                |  35 +-
 source/intel/asm/crc32c_sse42_asm.c         |  22 +-
 source/intel/crc_hw.c                       |  92 +++++
 source/intel/intrin/crc32_avx512.c          | 346 ++++++++++++++++++
 source/intel/visualc/visualc_crc32c_sse42.c |  28 +-
 tests/crc_test.c                            |  46 +++
 9 files changed, 609 insertions(+), 51 deletions(-)
 create mode 100644 include/aws/checksums/private/intel/crc32c_compiler_shims.h
 create mode 100644 source/intel/crc_hw.c
 create mode 100644 source/intel/intrin/crc32_avx512.c

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a21bc36..61197e0 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -58,17 +58,48 @@ file(GLOB AWS_ARCH_SRC ) if (USE_CPU_EXTENSIONS) - if(AWS_ARCH_INTEL) - # First, check if inline assembly is available. Inline assembly can also be supported by MSVC if the compiler in use is Clang. 
- if(AWS_HAVE_GCC_INLINE_ASM) - file(GLOB AWS_ARCH_SRC - "source/intel/asm/*.c" + if (AWS_ARCH_INTEL) + file (GLOB AWS_ARCH_INTEL_SRC + "source/intel/*.c" + ) + + if (AWS_HAVE_AVX512_INTRINSICS) + if (MSVC) + file(GLOB AWS_ARCH_INTRIN_SRC + "source/intel/intrin/*.c" + "source/intel/visualc/*.c" ) - elseif (MSVC) - file(GLOB AWS_ARCH_SRC + else() + file(GLOB AWS_ARCH_INTRIN_SRC + "source/intel/intrin/*.c" + ) + endif() + else() + if (MSVC) + file(GLOB AWS_ARCH_INTRIN_SRC "source/intel/visualc/*.c" + ) + endif() + endif() + + source_group("Source Files\\intel" FILES ${AWS_ARCH_INTEL_SRC}) + source_group("Source Files\\intel\\intrin" FILES ${AWS_ARCH_INTRIN_SRC}) + + if (AWS_HAVE_GCC_INLINE_ASM) + file(GLOB AWS_ARCH_ASM_SRC + "source/intel/asm/*.c" + ) + + file(GLOB AWS_ARCH_SRC + ${AWS_ARCH_INTEL_SRC} + ${AWS_ARCH_INTRIN_SRC} + ${AWS_ARCH_ASM_SRC} + ) + else() + file(GLOB AWS_ARCH_SRC + ${AWS_ARCH_INTEL_SRC} + ${AWS_ARCH_INTRIN_SRC} ) - source_group("Source Files\\intel\\visualc" FILES ${AWS_ARCH_SRC}) endif() endif() @@ -114,6 +145,7 @@ file(GLOB CHECKSUMS_COMBINED_SRC add_library(${PROJECT_NAME} ${CHECKSUMS_COMBINED_HEADERS} ${CHECKSUMS_COMBINED_SRC}) + aws_set_common_properties(${PROJECT_NAME}) aws_prepare_symbol_visibility_args(${PROJECT_NAME} "AWS_CHECKSUMS") aws_check_headers(${PROJECT_NAME} ${AWS_CHECKSUMS_HEADERS}) @@ -123,6 +155,10 @@ aws_add_sanitizers(${PROJECT_NAME}) # We are not ABI stable yet set_target_properties(${PROJECT_NAME} PROPERTIES VERSION 1.0.0) +if (USE_CPU_EXTENSIONS AND AWS_ARCH_INTEL) + SET_SOURCE_FILES_PROPERTIES(source/intel/crc_hw.c PROPERTIES COMPILE_FLAGS -msse4.2) +endif() + target_include_directories(${PROJECT_NAME} PUBLIC $ $) diff --git a/include/aws/checksums/private/crc_priv.h b/include/aws/checksums/private/crc_priv.h index 221c86f..bccdde1 100644 --- a/include/aws/checksums/private/crc_priv.h +++ b/include/aws/checksums/private/crc_priv.h @@ -20,11 +20,20 @@ AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_sw(const uint8_t *input, int leng /* Computes the Castagnoli CRC32c (iSCSI) using a (slow) reference implementation. */ AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_sw(const uint8_t *input, int length, uint32_t previousCrc32c); +/* Computes CRC32 (Ethernet, gzip, et. al.) using crc instructions. */ +AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_hw(const uint8_t *data, int length, uint32_t previousCrc32); + +/* Computes CRC32 (Ethernet, gzip, et. al.) using AVX512 and VPCLMULQDQ. */ +AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_avx512(const uint8_t *data, int length, uint32_t previousCrc32); + /* Computes the Castagnoli CRC32c (iSCSI). */ AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_hw(const uint8_t *data, int length, uint32_t previousCrc32); -/* Computes CRC32 (Ethernet, gzip, et. al.) using crc instructions. */ -AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_hw(const uint8_t *data, int length, uint32_t previousCrc32); +/* Computes the Castagnoli CRC32c (iSCSI) using 128-bit PCLMULQDQ. */ +AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_clmul(const uint8_t *data, int length, uint32_t previousCrc32); + +/* Computes the Castagnoli CRC32c (iSCSI) using AVX512 and VPCLMULQDQ. 
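+   Callers must first confirm AVX512 and VPCLMULQDQ support, as aws_checksums_crc32c() in source/crc.c does via aws_cpu_has_feature().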
*/ +AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_avx512(const uint8_t *data, int length, uint32_t previousCrc32); #ifdef __cplusplus } diff --git a/include/aws/checksums/private/intel/crc32c_compiler_shims.h b/include/aws/checksums/private/intel/crc32c_compiler_shims.h new file mode 100644 index 0000000..5c8812d --- /dev/null +++ b/include/aws/checksums/private/intel/crc32c_compiler_shims.h @@ -0,0 +1,26 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ + +#include + +#include +#include + +#if defined _WIN64 || defined __x86_64__ +typedef uint64_t *slice_ptr_type; +typedef uint64_t slice_ptr_int_type; +# define crc_intrin_fn _mm_crc32_u64 +#else +typedef uint32_t *slice_ptr_type; +typedef uint32_t slice_ptr_int_type; +# define crc_intrin_fn _mm_crc32_u32 +#endif + +#ifdef AWS_HAVE_AVX512_INTRINSICS +uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc); +uint32_t aws_checksums_crc32_avx512(const uint8_t *input, int length, uint32_t crc); +#endif + +uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t crc); diff --git a/source/crc.c b/source/crc.c index f5d3e80..4dce008 100644 --- a/source/crc.c +++ b/source/crc.c @@ -12,22 +12,45 @@ static uint32_t (*s_crc32_fn_ptr)(const uint8_t *input, int length, uint32_t pre uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previousCrc32) { if (AWS_UNLIKELY(!s_crc32_fn_ptr)) { - if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) { +#ifdef AWS_HAVE_ARM32_CRC + if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) s_crc32_fn_ptr = aws_checksums_crc32_hw; - } else { +#elif defined AWS_HAVE_AVX512_INTRINSICS + if (aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) && + aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ)) + s_crc32_fn_ptr = aws_checksums_crc32_avx512; +#else + if (0) {} +#endif + else s_crc32_fn_ptr = aws_checksums_crc32_sw; - } } return s_crc32_fn_ptr(input, length, previousCrc32); } uint32_t aws_checksums_crc32c(const uint8_t *input, int length, uint32_t previousCrc32) { if (AWS_UNLIKELY(!s_crc32c_fn_ptr)) { - if (aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2) || aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) { +#ifdef AWS_HAVE_ARM32_CRC + if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) s_crc32c_fn_ptr = aws_checksums_crc32c_hw; - } else { - s_crc32c_fn_ptr = aws_checksums_crc32c_sw; +#else +# ifdef AWS_HAVE_AVX512_INTRINSICS + if (aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) && + aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ)) + s_crc32c_fn_ptr = aws_checksums_crc32c_avx512; + else +# endif + if (aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2)) { +# ifdef AWS_HAVE_CLMUL + if (aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL)) + s_crc32c_fn_ptr = aws_checksums_crc32c_clmul; + else +# endif + s_crc32c_fn_ptr = aws_checksums_crc32c_hw; } +#endif + else + s_crc32c_fn_ptr = aws_checksums_crc32c_sw; } return s_crc32c_fn_ptr(input, length, previousCrc32); } diff --git a/source/intel/asm/crc32c_sse42_asm.c b/source/intel/asm/crc32c_sse42_asm.c index 35e1d09..da56e90 100644 --- a/source/intel/asm/crc32c_sse42_asm.c +++ b/source/intel/asm/crc32c_sse42_asm.c @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0. */ -#include +#include #include @@ -283,7 +283,7 @@ static bool detected_clmul = false; * Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent * call. 
*/ -uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { +uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) { if (AWS_UNLIKELY(!detection_performed)) { detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL); @@ -293,7 +293,8 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev detection_performed = true; } - uint32_t crc = ~previousCrc32; + /* this is called by a higher-level shim and previousCRC32 is already ~ */ + uint32_t crc = previousCrc32; /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ if (AWS_UNLIKELY(length < 8)) { @@ -358,22 +359,17 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev return ~crc; } -uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32_sw(input, length, previousCrc32); -} # if defined(__clang__) # pragma clang diagnostic pop # endif #else -uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32_sw(input, length, previousCrc32); -} - -uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32c_sw(input, length, previousCrc32); +uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) { + /* these are nested in a larger computation. As a result the crc doesn't need to be bit flipped. + However, the sw function is also used as a standalone implementation that does need to do the + bit flip. So go ahead and flip it here, so the sw implementation flips it back. */ + return aws_checksums_crc32c_sw(input, length, ~previousCrc32); } - #endif /* clang-format on */ diff --git a/source/intel/crc_hw.c b/source/intel/crc_hw.c new file mode 100644 index 0000000..da229d8 --- /dev/null +++ b/source/intel/crc_hw.c @@ -0,0 +1,92 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ +#include +#include + +static uint32_t aws_checksums_crc32c_hw_small(const uint8_t *input, int length, uint32_t crc) { + while (length-- > 0) { + crc = (uint32_t)_mm_crc32_u8(crc, *input++); + } + return ~crc; +} + +static uint32_t aws_checksums_crc32c_hw_unaligned(const uint8_t **input, int *length, uint32_t crc) { + /* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */ + int input_alignment = (uintptr_t)(*input)&0x7; + + /* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */ + int leading = (8 - input_alignment) & 0x7; + + /* reduce the length by the leading unaligned bytes we are about to process */ + *length -= leading; + + /* spin through the leading unaligned input bytes (if any) one-by-one */ + while (leading-- > 0) { + crc = (uint32_t)_mm_crc32_u8(crc, *(*input)++); + } + + return crc; +} + +/* + * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) instructions. + * Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction. + * Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent + * call. + */ +uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { + + /* this is the entry point. 
We should only do the bit flip once. It should not be done for the subfunctions and + * branches.*/ + uint32_t crc = ~previousCrc32; + + /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ + if (length < (int)sizeof(slice_ptr_int_type)) { + return aws_checksums_crc32c_hw_small(input, length, crc); + } + + crc = aws_checksums_crc32c_hw_unaligned(&input, &length, crc); + /* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */ + while (length >= (int)sizeof(slice_ptr_int_type)) { + crc = (uint32_t)crc_intrin_fn(crc, *(const slice_ptr_int_type*) input); + input += sizeof(slice_ptr_int_type); + length -= (int)sizeof(slice_ptr_int_type); + } + + /* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */ + while (length-- > 0) { + crc = (uint32_t)_mm_crc32_u8(crc, *input); + input++; + } + + return ~crc; +} + +/* + * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and + * PCLMULQDQ machine instructions (if present). + * Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction. + * Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent + * call. + */ +uint32_t aws_checksums_crc32c_clmul(const uint8_t *input, int length, uint32_t previousCrc32) { + + /* this is the entry point. We should only do the bit flip once. It should not be done for the subfunctions and + * branches.*/ + uint32_t crc = ~previousCrc32; + + /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ + if (length < (int)sizeof(slice_ptr_int_type)) { + return aws_checksums_crc32c_hw_small(input, length, crc); + } + + crc = aws_checksums_crc32c_hw_unaligned(&input, &length, crc); + + return aws_checksums_crc32c_sse42(input, length, crc); +} + +uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { + return aws_checksums_crc32_sw(input, length, previousCrc32); +} diff --git a/source/intel/intrin/crc32_avx512.c b/source/intel/intrin/crc32_avx512.c new file mode 100644 index 0000000..ac4b18d --- /dev/null +++ b/source/intel/intrin/crc32_avx512.c @@ -0,0 +1,346 @@ +/** + * SPDX-License-Identifier: Apache-2.0. + */ + +#include +#include + +#ifdef _MSC_VER +# include +# define USE_VPCLMULQDQ /* nothing */ +#else +# include +# if __GNUC__ >= 14 || (defined __clang_major__ && __clang_major__ >= 18) +# define TARGET "pclmul,evex512,avx512f,avx512dq,avx512bw,avx512vl,vpclmulqdq" +# else +# define TARGET "pclmul,avx512f,avx512dq,avx512bw,avx512vl,vpclmulqdq" +# endif +# define USE_VPCLMULQDQ __attribute__((target(TARGET))) +#endif + +#include + +#ifdef _MSC_VER +/* MSVC does not seem to define this intrinsic for vmovdqa */ +# define _mm_load_epi32(x) (*(const __m128i)(x)) +# define ALIGNAS(n) __declspec(align(n)) +#elif __STDC_VERSION__ >= 201100L +# define ALIGNAS(n) _Alignas(n) +#else +# define ALIGNAS(n) __attribute__((__aligned__((n)))) +#endif + +/* + This implementation is based on crc32_refl_by16_vclmul_avx512 + in https://github.com/intel/intel-ipsec-mb/ with some optimizations. + The // comments in crc32_avx512() correspond to assembler labels. 
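+  Large inputs are folded 256 bytes per iteration in four 512-bit accumulators, which are reduced to two, then one, and finally to a single 128-bit value; the tail is folded 16 bytes at a time, any trailing 1 to 15 bytes are handled with masked loads and byte shuffles, and the final 32-bit CRC is produced by a Barrett reduction.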
+*/ + +/** table of constants corresponding to a CRC polynomial up to degree 32 */ +struct crc32_tab +{ + const uint64_t b2048[2], b1024[2]; + ALIGNAS(64) const uint64_t b896[6]; /* includes b768, b640 */ + const uint64_t b512[2]; + const uint64_t b384[2], b256[2], b128[2], zeropad_for_b384[2]; + const uint64_t b64[2], b32[2]; +}; + +/** ISO 3309 CRC-32 (reflected polynomial 0x04C11DB7); zlib crc32() */ +ALIGNAS(64) static const struct crc32_tab refl32 = { + { 0x00000000e95c1271, 0x00000000ce3371cb }, + { 0x00000000910eeec1, 0x0000000033fff533 }, + { 0x000000000cbec0ed, 0x0000000031f8303f, + 0x0000000057c54819, 0x00000000df068dc2, + 0x00000000ae0b5394, 0x000000001c279815 }, + { 0x000000001d9513d7, 0x000000008f352d95 }, + { 0x00000000af449247, 0x000000003db1ecdc }, + { 0x0000000081256527, 0x00000000f1da05aa }, + { 0x00000000ccaa009e, 0x00000000ae689191 }, + { 0, 0 }, + { 0x00000000ccaa009e, 0x00000000b8bc6765 }, + { 0x00000001f7011640, 0x00000001db710640 } +}; + +/** Castagnoli CRC-32C (reflected polynomial 0x1EDC6F41) */ +ALIGNAS(64) static const struct crc32_tab refl32c = { + { 0x00000000b9e02b86, 0x00000000dcb17aa4 }, + { 0x000000000d3b6092, 0x000000006992cea2 }, + { 0x0000000047db8317, 0x000000002ad91c30, + 0x000000000715ce53, 0x00000000c49f4f67, + 0x0000000039d3b296, 0x00000000083a6eec }, + { 0x000000009e4addf8, 0x00000000740eef02 }, + { 0x00000000ddc0152b, 0x000000001c291d04 }, + { 0x00000000ba4fc28e, 0x000000003da6d0cb }, + { 0x00000000493c7d27, 0x00000000f20c0dfe }, + { 0, 0 }, + { 0x00000000493c7d27, 0x00000000dd45aab8 }, + { 0x00000000dea713f0, 0x0000000105ec76f0 } +}; + +#define TERNARY_XOR3 (0xf0^0xcc^0xaa) +#define TERNARY_XNOR3 (0xff-TERNARY_XOR3) +#define TERNARY_XOR2_AND ((0xf0^0xcc)&0xaa) + +USE_VPCLMULQDQ +/** @return a^b^c */ +static inline __m128i xor3_128(__m128i a, __m128i b, __m128i c) +{ + return _mm_ternarylogic_epi64(a, b, c, TERNARY_XOR3); +} + +USE_VPCLMULQDQ +/** @return ~(a^b^c) */ +static inline __m128i xnor3_128(__m128i a, __m128i b, __m128i c) +{ + return _mm_ternarylogic_epi64(a, b, c, TERNARY_XNOR3); +} + +USE_VPCLMULQDQ +/** @return a^b^c */ +static inline __m512i xor3_512(__m512i a, __m512i b, __m512i c) +{ + return _mm512_ternarylogic_epi64(a, b, c, TERNARY_XOR3); +} + +USE_VPCLMULQDQ +/** @return (a^b)&c */ +static inline __m128i xor2_and_128(__m128i a, __m128i b, __m128i c) +{ + return _mm_ternarylogic_epi64(a, b, c, TERNARY_XOR2_AND); +} + +USE_VPCLMULQDQ +/** Load 64 bytes */ +static inline __m512i load512(const uint8_t *b) +{ return _mm512_loadu_epi8(b); } + +USE_VPCLMULQDQ +/** Load 16 bytes */ +static inline __m128i load128(const uint8_t *b) { return _mm_loadu_epi64(b); } + +/** Combine 512 data bits with CRC */ +USE_VPCLMULQDQ +static inline __m512i combine512(__m512i a, __m512i tab, __m512i b) +{ + return xor3_512(b, _mm512_clmulepi64_epi128(a, tab, 0x01), + _mm512_clmulepi64_epi128(a, tab, 0x10)); +} + +#define xor512(a, b) _mm512_xor_epi64(a, b) +#define xor256(a, b) _mm256_xor_epi64(a, b) +#define xor128(a, b) _mm_xor_epi64(a, b) +#define and128(a, b) _mm_and_si128(a, b) + +USE_VPCLMULQDQ +/** Pick and zero-extend 128 bits of a 512-bit vector (vextracti32x4) */ +static inline __m512i extract512_128_3(__m512i a) +{ + return _mm512_zextsi128_si512(_mm512_extracti64x2_epi64(a, 3)); +} + +ALIGNAS(16) static const uint64_t shuffle128[4] = { + 0x8786858483828100, 0x8f8e8d8c8b8a8988, + 0x0706050403020100, 0x000e0d0c0b0a0908 +}; + +static const __mmask16 size_mask[16] = { + 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff, + 0x01ff, 
0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff +}; + +ALIGNAS(16) static const uint64_t shift128[4] = { + 0x8786858483828100, 0x8f8e8d8c8b8a8988, + 0x0706050403020100, 0x000e0d0c0b0a0908 +}; + +static const uint8_t shift_1_to_3_reflect[7 + 11] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 +}; + +USE_VPCLMULQDQ +static uint32_t crc32_avx512(const uint8_t *buf, int size, uint32_t crc, + const struct crc32_tab *tab) +{ + const __m512i crc_in = _mm512_castsi128_si512(_mm_cvtsi32_si128(~crc)), + b512 = _mm512_broadcast_i32x4(_mm_load_epi32(tab->b512)); + __m128i crc_out; + __m512i lo; + + if (size >= 256) { + lo = xor512(load512(buf), crc_in); + __m512i l1 = load512(buf + 64); + + const __m512i b1024 = _mm512_broadcast_i32x4(_mm_load_epi32(&tab->b1024)); + size -= 256; + if (size >= 256) { + __m512i h0 = load512(buf + 128), + hi = load512(buf + 192); + const __m512i b2048 = _mm512_broadcast_i32x4(_mm_load_epi32(&tab->b2048)); + size -= 256; + do { + buf += 256; + lo = combine512(lo, b2048, load512(buf)); + l1 = combine512(l1, b2048, load512(buf + 64)); + h0 = combine512(h0, b2048, load512(buf + 128)); + hi = combine512(hi, b2048, load512(buf + 192)); + size -= 256; + } while (size >= 0); + + buf += 256; + lo = combine512(lo, b1024, h0); + l1 = combine512(l1, b1024, hi); + size += 128; + } else { + do { + buf += 128; + lo = combine512(lo, b1024, load512(buf)); + l1 = combine512(l1, b1024, load512(buf + 64)); + size -= 128; + } while (size >= 0); + + buf += 128; + } + + if (size >= -64) { + size += 128; + lo = combine512(lo, b512, l1); + goto fold_64_B_loop; + } + + const __m512i + b896 = _mm512_load_epi32(&tab->b896), + b384 = _mm512_load_epi32(&tab->b384); + + __m512i c4 = xor3_512(_mm512_clmulepi64_epi128(lo, b896, 1), + _mm512_clmulepi64_epi128(lo, b896, 0x10), + _mm512_clmulepi64_epi128(l1, b384, 1)); + c4 = xor3_512(c4, _mm512_clmulepi64_epi128(l1, b384, 0x10), + extract512_128_3(l1)); + + __m256i c2 = _mm512_castsi512_si256(_mm512_shuffle_i64x2(c4, c4, 0x4e)); + c2 = xor256(c2, _mm512_castsi512_si256(c4)); + crc_out = xor128(_mm256_extracti64x2_epi64(c2, 1), + _mm256_castsi256_si128(c2)); + size += 128 - 16; + goto final_reduction; + } + + __m128i b; + + // less_than_256 + if (size >= 32) { + if (size >= 64) { + lo = xor512(load512(buf), crc_in); + + while (buf += 64, (size -= 64) >= 64) + fold_64_B_loop: + lo = combine512(lo, b512, load512(buf)); + + // reduce_64B + const __m512i b384 = _mm512_load_epi32(&tab->b384); + __m512i crc512 = + xor3_512(_mm512_clmulepi64_epi128(lo, b384, 1), + _mm512_clmulepi64_epi128(lo, b384, 0x10), + extract512_128_3(lo)); + crc512 = xor512(crc512, _mm512_shuffle_i64x2(crc512, crc512, 0x4e)); + const __m256i crc256 = _mm512_castsi512_si256(crc512); + crc_out = xor128(_mm256_extracti64x2_epi64(crc256, 1), + _mm256_castsi256_si128(crc256)); + size -= 16; + } else { + // less_than_64 + crc_out = xor128(load128(buf), + _mm512_castsi512_si128(crc_in)); + buf += 16; + size -= 32; + } + + final_reduction: + b = _mm_load_epi32(&tab->b128); + + while (size >= 0) { + // reduction_loop_16B + crc_out = xor3_128(load128(buf), + _mm_clmulepi64_si128(crc_out, b, 1), + _mm_clmulepi64_si128(crc_out, b, 0x10)); + buf += 16; + size -= 16; + } + // final_reduction_for_128 + + size += 16; + if (size) { + __m128i crc2, d; + get_last_two_xmms: + crc2 = crc_out, d = load128(buf + ssize_t(size) - 16); + __m128i S = load128(((const uint8_t*) shuffle128) + size); + crc_out = _mm_shuffle_epi8(crc_out, S); + S = xor128(S, 
_mm_set1_epi32(0x80808080)); + crc_out = xor3_128(_mm_blendv_epi8(_mm_shuffle_epi8(crc2, S), d, S), + _mm_clmulepi64_si128(crc_out, b, 1), + _mm_clmulepi64_si128(crc_out, b, 0x10)); + } + + __m128i crc_tmp; + done_128: + b = _mm_load_epi32(&tab->b64); + crc_tmp = xor128(_mm_clmulepi64_si128(crc_out, b, 0x00), + _mm_srli_si128(crc_out, 8)); + crc_out = _mm_slli_si128(crc_tmp, 4); + crc_out = _mm_clmulepi64_si128(crc_out, b, 0x10); + crc_out = xor128(crc_out, crc_tmp); + + barrett: + b = _mm_load_epi32(&tab->b32); + crc_tmp = crc_out; + crc_out = and128(crc_out, _mm_set_epi64x(~0ULL, ~0xFFFFFFFFULL)); + crc_out = _mm_clmulepi64_si128(crc_out, b, 0); + crc_out = xor2_and_128(crc_out, crc_tmp, _mm_set_epi64x(0, ~0ULL)); + crc_out = xnor3_128(crc_out, crc_tmp, + _mm_clmulepi64_si128(crc_out, b, 0x10)); + return _mm_extract_epi32(crc_out, 2); + } else { + // less_than_32 + if (size > 0) { + if (size > 16) { + crc_out = xor128(load128(buf), + _mm512_castsi512_si128(crc_in)); + buf += 16; + size -= 16; + b = _mm_load_epi32(&tab->b128); + goto get_last_two_xmms; + } else if (size < 16) { + crc_out = _mm_maskz_loadu_epi8(size_mask[size - 1], buf); + crc_out = xor128(crc_out, _mm512_castsi512_si128(crc_in)); + + if (size >= 4) { + crc_out = _mm_shuffle_epi8(crc_out, load128(((const uint8_t*) + shift128) + size)); + goto done_128; + } else { + // only_less_than_4 + /* Shift, zero-filling 5 to 7 of the 8-byte crc_out */ + crc_out = _mm_shuffle_epi8(crc_out, + load128(shift_1_to_3_reflect + size - 1)); + goto barrett; + } + } else { + crc_out = xor128(load128(buf), _mm512_castsi512_si128(crc_in)); + goto done_128; + } + } else + return crc; + } +} + +uint32_t aws_checksums_crc32_avx512(const uint8_t *input, int length, uint32_t crc) +{ + return crc32_avx512(input, length, crc, &refl32); +} + +uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc) +{ + return crc32_avx512(input, length, crc, &refl32c); +} diff --git a/source/intel/visualc/visualc_crc32c_sse42.c b/source/intel/visualc/visualc_crc32c_sse42.c index ca1aca4..707f2ba 100644 --- a/source/intel/visualc/visualc_crc32c_sse42.c +++ b/source/intel/visualc/visualc_crc32c_sse42.c @@ -3,26 +3,15 @@ * SPDX-License-Identifier: Apache-2.0. */ -#include -#include - -#if defined(_M_X64) || defined(_M_IX86) - -# if defined(_M_X64) -typedef uint64_t *slice_ptr_type; -typedef uint64_t slice_ptr_int_type; -# else -typedef uint32_t *slice_ptr_type; -typedef uint32_t slice_ptr_int_type; -# endif +#include /** * This implements crc32c via the intel sse 4.2 instructions. * This is separate from the straight asm version, because visual c does not allow * inline assembly for x64. */ -uint32_t aws_checksums_crc32c_hw(const uint8_t *data, int length, uint32_t previousCrc32) { - uint32_t crc = ~previousCrc32; +uint32_t aws_checksums_crc32c_sse42(const uint8_t *data, int length, uint32_t previousCrc32) { + uint32_t crc = previousCrc32; int length_to_process = length; slice_ptr_type temp = (slice_ptr_type)data; @@ -54,11 +43,11 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *data, int length, uint32_t previ uint32_t remainder = length_to_process % sizeof(temp); while (slices--) { -# if defined(_M_X64) +#if defined(_M_X64) crc = (uint32_t)_mm_crc32_u64(crc, *temp++); -# else +#else crc = _mm_crc32_u32(crc, *temp++); -# endif +#endif } /* process the remaining parts that can't be done on the slice size. 
*/ @@ -70,8 +59,3 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *data, int length, uint32_t previ return ~crc; } - -uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32_sw(input, length, previousCrc32); -} -#endif /* x64 || x86 */ diff --git a/tests/crc_test.c b/tests/crc_test.c index c975791..79d1318 100644 --- a/tests/crc_test.c +++ b/tests/crc_test.c @@ -5,6 +5,10 @@ #include #include + +#include +#include + #include static const uint8_t DATA_32_ZEROS[32] = {0}; @@ -101,6 +105,30 @@ static int s_test_crc32c(struct aws_allocator *allocator, void *ctx) { res |= s_test_known_crc32c(CRC_FUNC_NAME(aws_checksums_crc32c)); res |= s_test_known_crc32c(CRC_FUNC_NAME(aws_checksums_crc32c_sw)); + struct aws_byte_buf avx_buf; + /* enough for 3 avx512 runs */ + aws_byte_buf_init(&avx_buf, allocator, 768); + aws_device_random_buffer(&avx_buf); + + uint32_t crc = aws_checksums_crc32c_sw(avx_buf.buffer, (int)avx_buf.len, 0); + uint32_t hw_crc = aws_checksums_crc32c_hw(avx_buf.buffer, (int)avx_buf.len, 0); + ASSERT_UINT_EQUALS(hw_crc, crc); +#ifdef AWS_HAVE_CLMUL + if (aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL)) { + uint32_t clmul = aws_checksums_crc32c_clmul(avx_buf.buffer, (int)avx_buf.len, 0); + ASSERT_UINT_EQUALS(clmul, crc); + } +#endif +#ifdef AWS_HAVE_AVX512_INTRINSICS + if (aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) && + aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ)) { + uint32_t clmul2 = aws_checksums_crc32c_avx512(avx_buf.buffer, (int)avx_buf.len, 0); + ASSERT_UINT_EQUALS(clmul2, crc); + } +#endif + + aws_byte_buf_clean_up(&avx_buf); + return res; } AWS_TEST_CASE(test_crc32c, s_test_crc32c) @@ -112,6 +140,24 @@ static int s_test_crc32(struct aws_allocator *allocator, void *ctx) { int res = 0; res |= s_test_known_crc32(CRC_FUNC_NAME(aws_checksums_crc32)); + struct aws_byte_buf avx_buf; + /* enough for 3 avx512 runs */ + aws_byte_buf_init(&avx_buf, allocator, 768); + aws_device_random_buffer(&avx_buf); + + uint32_t crc = aws_checksums_crc32_sw(avx_buf.buffer, (int)avx_buf.len, 0); + uint32_t hw_crc = aws_checksums_crc32_hw(avx_buf.buffer, (int)avx_buf.len, 0); + ASSERT_UINT_EQUALS(hw_crc, crc); +#ifdef AWS_HAVE_AVX512_INTRINSICS + if (aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) && + aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ)) { + uint32_t clmul2 = aws_checksums_crc32_avx512(avx_buf.buffer, (int)avx_buf.len, 0); + ASSERT_UINT_EQUALS(clmul2, crc); + } +#endif + + aws_byte_buf_clean_up(&avx_buf); + return res; } AWS_TEST_CASE(test_crc32, s_test_crc32)
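
A note on the feature check above: AWS_CPU_FEATURE_AVX512 only tests for
AVX512F, and the dispatch in source/crc.c relies on the observation that no
current CPU supports VPCLMULQDQ without also supporting AVX512VL, AVX512BW and
AVX512DQ. For a stricter runtime guard, the following sketch (not part of the
patch; the helper name and the use of GCC/clang's <cpuid.h> are illustrative
assumptions) shows which CPUID bits such a check would examine:

#include <cpuid.h>
#include <stdbool.h>

/* Hypothetical helper, not part of this patch: check every ISA extension that
 * crc32_avx512.c relies on, instead of trusting AVX512F alone.
 * CPUID leaf 7, subleaf 0: EBX bit 16 = AVX512F, bit 17 = AVX512DQ,
 * bit 30 = AVX512BW, bit 31 = AVX512VL; ECX bit 10 = VPCLMULQDQ.
 * A complete check would also verify OS support for ZMM state via XGETBV. */
static bool cpu_supports_crc32_avx512(void) {
    unsigned eax, ebx, ecx, edx;
    if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
        return false; /* CPUID leaf 7 is not available */
    }
    const unsigned ebx_need = (1u << 16) | (1u << 17) | (1u << 30) | (1u << 31);
    return (ebx & ebx_need) == ebx_need && (ecx & (1u << 10)) != 0;
}

Within the patch itself, aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) combined
with AWS_CPU_FEATURE_VPCLMULQDQ remains the portable gate for the AVX512 code
paths; the sketch merely spells out the feature bits that
AWS_CPU_FEATURE_AVX512 does not cover.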