From ba55f96356f16a27733aca5096169cd2fcf5b348 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mamy=20Andr=C3=A9-Ratsimbazafy?= Date: Sun, 24 Nov 2019 11:22:40 +0100 Subject: [PATCH] Remove GEMM benchmarks for the moment --- .gitmodules | 3 - benchmarks/cpuinfo | 1 - benchmarks/cpuinfo.h | 1693 ----------------- benchmarks/cpuinfo.nim | 371 ---- benchmarks/matmul/README.md | 38 - benchmarks/matmul/laser_gemm/gemm.nim | 511 ----- benchmarks/matmul/laser_gemm/gemm_packing.nim | 94 - .../matmul/laser_gemm/gemm_prepacked.nim | 525 ----- .../matmul/laser_gemm_backend/gemm_tiling.nim | 344 ---- .../laser_gemm_backend/gemm_ukernel_avx.nim | 44 - .../laser_gemm_backend/gemm_ukernel_avx2.nim | 35 - .../gemm_ukernel_avx512.nim | 74 - .../gemm_ukernel_avx_fma.nim | 38 - .../gemm_ukernel_dispatch.nim | 125 -- .../gemm_ukernel_generator.nim | 250 --- .../gemm_ukernel_generic.nim | 138 -- .../laser_gemm_backend/gemm_ukernel_sse.nim | 26 - .../laser_gemm_backend/gemm_ukernel_sse2.nim | 129 -- .../gemm_ukernel_sse4_1.nim | 35 - .../matmul/laser_gemm_backend/gemm_utils.nim | 60 - .../matmul/laser_utils/align_unroller.nim | 41 - .../laser_utils/compiler_optim_hints.nim | 149 -- benchmarks/matmul/laser_utils/memory.nim | 20 - benchmarks/matmul/laser_utils/openmp.nim | 386 ---- benchmarks/matmul/laser_utils/simd.nim | 441 ----- benchmarks/matmul/nim.cfg | 13 - benchmarks/matmul/weave_gemm/README.md | 1 - 27 files changed, 5585 deletions(-) delete mode 160000 benchmarks/cpuinfo delete mode 100644 benchmarks/cpuinfo.h delete mode 100644 benchmarks/cpuinfo.nim delete mode 100644 benchmarks/matmul/README.md delete mode 100644 benchmarks/matmul/laser_gemm/gemm.nim delete mode 100644 benchmarks/matmul/laser_gemm/gemm_packing.nim delete mode 100644 benchmarks/matmul/laser_gemm/gemm_prepacked.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_tiling.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx2.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx512.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx_fma.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_dispatch.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_generator.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_generic.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse2.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse4_1.nim delete mode 100644 benchmarks/matmul/laser_gemm_backend/gemm_utils.nim delete mode 100644 benchmarks/matmul/laser_utils/align_unroller.nim delete mode 100644 benchmarks/matmul/laser_utils/compiler_optim_hints.nim delete mode 100644 benchmarks/matmul/laser_utils/memory.nim delete mode 100644 benchmarks/matmul/laser_utils/openmp.nim delete mode 100644 benchmarks/matmul/laser_utils/simd.nim delete mode 100644 benchmarks/matmul/nim.cfg delete mode 100644 benchmarks/matmul/weave_gemm/README.md diff --git a/.gitmodules b/.gitmodules index 471fcd1..e69de29 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "benchmarks/cpuinfo"] - path = benchmarks/cpuinfo - url = https://github.com/pytorch/cpuinfo diff --git a/benchmarks/cpuinfo b/benchmarks/cpuinfo deleted file mode 160000 index d5e37ad..0000000 --- a/benchmarks/cpuinfo +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 
d5e37adf1406cf899d7d9ec1d317c47506ccb970 diff --git a/benchmarks/cpuinfo.h b/benchmarks/cpuinfo.h deleted file mode 100644 index 1abb806..0000000 --- a/benchmarks/cpuinfo.h +++ /dev/null @@ -1,1693 +0,0 @@ -#pragma once -#ifndef CPUINFO_H -#define CPUINFO_H - -#ifndef __cplusplus - #include <stdbool.h> -#endif - -#ifdef __APPLE__ - #include <TargetConditionals.h> -#endif - -#include <stdint.h> - -/* Identify architecture and define corresponding macro */ - -#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(_M_IX86) - #define CPUINFO_ARCH_X86 1 -#endif - -#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) - #define CPUINFO_ARCH_X86_64 1 -#endif - -#if defined(__arm__) || defined(_M_ARM) - #define CPUINFO_ARCH_ARM 1 -#endif - -#if defined(__aarch64__) || defined(_M_ARM64) - #define CPUINFO_ARCH_ARM64 1 -#endif - -#if defined(__PPC64__) || defined(__powerpc64__) || defined(_ARCH_PPC64) - #define CPUINFO_ARCH_PPC64 1 -#endif - -#if defined(__pnacl__) - #define CPUINFO_ARCH_PNACL 1 -#endif - -#if defined(EMSCRIPTEN) - #define CPUINFO_ARCH_ASMJS 1 -#endif - -#if CPUINFO_ARCH_X86 && defined(_MSC_VER) - #define CPUINFO_ABI __cdecl -#elif CPUINFO_ARCH_X86 && defined(__GNUC__) - #define CPUINFO_ABI __attribute__((__cdecl__)) -#else - #define CPUINFO_ABI -#endif - -/* Define other architecture-specific macros as 0 */ - -#ifndef CPUINFO_ARCH_X86 - #define CPUINFO_ARCH_X86 0 -#endif - -#ifndef CPUINFO_ARCH_X86_64 - #define CPUINFO_ARCH_X86_64 0 -#endif - -#ifndef CPUINFO_ARCH_ARM - #define CPUINFO_ARCH_ARM 0 -#endif - -#ifndef CPUINFO_ARCH_ARM64 - #define CPUINFO_ARCH_ARM64 0 -#endif - -#ifndef CPUINFO_ARCH_PPC64 - #define CPUINFO_ARCH_PPC64 0 -#endif - -#ifndef CPUINFO_ARCH_PNACL - #define CPUINFO_ARCH_PNACL 0 -#endif - -#ifndef CPUINFO_ARCH_ASMJS - #define CPUINFO_ARCH_ASMJS 0 -#endif - -#define CPUINFO_CACHE_UNIFIED 0x00000001 -#define CPUINFO_CACHE_INCLUSIVE 0x00000002 -#define CPUINFO_CACHE_COMPLEX_INDEXING 0x00000004 - -struct cpuinfo_cache { - /** Cache size in bytes */ - uint32_t size; - /** Number of ways of associativity */ - uint32_t associativity; - /** Number of sets */ - uint32_t sets; - /** Number of partitions */ - uint32_t partitions; - /** Line size in bytes */ - uint32_t line_size; - /** - * Binary characteristics of the cache (unified cache, inclusive cache, cache with complex indexing). - * - * @see CPUINFO_CACHE_UNIFIED, CPUINFO_CACHE_INCLUSIVE, CPUINFO_CACHE_COMPLEX_INDEXING - */ - uint32_t flags; - /** Index of the first logical processor that shares this cache */ - uint32_t processor_start; - /** Number of logical processors that share this cache */ - uint32_t processor_count; -}; - -struct cpuinfo_trace_cache { - uint32_t uops; - uint32_t associativity; -}; - -#define CPUINFO_PAGE_SIZE_4KB 0x1000 -#define CPUINFO_PAGE_SIZE_1MB 0x100000 -#define CPUINFO_PAGE_SIZE_2MB 0x200000 -#define CPUINFO_PAGE_SIZE_4MB 0x400000 -#define CPUINFO_PAGE_SIZE_16MB 0x1000000 -#define CPUINFO_PAGE_SIZE_1GB 0x40000000 - -struct cpuinfo_tlb { - uint32_t entries; - uint32_t associativity; - uint64_t pages; -}; - -/** Vendor of processor core design */ -enum cpuinfo_vendor { - /** Processor vendor is not known to the library, or the library failed to get vendor information from the OS. */ - cpuinfo_vendor_unknown = 0, - - /* Active vendors of modern CPUs */ - - /** - * Intel Corporation. Vendor of x86, x86-64, IA64, and ARM processor microarchitectures. - * - * Sold its ARM design subsidiary in 2006. The last ARM processor design was released in 2004.
- */ - cpuinfo_vendor_intel = 1, - /** Advanced Micro Devices, Inc. Vendor of x86 and x86-64 processor microarchitectures. */ - cpuinfo_vendor_amd = 2, - /** ARM Holdings plc. Vendor of ARM and ARM64 processor microarchitectures. */ - cpuinfo_vendor_arm = 3, - /** Qualcomm Incorporated. Vendor of ARM and ARM64 processor microarchitectures. */ - cpuinfo_vendor_qualcomm = 4, - /** Apple Inc. Vendor of ARM and ARM64 processor microarchitectures. */ - cpuinfo_vendor_apple = 5, - /** Samsung Electronics Co., Ltd. Vendor of ARM64 processor microarchitectures. */ - cpuinfo_vendor_samsung = 6, - /** Nvidia Corporation. Vendor of ARM64-compatible processor microarchitectures. */ - cpuinfo_vendor_nvidia = 7, - /** MIPS Technologies, Inc. Vendor of MIPS processor microarchitectures. */ - cpuinfo_vendor_mips = 8, - /** International Business Machines Corporation. Vendor of PowerPC processor microarchitectures. */ - cpuinfo_vendor_ibm = 9, - /** Ingenic Semiconductor. Vendor of MIPS processor microarchitectures. */ - cpuinfo_vendor_ingenic = 10, - /** - * VIA Technologies, Inc. Vendor of x86 and x86-64 processor microarchitectures. - * - * Processors are designed by Centaur Technology, a subsidiary of VIA Technologies. - */ - cpuinfo_vendor_via = 11, - /** Cavium, Inc. Vendor of ARM64 processor microarchitectures. */ - cpuinfo_vendor_cavium = 12, - /** Broadcom, Inc. Vendor of ARM processor microarchitectures. */ - cpuinfo_vendor_broadcom = 13, - /** Applied Micro Circuits Corporation (APM). Vendor of ARM64 processor microarchitectures. */ - cpuinfo_vendor_apm = 14, - /** - * Huawei Technologies Co., Ltd. Vendor of ARM64 processor microarchitectures. - * - * Processors are designed by HiSilicon, a subsidiary of Huawei. - */ - cpuinfo_vendor_huawei = 15, - - /* Active vendors of embedded CPUs */ - - /** Texas Instruments Inc. Vendor of ARM processor microarchitectures. */ - cpuinfo_vendor_texas_instruments = 30, - /** Marvell Technology Group Ltd. Vendor of ARM processor microarchitectures. */ - cpuinfo_vendor_marvell = 31, - /** RDC Semiconductor Co., Ltd. Vendor of x86 processor microarchitectures. */ - cpuinfo_vendor_rdc = 32, - /** DM&P Electronics Inc. Vendor of x86 processor microarchitectures. */ - cpuinfo_vendor_dmp = 33, - /** Motorola, Inc. Vendor of PowerPC and ARM processor microarchitectures. */ - cpuinfo_vendor_motorola = 34, - - /* Defunct CPU vendors */ - - /** - * Transmeta Corporation. Vendor of x86 processor microarchitectures. - * - * Now defunct. The last processor design was released in 2004. - * Transmeta processors implemented VLIW ISA and used binary translation to execute x86 code. - */ - cpuinfo_vendor_transmeta = 50, - /** - * Cyrix Corporation. Vendor of x86 processor microarchitectures. - * - * Now defunct. The last processor design was released in 1996. - */ - cpuinfo_vendor_cyrix = 51, - /** - * Rise Technology. Vendor of x86 processor microarchitectures. - * - * Now defunct. The last processor design was released in 1999. - */ - cpuinfo_vendor_rise = 52, - /** - * National Semiconductor. Vendor of x86 processor microarchitectures. - * - * Sold its x86 design subsidiary in 1999. The last processor design was released in 1998. - */ - cpuinfo_vendor_nsc = 53, - /** - * Silicon Integrated Systems. Vendor of x86 processor microarchitectures. - * - * Sold its x86 design subsidiary in 2001. The last processor design was released in 2001. - */ - cpuinfo_vendor_sis = 54, - /** - * NexGen. Vendor of x86 processor microarchitectures. - * - * Now defunct.
The last processor design was released in 1994. - * NexGen designed the first x86 microarchitecture which decomposed x86 instructions into simple microoperations. - */ - cpuinfo_vendor_nexgen = 55, - /** - * United Microelectronics Corporation. Vendor of x86 processor microarchitectures. - * - * Ceased x86 in the early 1990s. The last processor design was released in 1991. - * Designed U5C and U5D processors. Both are 486 level. - */ - cpuinfo_vendor_umc = 56, - /** - * Digital Equipment Corporation. Vendor of ARM processor microarchitecture. - * - * Sold its ARM designs in 1997. The last processor design was released in 1997. - */ - cpuinfo_vendor_dec = 57, -}; - -/** - * Processor microarchitecture - * - * Processors with different microarchitectures often have different instruction performance characteristics, - * and may have dramatically different pipeline organization. - */ -enum cpuinfo_uarch { - /** Microarchitecture is unknown, or the library failed to get information about the microarchitecture from OS */ - cpuinfo_uarch_unknown = 0, - - /** Pentium and Pentium MMX microarchitecture. */ - cpuinfo_uarch_p5 = 0x00100100, - /** Intel Quark microarchitecture. */ - cpuinfo_uarch_quark = 0x00100101, - - /** Pentium Pro, Pentium II, and Pentium III. */ - cpuinfo_uarch_p6 = 0x00100200, - /** Pentium M. */ - cpuinfo_uarch_dothan = 0x00100201, - /** Intel Core microarchitecture. */ - cpuinfo_uarch_yonah = 0x00100202, - /** Intel Core 2 microarchitecture on 65 nm process. */ - cpuinfo_uarch_conroe = 0x00100203, - /** Intel Core 2 microarchitecture on 45 nm process. */ - cpuinfo_uarch_penryn = 0x00100204, - /** Intel Nehalem and Westmere microarchitectures (Core i3/i5/i7 1st gen). */ - cpuinfo_uarch_nehalem = 0x00100205, - /** Intel Sandy Bridge microarchitecture (Core i3/i5/i7 2nd gen). */ - cpuinfo_uarch_sandy_bridge = 0x00100206, - /** Intel Ivy Bridge microarchitecture (Core i3/i5/i7 3rd gen). */ - cpuinfo_uarch_ivy_bridge = 0x00100207, - /** Intel Haswell microarchitecture (Core i3/i5/i7 4th gen). */ - cpuinfo_uarch_haswell = 0x00100208, - /** Intel Broadwell microarchitecture. */ - cpuinfo_uarch_broadwell = 0x00100209, - /** Intel Sky Lake microarchitecture. */ - cpuinfo_uarch_sky_lake = 0x0010020A, - /** Intel Kaby Lake microarchitecture. */ - cpuinfo_uarch_kaby_lake = 0x0010020B, - - /** Pentium 4 with Willamette, Northwood, or Foster cores. */ - cpuinfo_uarch_willamette = 0x00100300, - /** Pentium 4 with Prescott and later cores. */ - cpuinfo_uarch_prescott = 0x00100301, - - /** Intel Atom on 45 nm process. */ - cpuinfo_uarch_bonnell = 0x00100400, - /** Intel Atom on 32 nm process. */ - cpuinfo_uarch_saltwell = 0x00100401, - /** Intel Silvermont microarchitecture (22 nm out-of-order Atom). */ - cpuinfo_uarch_silvermont = 0x00100402, - /** Intel Airmont microarchitecture (14 nm out-of-order Atom). */ - cpuinfo_uarch_airmont = 0x00100403, - - /** Intel Knights Ferry HPC boards. */ - cpuinfo_uarch_knights_ferry = 0x00100500, - /** Intel Knights Corner HPC boards (aka Xeon Phi). */ - cpuinfo_uarch_knights_corner = 0x00100501, - /** Intel Knights Landing microarchitecture (second-gen MIC). */ - cpuinfo_uarch_knights_landing = 0x00100502, - /** Intel Knights Hill microarchitecture (third-gen MIC). */ - cpuinfo_uarch_knights_hill = 0x00100503, - /** Intel Knights Mill Xeon Phi. */ - cpuinfo_uarch_knights_mill = 0x00100504, - - /** Intel/Marvell XScale series. */ - cpuinfo_uarch_xscale = 0x00100600, - - /** AMD K5. */ - cpuinfo_uarch_k5 = 0x00200100, - /** AMD K6 and alike. 
*/ - cpuinfo_uarch_k6 = 0x00200101, - /** AMD Athlon and Duron. */ - cpuinfo_uarch_k7 = 0x00200102, - /** AMD Athlon 64, Opteron 64. */ - cpuinfo_uarch_k8 = 0x00200103, - /** AMD Family 10h (Barcelona, Istanbul, Magny-Cours). */ - cpuinfo_uarch_k10 = 0x00200104, - /** - * AMD Bulldozer microarchitecture - * Zambezi FX-series CPUs, Zurich, Valencia and Interlagos Opteron CPUs. - */ - cpuinfo_uarch_bulldozer = 0x00200105, - /** - * AMD Piledriver microarchitecture - * Vishera FX-series CPUs, Trinity and Richland APUs, Delhi, Seoul, Abu Dhabi Opteron CPUs. - */ - cpuinfo_uarch_piledriver = 0x00200106, - /** AMD Steamroller microarchitecture (Kaveri APUs). */ - cpuinfo_uarch_steamroller = 0x00200107, - /** AMD Excavator microarchitecture (Carrizo APUs). */ - cpuinfo_uarch_excavator = 0x00200108, - /** AMD Zen microarchitecture (Ryzen CPUs). */ - cpuinfo_uarch_zen = 0x00200109, - - /** NSC Geode and AMD Geode GX and LX. */ - cpuinfo_uarch_geode = 0x00200200, - /** AMD Bobcat mobile microarchitecture. */ - cpuinfo_uarch_bobcat = 0x00200201, - /** AMD Jaguar mobile microarchitecture. */ - cpuinfo_uarch_jaguar = 0x00200202, - /** AMD Puma mobile microarchitecture. */ - cpuinfo_uarch_puma = 0x00200203, - - /** ARM7 series. */ - cpuinfo_uarch_arm7 = 0x00300100, - /** ARM9 series. */ - cpuinfo_uarch_arm9 = 0x00300101, - /** ARM 1136, ARM 1156, ARM 1176, or ARM 11MPCore. */ - cpuinfo_uarch_arm11 = 0x00300102, - - /** ARM Cortex-A5. */ - cpuinfo_uarch_cortex_a5 = 0x00300205, - /** ARM Cortex-A7. */ - cpuinfo_uarch_cortex_a7 = 0x00300207, - /** ARM Cortex-A8. */ - cpuinfo_uarch_cortex_a8 = 0x00300208, - /** ARM Cortex-A9. */ - cpuinfo_uarch_cortex_a9 = 0x00300209, - /** ARM Cortex-A12. */ - cpuinfo_uarch_cortex_a12 = 0x00300212, - /** ARM Cortex-A15. */ - cpuinfo_uarch_cortex_a15 = 0x00300215, - /** ARM Cortex-A17. */ - cpuinfo_uarch_cortex_a17 = 0x00300217, - - /** ARM Cortex-A32. */ - cpuinfo_uarch_cortex_a32 = 0x00300332, - /** ARM Cortex-A35. */ - cpuinfo_uarch_cortex_a35 = 0x00300335, - /** ARM Cortex-A53. */ - cpuinfo_uarch_cortex_a53 = 0x00300353, - /** ARM Cortex-A55. */ - cpuinfo_uarch_cortex_a55 = 0x00300355, - /** ARM Cortex-A57. */ - cpuinfo_uarch_cortex_a57 = 0x00300357, - /** ARM Cortex-A72. */ - cpuinfo_uarch_cortex_a72 = 0x00300372, - /** ARM Cortex-A73. */ - cpuinfo_uarch_cortex_a73 = 0x00300373, - /** ARM Cortex-A75. */ - cpuinfo_uarch_cortex_a75 = 0x00300375, - /** ARM Cortex-A76. */ - cpuinfo_uarch_cortex_a76 = 0x00300376, - - /** Qualcomm Scorpion. */ - cpuinfo_uarch_scorpion = 0x00400100, - /** Qualcomm Krait. */ - cpuinfo_uarch_krait = 0x00400101, - /** Qualcomm Kryo. */ - cpuinfo_uarch_kryo = 0x00400102, - /** Qualcomm Falkor. */ - cpuinfo_uarch_falkor = 0x00400103, - /** Qualcomm Saphira. */ - cpuinfo_uarch_saphira = 0x00400104, - - /** Nvidia Denver. */ - cpuinfo_uarch_denver = 0x00500100, - /** Nvidia Denver 2. */ - cpuinfo_uarch_denver2 = 0x00500101, - /** Nvidia Carmel. */ - cpuinfo_uarch_carmel = 0x00500102, - - /** Samsung Mongoose M1 (Exynos 8890 big cores). */ - cpuinfo_uarch_mongoose_m1 = 0x00600100, - /** Samsung Mongoose M2 (Exynos 8895 big cores). */ - cpuinfo_uarch_mongoose_m2 = 0x00600101, - /** Samsung Meerkat M3 (Exynos 9810 big cores). */ - cpuinfo_uarch_meerkat_m3 = 0x00600102, - - /** Apple A6 and A6X processors. */ - cpuinfo_uarch_swift = 0x00700100, - /** Apple A7 processor. */ - cpuinfo_uarch_cyclone = 0x00700101, - /** Apple A8 and A8X processor. */ - cpuinfo_uarch_typhoon = 0x00700102, - /** Apple A9 and A9X processor.
*/ - cpuinfo_uarch_twister = 0x00700103, - /** Apple A10 and A10X processor. */ - cpuinfo_uarch_hurricane = 0x00700104, - /** Apple A11 processor (big cores). */ - cpuinfo_uarch_monsoon = 0x00700105, - /** Apple A11 processor (little cores). */ - cpuinfo_uarch_mistral = 0x00700106, - - /** Cavium ThunderX. */ - cpuinfo_uarch_thunderx = 0x00800100, - /** Cavium ThunderX2 (originally Broadcom Vulkan). */ - cpuinfo_uarch_thunderx2 = 0x00800200, - - /** Marvell PJ4. */ - cpuinfo_uarch_pj4 = 0x00900100, - - /** Broadcom Brahma B15. */ - cpuinfo_uarch_brahma_b15 = 0x00A00100, - /** Broadcom Brahma B53. */ - cpuinfo_uarch_brahma_b53 = 0x00A00101, - - /** Applied Micro X-Gene. */ - cpuinfo_uarch_xgene = 0x00B00100, -}; - -struct cpuinfo_processor { - /** SMT (hyperthread) ID within a core */ - uint32_t smt_id; - /** Core containing this logical processor */ - const struct cpuinfo_core* core; - /** Cluster of cores containing this logical processor */ - const struct cpuinfo_cluster* cluster; - /** Physical package containing this logical processor */ - const struct cpuinfo_package* package; -#if defined(__linux__) - /** - * Linux-specific ID for the logical processor: - * - Linux kernel exposes information about this logical processor in /sys/devices/system/cpu/cpu<linux_id>/ - * - Bit <linux_id> in the cpu_set_t identifies this logical processor - */ - int linux_id; -#endif -#if defined(_WIN32) - /** Windows-specific ID for the group containing the logical processor. */ - uint16_t windows_group_id; - /** - * Windows-specific ID of the logical processor within its group: - * - Bit <windows_processor_id> in the KAFFINITY mask identifies this logical processor within its group. - */ - uint16_t windows_processor_id; -#endif -#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - /** APIC ID (unique x86-specific ID of the logical processor) */ - uint32_t apic_id; -#endif - struct { - /** Level 1 instruction cache */ - const struct cpuinfo_cache* l1i; - /** Level 1 data cache */ - const struct cpuinfo_cache* l1d; - /** Level 2 unified or data cache */ - const struct cpuinfo_cache* l2; - /** Level 3 unified or data cache */ - const struct cpuinfo_cache* l3; - /** Level 4 unified or data cache */ - const struct cpuinfo_cache* l4; - } cache; -}; - -struct cpuinfo_core { - /** Index of the first logical processor on this core. */ - uint32_t processor_start; - /** Number of logical processors on this core */ - uint32_t processor_count; - /** Core ID within a package */ - uint32_t core_id; - /** Cluster containing this core */ - const struct cpuinfo_cluster* cluster; - /** Physical package containing this core.
*/ - const struct cpuinfo_package* package; - /** Vendor of the CPU microarchitecture for this core */ - enum cpuinfo_vendor vendor; - /** CPU microarchitecture for this core */ - enum cpuinfo_uarch uarch; -#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - /** Value of CPUID leaf 1 EAX register for this core */ - uint32_t cpuid; -#elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - /** Value of Main ID Register (MIDR) for this core */ - uint32_t midr; -#endif - /** Clock rate (non-Turbo) of the core, in Hz */ - uint64_t frequency; -}; - -struct cpuinfo_cluster { - /** Index of the first logical processor in the cluster */ - uint32_t processor_start; - /** Number of logical processors in the cluster */ - uint32_t processor_count; - /** Index of the first core in the cluster */ - uint32_t core_start; - /** Number of cores on the cluster */ - uint32_t core_count; - /** Cluster ID within a package */ - uint32_t cluster_id; - /** Physical package containing the cluster */ - const struct cpuinfo_package* package; - /** CPU microarchitecture vendor of the cores in the cluster */ - enum cpuinfo_vendor vendor; - /** CPU microarchitecture of the cores in the cluster */ - enum cpuinfo_uarch uarch; -#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - /** Value of CPUID leaf 1 EAX register of the cores in the cluster */ - uint32_t cpuid; -#elif CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - /** Value of Main ID Register (MIDR) of the cores in the cluster */ - uint32_t midr; -#endif - /** Clock rate (non-Turbo) of the cores in the cluster, in Hz */ - uint64_t frequency; -}; - -#define CPUINFO_PACKAGE_NAME_MAX 48 -#define CPUINFO_GPU_NAME_MAX 64 - -struct cpuinfo_package { - /** SoC or processor chip model name */ - char name[CPUINFO_PACKAGE_NAME_MAX]; -#if defined(__ANDROID__) || (defined(__APPLE__) && TARGET_OS_IPHONE) - /** Integrated GPU model name */ - char gpu_name[CPUINFO_GPU_NAME_MAX]; -#endif - /** Index of the first logical processor on this physical package */ - uint32_t processor_start; - /** Number of logical processors on this physical package */ - uint32_t processor_count; - /** Index of the first core on this physical package */ - uint32_t core_start; - /** Number of cores on this physical package */ - uint32_t core_count; - /** Index of the first cluster of cores on this physical package */ - uint32_t cluster_start; - /** Number of clusters of cores on this physical package */ - uint32_t cluster_count; -}; - -#ifdef __cplusplus -extern "C" { -#endif - -bool CPUINFO_ABI cpuinfo_initialize(void); - -void CPUINFO_ABI cpuinfo_deinitialize(void); - -#if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - /* This structure is not a part of stable API. Use cpuinfo_has_x86_* functions instead. 
*/ - struct cpuinfo_x86_isa { - #if CPUINFO_ARCH_X86 - bool rdtsc; - #endif - bool rdtscp; - bool rdpid; - bool sysenter; - #if CPUINFO_ARCH_X86 - bool syscall; - #endif - bool msr; - bool clzero; - bool clflush; - bool clflushopt; - bool mwait; - bool mwaitx; - #if CPUINFO_ARCH_X86 - bool emmx; - #endif - bool fxsave; - bool xsave; - #if CPUINFO_ARCH_X86 - bool fpu; - bool mmx; - bool mmx_plus; - #endif - bool three_d_now; - bool three_d_now_plus; - #if CPUINFO_ARCH_X86 - bool three_d_now_geode; - #endif - bool prefetch; - bool prefetchw; - bool prefetchwt1; - #if CPUINFO_ARCH_X86 - bool daz; - bool sse; - bool sse2; - #endif - bool sse3; - bool ssse3; - bool sse4_1; - bool sse4_2; - bool sse4a; - bool misaligned_sse; - bool avx; - bool fma3; - bool fma4; - bool xop; - bool f16c; - bool avx2; - bool avx512f; - bool avx512pf; - bool avx512er; - bool avx512cd; - bool avx512dq; - bool avx512bw; - bool avx512vl; - bool avx512ifma; - bool avx512vbmi; - bool avx512vbmi2; - bool avx512bitalg; - bool avx512vpopcntdq; - bool avx512vnni; - bool avx512_4vnniw; - bool avx512_4fmaps; - bool hle; - bool rtm; - bool xtest; - bool mpx; - #if CPUINFO_ARCH_X86 - bool cmov; - bool cmpxchg8b; - #endif - bool cmpxchg16b; - bool clwb; - bool movbe; - #if CPUINFO_ARCH_X86_64 - bool lahf_sahf; - #endif - bool fs_gs_base; - bool lzcnt; - bool popcnt; - bool tbm; - bool bmi; - bool bmi2; - bool adx; - bool aes; - bool vaes; - bool pclmulqdq; - bool vpclmulqdq; - bool gfni; - bool rdrand; - bool rdseed; - bool sha; - bool rng; - bool ace; - bool ace2; - bool phe; - bool pmm; - bool lwp; - }; - - extern struct cpuinfo_x86_isa cpuinfo_isa; -#endif - -static inline bool cpuinfo_has_x86_rdtsc(void) { - #if CPUINFO_ARCH_X86_64 - return true; - #elif CPUINFO_ARCH_X86 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.rdtsc; - #endif - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_rdtscp(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.rdtscp; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_rdpid(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.rdpid; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_clzero(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.clzero; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_mwait(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.mwait; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_mwaitx(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.mwaitx; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_fxsave(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.fxsave; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_xsave(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.xsave; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_fpu(void) { - #if CPUINFO_ARCH_X86_64 - return true; - #elif CPUINFO_ARCH_X86 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.fpu; - #endif - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_mmx(void) { - #if CPUINFO_ARCH_X86_64 - return true; - #elif CPUINFO_ARCH_X86 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.mmx; - #endif - #else - return false; - #endif -} - -static inline bool 
cpuinfo_has_x86_mmx_plus(void) { - #if CPUINFO_ARCH_X86_64 - return true; - #elif CPUINFO_ARCH_X86 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.mmx_plus; - #endif - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_3dnow(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.three_d_now; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_3dnow_plus(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.three_d_now_plus; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_3dnow_geode(void) { - #if CPUINFO_ARCH_X86_64 - return false; - #elif CPUINFO_ARCH_X86 - #if defined(__ANDROID__) - return false; - #else - return cpuinfo_isa.three_d_now_geode; - #endif - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_prefetch(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.prefetch; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_prefetchw(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.prefetchw; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_prefetchwt1(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.prefetchwt1; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_daz(void) { - #if CPUINFO_ARCH_X86_64 - return true; - #elif CPUINFO_ARCH_X86 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.daz; - #endif - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_sse(void) { - #if CPUINFO_ARCH_X86_64 - return true; - #elif CPUINFO_ARCH_X86 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.sse; - #endif - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_sse2(void) { - #if CPUINFO_ARCH_X86_64 - return true; - #elif CPUINFO_ARCH_X86 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.sse2; - #endif - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_sse3(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.sse3; - #endif - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_ssse3(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.ssse3; - #endif - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_sse4_1(void) { - #if CPUINFO_ARCH_X86_64 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.sse4_1; - #endif - #elif CPUINFO_ARCH_X86 - return cpuinfo_isa.sse4_1; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_sse4_2(void) { - #if CPUINFO_ARCH_X86_64 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.sse4_2; - #endif - #elif CPUINFO_ARCH_X86 - return cpuinfo_isa.sse4_2; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_sse4a(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.sse4a; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_misaligned_sse(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.misaligned_sse; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx; - #else - return false; - #endif -} - -static inline bool 
cpuinfo_has_x86_fma3(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.fma3; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_fma4(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.fma4; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_xop(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.xop; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_f16c(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.f16c; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx2(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx2; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512f(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512f; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512pf(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512pf; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512er(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512er; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512cd(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512cd; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512dq(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512dq; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512bw(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512bw; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512vl(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512vl; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512ifma(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512ifma; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512vbmi(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512vbmi; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512vbmi2(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512vbmi2; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512bitalg(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512bitalg; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512vpopcntdq(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512vpopcntdq; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512vnni(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512vnni; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512_4vnniw(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512_4vnniw; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_avx512_4fmaps(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.avx512_4fmaps; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_hle(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.hle; - #else - return false; - #endif -} - -static inline bool 
cpuinfo_has_x86_rtm(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.rtm; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_xtest(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.xtest; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_mpx(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.mpx; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_cmov(void) { - #if CPUINFO_ARCH_X86_64 - return true; - #elif CPUINFO_ARCH_X86 - return cpuinfo_isa.cmov; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_cmpxchg8b(void) { - #if CPUINFO_ARCH_X86_64 - return true; - #elif CPUINFO_ARCH_X86 - return cpuinfo_isa.cmpxchg8b; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_cmpxchg16b(void) { - #if CPUINFO_ARCH_X86_64 - return cpuinfo_isa.cmpxchg16b; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_clwb(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.clwb; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_movbe(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.movbe; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_lahf_sahf(void) { - #if CPUINFO_ARCH_X86 - return true; - #elif CPUINFO_ARCH_X86_64 - return cpuinfo_isa.lahf_sahf; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_lzcnt(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.lzcnt; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_popcnt(void) { - #if CPUINFO_ARCH_X86_64 - #if defined(__ANDROID__) - return true; - #else - return cpuinfo_isa.popcnt; - #endif - #elif CPUINFO_ARCH_X86 - return cpuinfo_isa.popcnt; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_tbm(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.tbm; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_bmi(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.bmi; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_bmi2(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.bmi2; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_adx(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.adx; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_aes(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.aes; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_vaes(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.vaes; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_pclmulqdq(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.pclmulqdq; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_vpclmulqdq(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.vpclmulqdq; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_gfni(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.gfni; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_rdrand(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.rdrand; - #else - return false; - #endif -} - -static inline bool 
cpuinfo_has_x86_rdseed(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.rdseed; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_x86_sha(void) { - #if CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 - return cpuinfo_isa.sha; - #else - return false; - #endif -} - -#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - /* This structure is not a part of stable API. Use cpuinfo_has_arm_* functions instead. */ - struct cpuinfo_arm_isa { - #if CPUINFO_ARCH_ARM - bool thumb; - bool thumb2; - bool thumbee; - bool jazelle; - bool armv5e; - bool armv6; - bool armv6k; - bool armv7; - bool armv7mp; - bool idiv; - - bool vfpv2; - bool vfpv3; - bool d32; - bool fp16; - bool fma; - - bool wmmx; - bool wmmx2; - bool neon; - #endif - #if CPUINFO_ARCH_ARM64 - bool atomics; - #endif - bool rdm; - bool fp16arith; - bool jscvt; - bool fcma; - - bool aes; - bool sha1; - bool sha2; - bool pmull; - bool crc32; - }; - - extern struct cpuinfo_arm_isa cpuinfo_isa; -#endif - -static inline bool cpuinfo_has_arm_thumb(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.thumb; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_thumb2(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.thumb2; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_v5e(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.armv5e; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_v6(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.armv6; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_v6k(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.armv6k; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_v7(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.armv7; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_v7mp(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.armv7mp; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_idiv(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.idiv; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_vfpv2(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.vfpv2; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_vfpv3(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.vfpv3; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_vfpv3_d32(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.vfpv3 && cpuinfo_isa.d32; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_vfpv3_fp16(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.vfpv3 && cpuinfo_isa.fp16; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_vfpv3_fp16_d32(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.vfpv3 && cpuinfo_isa.fp16 && cpuinfo_isa.d32; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_vfpv4(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.vfpv3 && cpuinfo_isa.fma; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_vfpv4_d32(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.vfpv3 && cpuinfo_isa.fma && cpuinfo_isa.d32; - #else - return false; - #endif -} - -static inline bool 
cpuinfo_has_arm_wmmx(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.wmmx; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_wmmx2(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.wmmx2; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_neon(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.neon; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_neon_fp16(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.neon && cpuinfo_isa.fp16; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_neon_fma(void) { - #if CPUINFO_ARCH_ARM64 - return true; - #elif CPUINFO_ARCH_ARM - return cpuinfo_isa.neon && cpuinfo_isa.fma; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_atomics(void) { - #if CPUINFO_ARCH_ARM64 - return cpuinfo_isa.atomics; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_neon_rdm(void) { - #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - return cpuinfo_isa.rdm; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_neon_fp16_arith(void) { - #if CPUINFO_ARCH_ARM - return cpuinfo_isa.neon && cpuinfo_isa.fp16arith; - #elif CPUINFO_ARCH_ARM64 - return cpuinfo_isa.fp16arith; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_fp16_arith(void) { - #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - return cpuinfo_isa.fp16arith; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_jscvt(void) { - #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - return cpuinfo_isa.jscvt; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_fcma(void) { - #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - return cpuinfo_isa.fcma; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_aes(void) { - #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - return cpuinfo_isa.aes; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_sha1(void) { - #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - return cpuinfo_isa.sha1; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_sha2(void) { - #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - return cpuinfo_isa.sha2; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_pmull(void) { - #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - return cpuinfo_isa.pmull; - #else - return false; - #endif -} - -static inline bool cpuinfo_has_arm_crc32(void) { - #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 - return cpuinfo_isa.crc32; - #else - return false; - #endif -} - -const struct cpuinfo_processor* CPUINFO_ABI cpuinfo_get_processors(void); -const struct cpuinfo_core* CPUINFO_ABI cpuinfo_get_cores(void); -const struct cpuinfo_cluster* CPUINFO_ABI cpuinfo_get_clusters(void); -const struct cpuinfo_package* CPUINFO_ABI cpuinfo_get_packages(void); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l1i_caches(void); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l1d_caches(void); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l2_caches(void); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l3_caches(void); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l4_caches(void); - -const struct cpuinfo_processor* CPUINFO_ABI cpuinfo_get_processor(uint32_t index); -const struct cpuinfo_core* CPUINFO_ABI cpuinfo_get_core(uint32_t index); -const struct cpuinfo_cluster* CPUINFO_ABI cpuinfo_get_cluster(uint32_t index); -const struct 
cpuinfo_package* CPUINFO_ABI cpuinfo_get_package(uint32_t index); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l1i_cache(uint32_t index); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l1d_cache(uint32_t index); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l2_cache(uint32_t index); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l3_cache(uint32_t index); -const struct cpuinfo_cache* CPUINFO_ABI cpuinfo_get_l4_cache(uint32_t index); - -uint32_t CPUINFO_ABI cpuinfo_get_processors_count(void); -uint32_t CPUINFO_ABI cpuinfo_get_cores_count(void); -uint32_t CPUINFO_ABI cpuinfo_get_clusters_count(void); -uint32_t CPUINFO_ABI cpuinfo_get_packages_count(void); -uint32_t CPUINFO_ABI cpuinfo_get_l1i_caches_count(void); -uint32_t CPUINFO_ABI cpuinfo_get_l1d_caches_count(void); -uint32_t CPUINFO_ABI cpuinfo_get_l2_caches_count(void); -uint32_t CPUINFO_ABI cpuinfo_get_l3_caches_count(void); -uint32_t CPUINFO_ABI cpuinfo_get_l4_caches_count(void); - -const struct cpuinfo_processor* CPUINFO_ABI cpuinfo_get_current_processor(void); -const struct cpuinfo_core* CPUINFO_ABI cpuinfo_get_current_core(void); - -// Patch to avoid "error: must use 'struct' tag to refer to type 'cpuinfo_processor'" -// in Nim code. Also we can't reuse the struct name for a typedef, -// as it would conflict with the extern struct in api.h -typedef struct cpuinfo_cache cpuinfo_cache_exported; -typedef struct cpuinfo_processor cpuinfo_processor_exported; -typedef struct cpuinfo_core cpuinfo_core_exported; -typedef struct cpuinfo_cluster cpuinfo_cluster_exported; -typedef struct cpuinfo_package cpuinfo_package_exported; - -#ifdef __cplusplus -} /* extern "C" */ -#endif - -#endif /* CPUINFO_H */ diff --git a/benchmarks/cpuinfo.nim b/benchmarks/cpuinfo.nim deleted file mode 100644 index 6411937..0000000 --- a/benchmarks/cpuinfo.nim +++ /dev/null @@ -1,371 +0,0 @@ -# Original cpuinfo.h header -# Copyright (c) 2017-2018 Facebook Inc. -# Copyright (C) 2012-2017 Georgia Institute of Technology -# Copyright (C) 2010-2012 Marat Dukhan -# -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above copyright notice, -# this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -# Nim wrapper - Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
-# This file may not be copied, modified, or distributed except according to those terms. - -from strutils import split, rsplit -from os import DirSep - -const - curSrcFolder = currentSourcePath.rsplit(DirSep, 1)[0] - cpuinfoPath = curSrcFolder & DirSep & "cpuinfo" & DirSep - # We use a patched header as the original one doesn't typedef its structs, - # which leads to "error: must use 'struct' tag to refer to type 'cpuinfo_processor'" - # headerPath = cpuinfoPath & DirSep & "include" & DirSep & "cpuinfo.h" - headerPath = curSrcFolder & DirSep & "cpuinfo.h" - -########################################### -############### Public API ################ - -{.pragma:cpuinfo_type, header: headerPath, bycopy.} - -# Check compiler-defined consts in: -# - https://github.com/nim-lang/Nim/blob/devel/compiler/platform.nim - -type - CPUInfo_cache* {.importc: "cpuinfo_cache_exported", cpuinfo_type.} = object - size* {.importc.}: uint32 - associativity* {.importc.}: uint32 - sets* {.importc.}: uint32 - partitions* {.importc.}: uint32 - line_size* {.importc.}: uint32 - flags* {.importc.}: uint32 - processor_start* {.importc.}: uint32 - processor_count* {.importc.}: uint32 - - ProcCache* {.bycopy.} = object - l1i*: ptr CPUInfo_cache - l1d*: ptr CPUInfo_cache - l2*: ptr CPUInfo_cache - l3*: ptr CPUInfo_cache - l4*: ptr CPUInfo_cache - - CPUInfo_processor* {.importc: "cpuinfo_processor_exported", cpuinfo_type.} = object - smt_id* {.importc.}: uint32 - core* {.importc.}: ptr CPUInfo_core - cluster* {.importc.}: ptr CPUInfo_cluster - package* {.importc.}: ptr CPUInfo_package - cache* {.importc.}: ptr ProcCache - - CPUInfo_core* {.importc: "cpuinfo_core_exported", cpuinfo_type.} = object - processor_start* {.importc.}: uint32 - processor_count* {.importc.}: uint32 - core_id* {.importc.}: uint32 - cluster* {.importc.}: ptr CPUInfo_cluster - package* {.importc.}: ptr CPUInfo_package - vendor* {.importc.}: CPUInfo_vendor - uarch* {.importc.}: CPUInfo_uarch - when defined(i386) or defined(amd64): - cpuid* {.importc.}: uint32 - when defined(arm) or defined(arm64): - midr* {.importc.}: uint32 - frequency* {.importc.}: uint64 - - CPUInfo_cluster* {.importc: "cpuinfo_cluster_exported", cpuinfo_type.} = object - processor_start* {.importc.}: uint32 - processor_count* {.importc.}: uint32 - core_start* {.importc.}: uint32 - core_count* {.importc.}: uint32 - cluster_id* {.importc.}: uint32 - package* {.importc.}: ptr CPUInfo_package - vendor* {.importc.}: CPUInfo_vendor - uarch* {.importc.}: CPUInfo_uarch - when defined(i386) or defined(amd64): - cpuid* {.importc.}: uint32 - when defined(arm) or defined(arm64): - midr* {.importc.}: uint32 - frequency* {.importc.}: uint64 - - CPUInfo_package* {.importc: "cpuinfo_package_exported", cpuinfo_type.} = object - name* {.importc.}: array[48, char] - when defined(android) or defined(ios): - # Make sure iOS is defined - https://github.com/nim-lang/Nim/issues/9369 - gpu_name* {.importc.}: array[64, char] - processor_start* {.importc.}: uint32 - processor_count* {.importc.}: uint32 - core_start* {.importc.}: uint32 - core_count* {.importc.}: uint32 - cluster_start* {.importc.}: uint32 - cluster_count* {.importc.}: uint32 - - CPUInfo_vendor* {.size: sizeof(cint).} = enum - ## * Processor vendor is not known to the library, or the library failed to get vendor information from the OS.
- cpuinfo_vendor_unknown = 0, - ## Active vendors of modern CPUs - cpuinfo_vendor_intel = 1, - cpuinfo_vendor_amd = 2, - cpuinfo_vendor_arm = 3, - cpuinfo_vendor_qualcomm = 4, - cpuinfo_vendor_apple = 5, - cpuinfo_vendor_samsung = 6, - cpuinfo_vendor_nvidia = 7, - cpuinfo_vendor_mips = 8, - cpuinfo_vendor_ibm = 9, - cpuinfo_vendor_ingenic = 10, - cpuinfo_vendor_via = 11, - cpuinfo_vendor_cavium = 12, - cpuinfo_vendor_broadcom = 13, - cpuinfo_vendor_apm = 14, # Applied Micro Circuits Corporation (APM) - cpuinfo_vendor_huawei = 15, - cpuinfo_vendor_texas_instruments = 30, - cpuinfo_vendor_marvell = 31, - cpuinfo_vendor_rdc = 32, # RDC Semiconductor Co. - cpuinfo_vendor_dmp = 33, # DM&P Electronics Inc. - cpuinfo_vendor_motorola = 34, - cpuinfo_vendor_transmeta = 50, - cpuinfo_vendor_cyrix = 51, - cpuinfo_vendor_rise = 52, - cpuinfo_vendor_nsc = 53, # National Semiconductor - cpuinfo_vendor_sis = 54, # Silicon Integrated Systems - cpuinfo_vendor_nexgen = 55, - cpuinfo_vendor_umc = 56, # United Microelectronics Corporation - cpuinfo_vendor_dec = 57 # Digital Equipment Corporation - - CPUInfo_uarch* {.size: sizeof(cint).} = enum - cpuinfo_uarch_unknown = 0, ## Microarchitecture is unknown, or the library failed to get information about the microarchitecture from OS - cpuinfo_uarch_p5 = 0x00100100, ## Pentium and Pentium MMX microarchitecture. - cpuinfo_uarch_quark = 0x00100101, ## Intel Quark microarchitecture. - cpuinfo_uarch_p6 = 0x00100200, ## Pentium Pro, Pentium II, and Pentium III. - cpuinfo_uarch_dothan = 0x00100201, ## Pentium M. - cpuinfo_uarch_yonah = 0x00100202, ## Intel Core microarchitecture. - cpuinfo_uarch_conroe = 0x00100203, ## Intel Core 2 microarchitecture on 65 nm process. - cpuinfo_uarch_penryn = 0x00100204, ## Intel Core 2 microarchitecture on 45 nm process. - cpuinfo_uarch_nehalem = 0x00100205, ## Intel Nehalem and Westmere microarchitectures (Core i3/i5/i7 1st gen). - cpuinfo_uarch_sandy_bridge = 0x00100206, ## Intel Sandy Bridge microarchitecture (Core i3/i5/i7 2nd gen). - cpuinfo_uarch_ivy_bridge = 0x00100207, ## Intel Ivy Bridge microarchitecture (Core i3/i5/i7 3rd gen). - cpuinfo_uarch_haswell = 0x00100208, ## Intel Haswell microarchitecture (Core i3/i5/i7 4th gen). - cpuinfo_uarch_broadwell = 0x00100209, ## Intel Broadwell microarchitecture. - cpuinfo_uarch_sky_lake = 0x0010020A, ## Intel Sky Lake microarchitecture. - cpuinfo_uarch_kaby_lake = 0x0010020B, ## Intel Kaby Lake microarchitecture. - cpuinfo_uarch_willamette = 0x00100300, ## Pentium 4 with Willamette, Northwood, or Foster cores. - cpuinfo_uarch_prescott = 0x00100301, ## Pentium 4 with Prescott and later cores. - cpuinfo_uarch_bonnell = 0x00100400, ## Intel Atom on 45 nm process. - cpuinfo_uarch_saltwell = 0x00100401, ## Intel Atom on 32 nm process. - cpuinfo_uarch_silvermont = 0x00100402, ## Intel Silvermont microarchitecture (22 nm out-of-order Atom). - cpuinfo_uarch_airmont = 0x00100403, ## Intel Airmont microarchitecture (14 nm out-of-order Atom). - cpuinfo_uarch_knights_ferry = 0x00100500, ## Intel Knights Ferry HPC boards. - cpuinfo_uarch_knights_corner = 0x00100501, ## Intel Knights Corner HPC boards (aka Xeon Phi). - cpuinfo_uarch_knights_landing = 0x00100502, ## Intel Knights Landing microarchitecture (second-gen MIC). - cpuinfo_uarch_knights_hill = 0x00100503, ## Intel Knights Hill microarchitecture (third-gen MIC). - cpuinfo_uarch_knights_mill = 0x00100504, ## Intel Knights Mill Xeon Phi. - cpuinfo_uarch_xscale = 0x00100600, ## Intel/Marvell XScale series. 
- cpuinfo_uarch_k5 = 0x00200100, ## AMD K5. - cpuinfo_uarch_k6 = 0x00200101, ## AMD K6 and alike. - cpuinfo_uarch_k7 = 0x00200102, ## AMD Athlon and Duron. - cpuinfo_uarch_k8 = 0x00200103, ## AMD Athlon 64, Opteron 64. - cpuinfo_uarch_k10 = 0x00200104, ## AMD Family 10h (Barcelona, Istanbul, Magny-Cours). - cpuinfo_uarch_bulldozer = 0x00200105, ## AMD Bulldozer microarchitecture. Zambezi FX-series CPUs, Zurich, Valencia and Interlagos Opteron CPUs. - cpuinfo_uarch_piledriver = 0x00200106, ## AMD Piledriver microarchitecture. Vishera FX-series CPUs, Trinity and Richland APUs, Delhi, Seoul, Abu Dhabi Opteron CPUs. - cpuinfo_uarch_steamroller = 0x00200107, ## AMD Steamroller microarchitecture (Kaveri APUs). - cpuinfo_uarch_excavator = 0x00200108, ## AMD Excavator microarchitecture (Carrizo APUs). - cpuinfo_uarch_zen = 0x00200109, ## AMD Zen microarchitecture (Ryzen CPUs). - cpuinfo_uarch_geode = 0x00200200, ## NSC Geode and AMD Geode GX and LX. - cpuinfo_uarch_bobcat = 0x00200201, ## AMD Bobcat mobile microarchitecture. - cpuinfo_uarch_jaguar = 0x00200202, ## AMD Jaguar mobile microarchitecture. - cpuinfo_uarch_puma = 0x00200203, ## AMD Puma mobile microarchitecture. - cpuinfo_uarch_arm7 = 0x00300100, ## ARM7 series. - cpuinfo_uarch_arm9 = 0x00300101, ## ARM9 series. - cpuinfo_uarch_arm11 = 0x00300102, ## ARM 1136, ARM 1156, ARM 1176, or ARM 11MPCore. - cpuinfo_uarch_cortex_a5 = 0x00300205, ## ARM Cortex-A5. - cpuinfo_uarch_cortex_a7 = 0x00300207, ## ARM Cortex-A7. - cpuinfo_uarch_cortex_a8 = 0x00300208, ## ARM Cortex-A8. - cpuinfo_uarch_cortex_a9 = 0x00300209, ## ARM Cortex-A9. - cpuinfo_uarch_cortex_a12 = 0x00300212, ## ARM Cortex-A12. - cpuinfo_uarch_cortex_a15 = 0x00300215, ## ARM Cortex-A15. - cpuinfo_uarch_cortex_a17 = 0x00300217, ## ARM Cortex-A17. - cpuinfo_uarch_cortex_a32 = 0x00300332, ## ARM Cortex-A32. - cpuinfo_uarch_cortex_a35 = 0x00300335, ## ARM Cortex-A35. - cpuinfo_uarch_cortex_a53 = 0x00300353, ## ARM Cortex-A53. - cpuinfo_uarch_cortex_a55 = 0x00300355, ## ARM Cortex-A55. - cpuinfo_uarch_cortex_a57 = 0x00300357, ## ARM Cortex-A57. - cpuinfo_uarch_cortex_a72 = 0x00300372, ## ARM Cortex-A72. - cpuinfo_uarch_cortex_a73 = 0x00300373, ## ARM Cortex-A73. - cpuinfo_uarch_cortex_a75 = 0x00300375, ## ARM Cortex-A75. - cpuinfo_uarch_cortex_a76 = 0x00300376, ## ARM Cortex-A76. - cpuinfo_uarch_scorpion = 0x00400100, ## Qualcomm Scorpion. - cpuinfo_uarch_krait = 0x00400101, ## Qualcomm Krait. - cpuinfo_uarch_kryo = 0x00400102, ## Qualcomm Kryo. - cpuinfo_uarch_falkor = 0x00400103, ## Qualcomm Falkor. - cpuinfo_uarch_saphira = 0x00400104, ## Qualcomm Saphira. - cpuinfo_uarch_denver = 0x00500100, ## Nvidia Denver. - cpuinfo_uarch_denver2 = 0x00500101, ## Nvidia Denver 2. - cpuinfo_uarch_carmel = 0x00500102, ## Nvidia Carmel. - cpuinfo_uarch_mongoose_m1 = 0x00600100, ## Samsung Mongoose M1 (Exynos 8890 big cores). - cpuinfo_uarch_mongoose_m2 = 0x00600101, ## Samsung Mongoose M2 (Exynos 8895 big cores). - cpuinfo_uarch_meerkat_m3 = 0x00600102, ## Samsung Meerkat M3 (Exynos 9810 big cores). - cpuinfo_uarch_swift = 0x00700100, ## Apple A6 and A6X processors. - cpuinfo_uarch_cyclone = 0x00700101, ## Apple A7 processor. - cpuinfo_uarch_typhoon = 0x00700102, ## Apple A8 and A8X processor. - cpuinfo_uarch_twister = 0x00700103, ## Apple A9 and A9X processor. - cpuinfo_uarch_hurricane = 0x00700104, ## Apple A10 and A10X processor. - cpuinfo_uarch_monsoon = 0x00700105, ## Apple A11 processor (big cores). - cpuinfo_uarch_mistral = 0x00700106, ## Apple A11 processor (little cores).
- cpuinfo_uarch_thunderx = 0x00800100, ## Cavium ThunderX. - cpuinfo_uarch_thunderx2 = 0x00800200, ## Cavium ThunderX2 (originally Broadcom Vulkan). - cpuinfo_uarch_pj4 = 0x00900100, ## Marvell PJ4. - cpuinfo_uarch_brahma_b15 = 0x00A00100, ## Broadcom Brahma B15. - cpuinfo_uarch_brahma_b53 = 0x00A00101, ## Broadcom Brahma B53. - cpuinfo_uarch_xgene = 0x00B00100 ## Applied Micro X-Gene. - -{.pragma: cpuinfo_proc, importc, header: headerPath, cdecl.} - -proc cpuinfo_initialize(): bool {.cpuinfo_proc.} -proc cpuinfo_deinitialize() {.cpuinfo_proc, noconv.} # noconv for addQuitProc - -proc cpuinfo_get_processors*(): ptr CPUInfo_processor {.cpuinfo_proc.} -proc cpuinfo_get_cores*(): ptr CPUInfo_core {.cpuinfo_proc.} -proc cpuinfo_get_clusters*(): ptr CPUInfo_cluster {.cpuinfo_proc.} -proc cpuinfo_get_packages*(): ptr CPUInfo_package {.cpuinfo_proc.} -proc cpuinfo_get_l1i_caches*(): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_l1d_caches*(): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_l2_caches*(): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_l3_caches*(): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_l4_caches*(): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_processor*(index: uint32): ptr CPUInfo_processor {.cpuinfo_proc.} -proc cpuinfo_get_core*(index: uint32): ptr CPUInfo_core {.cpuinfo_proc.} -proc cpuinfo_get_cluster*(index: uint32): ptr CPUInfo_cluster {.cpuinfo_proc.} -proc cpuinfo_get_package*(index: uint32): ptr CPUInfo_package {.cpuinfo_proc.} -proc cpuinfo_get_l1i_cache*(index: uint32): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_l1d_cache*(index: uint32): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_l2_cache*(index: uint32): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_l3_cache*(index: uint32): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_l4_cache*(index: uint32): ptr CPUInfo_cache {.cpuinfo_proc.} -proc cpuinfo_get_processors_count*(): uint32 {.cpuinfo_proc.} -proc cpuinfo_get_cores_count*(): uint32 {.cpuinfo_proc.} -proc cpuinfo_get_clusters_count*(): uint32 {.cpuinfo_proc.} -proc cpuinfo_get_packages_count*(): uint32 {.cpuinfo_proc.} -proc cpuinfo_get_l1i_caches_count*(): uint32 {.cpuinfo_proc.} -proc cpuinfo_get_l1d_caches_count*(): uint32 {.cpuinfo_proc.} -proc cpuinfo_get_l2_caches_count*(): uint32 {.cpuinfo_proc.} -proc cpuinfo_get_l3_caches_count*(): uint32 {.cpuinfo_proc.} -proc cpuinfo_get_l4_caches_count*(): uint32 {.cpuinfo_proc.} -proc cpuinfo_get_current_processor*(): ptr CPUInfo_processor {.cpuinfo_proc.} -proc cpuinfo_get_current_core*(): ptr CPUInfo_core {.cpuinfo_proc.} - -########################################### -################# C files ################# - -# clog dependency -{.passC: "-I" & cpuinfoPath & "deps/clog/include".} -{.compile: cpuinfoPath & "deps/clog/src/clog.c".} - -# Headers - use the patched header with typedefs -# Also for some reason we need to passC in the same line -# Otherwise "curSrcFolder" is ignored -{.passC: "-I" & cpuinfoPath & "src -I" & curSrcFolder.} - -when defined(linux): - {.passC: "-D_GNU_SOURCE".} - {.passL: "-lpthread".} - -template compile(path: static string): untyped = - # Path: the path from cpuinfo/src folder - const compiled_object = block: - var obj_name = "cpuinfo" - for subPath in path.split(DirSep): - obj_name &= "_" & subPath - obj_name &= ".o" - obj_name - # we need to use relative paths https://github.com/nim-lang/Nim/issues/9370 - {.compile:("./cpuinfo/src/" & path, compiled_object).} - -when defined(arm) or defined(arm64): - when 
defined(android): - compile"arm/android/gpu.c" - compile"arm/android/properties.c" - elif defined(linux): - compile"arm/linux/aarch32-isa.c" - compile"arm/linux/aarch64-isa.c" - compile"arm/linux/chipset.c" - compile"arm/linux/clusters.c" - compile"arm/linux/cpuinfo.c" - compile"arm/linux/hwcap.c" - compile"arm/linux/init.c" - compile"arm/linux/midr.c" - elif defined(iOS): # we don't support GNU Hurd ¯\_(ツ)_/¯ - compile"arm/mach/init.c" - # iOS GPU - # compile"gpu/gles-ios.m" # TODO: Obj-C compilation - compile"arm/cache.c" - compile"arm/tlb.c" - compile"arm/uarch.c" - # ARM GPU - compile"gpu/gles2.c" - -when defined(linux): - compile"linux/cpulist.c" - compile"linux/current.c" - compile"linux/gpu.c" - compile"linux/multiline.c" - compile"linux/processors.c" - compile"linux/smallfile.c" - -when defined(iOS) or defined(macos) or defined(macosx): # we don't support GNU Hurd ¯\_(ツ)_/¯ - compile"mach/topology.c" - -when defined(i386) or defined(amd64): - compile"x86/cache/descriptor.c" - compile"x86/cache/deterministic.c" - compile"x86/cache/init.c" - when defined(linux): - compile"x86/linux/cpuinfo.c" - compile"x86/linux/init.c" - elif defined(iOS) or defined(macos) or defined(macosx): - compile"x86/mach/init.c" - # compile"src/x86/nacl/isa.c" # TODO: NaCl support - compile"x86/info.c" - compile"x86/init.c" - compile"x86/isa.c" - compile"x86/name.c" - compile"x86/topology.c" - compile"x86/uarch.c" - compile"x86/vendor.c" - -compile"api.c" -compile"init.c" - -########################################### -################# Runtime ################# - -if not cpuinfo_initialize(): - raise newException(LibraryError, "Could not initialize the cpuinfo module") -addQuitProc(cpuinfo_deinitialize) - -{.pragma: cpuinfo, cdecl, header: headerPath.} -func cpuinfo_has_x86_sse*(): bool {.cpuinfo.} -func cpuinfo_has_x86_sse2*(): bool {.cpuinfo.} -func cpuinfo_has_x86_sse3*(): bool {.cpuinfo.} -func cpuinfo_has_x86_sse4_1*(): bool {.cpuinfo.} -func cpuinfo_has_x86_avx*(): bool {.cpuinfo.} -func cpuinfo_has_x86_avx2*(): bool {.cpuinfo.} -func cpuinfo_has_x86_avx512f*(): bool {.cpuinfo.} - -func cpuinfo_has_x86_fma3*(): bool {.cpuinfo.} diff --git a/benchmarks/matmul/README.md b/benchmarks/matmul/README.md deleted file mode 100644 index 0fc6f7e..0000000 --- a/benchmarks/matmul/README.md +++ /dev/null @@ -1,38 +0,0 @@ -# Benchmarks versus State-of-the-Art Matrix Multiplication - -These benchmarks evaluate threading performance versus state-of-the-art -implementations of matrix multiplication. - -Those implementations, commonly known as GEMM (GEneralized Matrix Multiplication), -are part of a BLAS library, BLAS being a standard interface for vector and matrix linear algebra. - -Common implementations include: -- OpenBLAS (Assembly) -- Intel MKL (Assembly) -- Apple Accelerate -- BLIS -- ARM Performance Libraries - -which are highly tuned to exploit CPU L1 and L2 caches, prefetching, -SIMD vectorization and core parallelism. - -Memory locality has a significant impact on performance, -and threading frameworks should minimize cache misses and memory footprint. - -Note that the common naive benchmarking of matrix multiplication -with a triple for-loop (sketched below) can be 100x slower than optimized implementations -on decently sized matrices. This performance gap grows at an N^3 rate, assuming -we multiply two NxN matrices. - -[Laser](https://github.com/numforge/laser) implements a state-of-the-art -GEMM that reaches performance similar to OpenBLAS in pure Nim; parallelism -is obtained through OpenMP. 
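For reference, the naive triple-for-loop baseline mentioned above looks roughly like the following sketch (illustrative only; `naiveGemm` is a hypothetical name, not part of the benchmark code):

```nim
# Textbook O(M*N*K) matrix multiplication over row-major buffers.
# This is the baseline that tuned GEMM implementations beat by ~100x.
proc naiveGemm[T: SomeNumber](M, N, K: int; A, B: seq[T]; C: var seq[T]) =
  for i in 0 ..< M:            # A is M x K
    for j in 0 ..< N:          # B is K x N
      var acc = T(0)
      for k in 0 ..< K:
        acc += A[i*K + k] * B[k*N + j]
      C[i*N + j] = acc         # C is M x N
```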
- -We reuse the Laser implementation and switch the backend to Weave's scheduler -to measure overhead on a high-performance computing workload. - -Laser code as of Aug. 24, 2019 (https://github.com/numforge/laser/tree/af191c086b4a98c49049ecf18f5519dc6856cc77) - -`laser_gemm_backend` contains backend routines with no parallelism. - -`laser_utils` contains routines that are used across all Laser primitives. diff --git a/benchmarks/matmul/laser_gemm/gemm.nim b/benchmarks/matmul/laser_gemm/gemm.nim deleted file mode 100644 index a761120..0000000 --- a/benchmarks/matmul/laser_gemm/gemm.nim +++ /dev/null @@ -1,511 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. - -import - ../../cpuinfo, - ../laser_utils/[compiler_optim_hints, openmp], - ../laser_gemm_backend/[ - gemm_tiling, gemm_utils, - gemm_ukernel_dispatch - ], - ./gemm_packing - -withCompilerOptimHints() - -# ############################################################ - # - # Optimized GEMM (Generalized Matrix-Multiplication) - # - # ############################################################ - -# Features -# - Arbitrary stride support -# - Efficient implementation (within 90% of the speed of OpenBLAS, more tuning expected) -# - Parallel, scaling linearly with the number of cores -# -# Future -# - Implementation extended to integers -# - ARM Neon optimisation -# - Small matrix multiply optimisation -# - Pre-packing when computing repeatedly with the same matrix -# - Batched matrix multiplication - -# Terminology -# - M, Matrix: Both dimensions are large or unknown -# - P, Panel: one of the dimensions is small -# - B, Block: both dimensions are small -# -# - GEMM: GEneralized Matrix-Matrix multiplication -# - GEPP: GEneralized Panel-Panel multiplication -# - GEBP: GEneralized Block-Panel multiplication (macrokernel) -# - GEBB: GEneralized Block-Block multiplication (microkernel) -# ... - -# ############################################################ -# -# GEBP Macrokernel -# -# ############################################################ - -proc gebp_mkernel*[T; ukernel: static MicroKernel]( - mc, nc, kc: int, - alpha: T, packA, packB: ptr UncheckedArray[T], - beta: T, - mcncC: MatrixView[T] - ) = - ## Macro kernel, multiplies: - ## - a block A[mc, kc] by a panel B[kc, N] - - # Since nr is small, this is a good place to parallelize - # See: Anatomy of High-Performance Many-Threaded Matrix Multiplication - # Smith et al - # - http://www.cs.utexas.edu/users/flame/pubs/blis3_ipdps14.pdf - - # ⚠ We need to ensure that loop variables and pointers - # are private to each thread - - # Nim doesn't support arbitrary increment with OpenMP - # So we store indexing/edge case data in tiles - const - MR = ukernel.extract_mr - NR = ukernel.extract_nr - PT = ukernel.extract_pt - - # ##################################### - # 4. for jr = 0,...,nc−1 in steps of nr - for jr in `||`(0, nc-1, NR, "taskloop"): - let nr = min(nc - jr, NR) # C[ic:ic+mc, jc+jr:jc+jr+nr] - - # ################################### - # 5. 
for ir = 0,...,m−1 in steps of mr - for ir in countup(0, mc-1, MR): - let mr = min(mc - ir, MR) - let c_aux = mcncC.stride(ir, jr) # C[ic+ir:ic+ir+mr, jc+jr:jc+jr+nr] - - let upanel_b = packB + jr*kc - prefetch(upanel_b, Read, ModerateTemporalLocality) - let upanel_a = packA + ir*kc - prefetch(upanel_a, Read, ModerateTemporalLocality) - - if nr == NR and mr == MR: - # General case - gebb_ukernel[T, ukernel]( # GEBB microkernel + epilogue - kc, # C[ic+ir:ic+ir+mr, jc+jr:jc+jr+nr] = - alpha, upanel_a, upanel_b, # αA[ic+ir:ic+ir+mr, pc:pc+kc] * - beta, c_aux # B[pc:pc+kc, jc+jr:jc+jr+nr] + - ) # βC[ic:ic+mc, jc:jc+nc] - else: - # Matrix edges - gebb_ukernel_edge[T, ukernel]( # GEBB microkernel + epilogue - mr, nr, kc, # C[ic+ir:ic+ir+mr, jc+jr:jc+jr+nr] = - alpha, upanel_a, upanel_b, # αA[ic+ir:ic+ir+mr, pc:pc+kc] * - beta, c_aux # B[pc:pc+kc, jc+jr:jc+jr+nr] + - ) # βC[ic:ic+mc, jc:jc+nc] - -# ########################################################################################### -# -# GEMM Internal Implementation -# -# ########################################################################################### - -proc gemm_impl[T; ukernel: static MicroKernel]( - M, N, K: int, - alpha: T, vA: MatrixView[T], vB: MatrixView[T], - beta: T, vC: MatrixView[T], - tiles: Tiles[T] - ) = - - # #################################################################### - # Loop partitioning - # - We parallelize around ic loop (partitions M dimension) - # - and jr loop (partitions N dimension) - # - # Currently the first loop nc = N is not partitioned. - # According to BLIS paper, it should be partitioned at socket level. - # This can be done with OpenMP using - # - # omp_set_nested(1); - # n_sockets = omp_get_num_places(); - # #pragma omp parallel num_threads(n_sockets) proc_bind(spread) - # { - # n_procs = omp_get_place_num_procs(omp_get_num_places()); - # #pragma omp parallel num_threads(n_procs) proc_bind(close) - # doStuff(); - # } - - # Hyperthreading will pollute the L1, L2 caches and the TLB - # as we intentionally choose parameters so that about - # half of the core caches is taken by micropanels of A and B. - # But somehow fixing num_threads to anything other than my number of logical threads - # kills my perf (and even also OpenBLAS when it's run at the same time) - - const PT = ukernel.extract_pt - let parallelize = M*N*K > PT*PT*PT - # let nb_threads = cpuinfo_get_cores_count() # get physical cores - - # #################################################################### - # 1. for jc = 0,...,n−1 in steps of nc - let nc = N # B[0:K, jc:jc+nc] - # C[0:M, jc:jc+nc] - # ###################################### - # 2. for pc = 0,...,k−1 in steps of kc - for pc in countup(0, K-1, tiles.kc): - prefetch(tiles.b, Write, LowTemporalLocality) - let kc = min(K - pc, tiles.kc) # Deal with edges # A[0:M, pc:pc+kc] - - let kcncB = vB.stride(pc, 0) # B[pc:pc+kc, jc:jc+nc] - pack_B_kc_nc[T, ukernel](tiles.b, kc, nc, kcncB) # PackB panel [kc, nc] (nc is large or unknown) - - # First time writing to C, we scale it, otherwise accumulate - let beta = if pc == 0: beta else: 1.T - - omp_parallel_if(parallelize): - # #################################### - # 3. 
for ic = 0,...,m−1 in steps of mc - omp_for(icb, tiles.ic_num_tasks, use_simd=false, nowait=true): - let packA = tiles.a + icb * tiles.upanelA_size - prefetch(packA, Write, LowTemporalLocality) - let ic = icb * tiles.mc - let mc = min(M-ic, tiles.mc) # C[ic:ic+mc, jc:jc+nc] - - let mckcA = vA.stride(ic, pc) # A[ic:ic+mc, pc:pc+kc] - pack_A_mc_kc[T, ukernel](packA, mc, kc, mckcA) # PackA block [mc, kc] - - gebp_mkernel[T, ukernel]( # GEBP macrokernel: - mc, nc, kc, # C[ic:ic+mc, jc:jc+nc] = - alpha, packA, tiles.b, # αA[ic:ic+mc, pc:pc+kc] * B[pc:pc+kc, jc:jc+nc] + - beta, vC.stride(ic, 0) # βC[ic:ic+mc, jc:jc+nc] - ) - -# ############################################################ -# -# Exported function and dispatch with CPU runtime detection -# -# ############################################################ - -proc gemm_strided*[T: SomeNumber]( - M, N, K: int, - alpha: T, - A: ptr T, - rowStrideA, colStrideA: int, - B: ptr T, - rowStrideB, colStrideB: int, - beta: T, - C: ptr T, - rowStrideC, colStrideC: int) = - - # TODO: shortcut alpha = 0 or K = 0 - # TODO: elementwise epilogue fusion like relu/tanh/sigmoid - # TODO: shortcut for small gemm - - # Create a view to abstract deling with strides - # and passing those in each proc - let vA = A.toMatrixView(rowStrideA, colStrideA) - let vB = B.toMatrixView(rowStrideB, colStrideB) - let vC = C.toMatrixView(rowStrideC, colStrideC) - - # Cache hierarchy: - # - block C: mr*nr registers - # - block B: kc*nr L1 cache - # - block A: mc*kc L2 cache - # - panel B: kc*nc L3 cache - - template dispatch(cpu_features: static CPUFeatureX86): untyped{.dirty.} = - template apply(ukernel: MicroKernel): untyped {.dirty.} = - let tiles = ukernel.newTiles(T, M, N, K) - gemm_impl[T, ukernel]( - M, N, K, - alpha, vA, vB, - beta, vC, - tiles - ) - return - if colStrideC == 1: - const ukernel = cpu_features.x86_ukernel(T, true) - apply(ukernel) - else: - const ukernel = cpu_features.x86_ukernel(T, false) - apply(ukernel) - - when defined(i386) or defined(amd64): - when T is float32: - if cpuinfo_has_x86_avx512f(): dispatch(x86_AVX512) - elif cpuinfo_has_x86_fma3(): dispatch(x86_AVX_FMA) - elif cpuinfo_has_x86_avx(): dispatch(x86_AVX) - elif cpuinfo_has_x86_sse(): dispatch(x86_SSE) - elif T is float64: - if cpuinfo_has_x86_avx512f(): dispatch(x86_AVX512) - elif cpuinfo_has_x86_fma3(): dispatch(x86_AVX_FMA) - elif cpuinfo_has_x86_avx(): dispatch(x86_AVX) - elif cpuinfo_has_x86_sse2(): dispatch(x86_SSE2) - elif T is int32 or T is uint32: - if cpuinfo_has_x86_avx512f(): dispatch(x86_AVX512) - elif cpuinfo_has_x86_avx2(): dispatch(x86_AVX2) - elif cpuinfo_has_x86_sse41(): dispatch(x86_SSE4_1) - elif cpuinfo_has_x86_sse2(): dispatch(x86_SSE2) - elif T is int64: - if cpuinfo_has_x86_avx512f(): dispatch(x86_AVX512) - elif cpuinfo_has_x86_sse2(): dispatch(x86_SSE2) - dispatch(x86_Generic) - -# ############################################################ -# -# Private tests -# -# ############################################################ - -when isMainModule: - # Tests - block: - let a = [[1.0, 2, 3], - [1.0, 1, 1], - [1.0, 1, 1]] - - let b = [[1.0, 1], - [1.0, 1], - [1.0, 1]] - - let ab = [[6.0, 6], - [3.0, 3], - [3.0, 3]] - - var res_ab: array[3, array[2, float]] - gemm_strided( - 3, 2, 3, - 1.0, a[0][0].unsafeAddr, 3, 1, - b[0][0].unsafeAddr, 2, 1, - 0.0, res_ab[0][0].addr, 2, 1 - ) - - # echo "expected: ", ab - # echo "result: ", res_ab - - doAssert res_ab == ab, $res_ab - echo "SUCCESS\n" - - block: - let a = [[1.0, 2, 3], - [4.0, 5, 6], - [7.0, 8, 9]] - - let b = 
[[1.0, 1], - [1.0, 1], - [1.0, 1]] - - let ab = [[ 6.0, 6], - [15.0, 15], - [24.0, 24]] - - var res_ab: array[3, array[2, float]] - gemm_strided( - 3, 2, 3, - 1.0, a[0][0].unsafeAddr, 3, 1, - b[0][0].unsafeAddr, 2, 1, - 0.0, res_ab[0][0].addr, 2, 1 - ) - - # echo "expected: ", ab - # echo "result: ", res_ab - - doAssert res_ab == ab, $res_ab - echo "SUCCESS\n" - - block: - let a = [[1.0,2,3], - [4.0,5,6]] - - let b = [[7.0, 8], - [9.0, 10], - [11.0,12]] - - let ab = [[ 58.0, 64], - [139.0,154]] - - var res_ab: array[2, array[2, float]] - gemm_strided( - 2, 2, 3, - 1.0, a[0][0].unsafeAddr, 3, 1, - b[0][0].unsafeAddr, 2, 1, - 0.0, res_ab[0][0].addr, 2, 1 - ) - - # echo "expected: ", ab - # echo "result: ", res_ab - - doAssert res_ab == ab, $res_ab - echo "SUCCESS\n" - - block: - # example from http://www.intmath.com/matrices-determinants/matrix-multiplication-examples.php - echo "\n## (M x K) * (K x N) with M < N" - let a = [[-2,-3,-1], - [ 3, 0, 4]] - let b = [[ 1, 5, 2,-1], - [-3, 0, 3, 4], - [ 6,-2, 7,-4]] - - let ab = [[ 1,-8,-20, -6], - [27, 7, 34,-19]] - - var res_ab: array[2, array[4, int]] - gemm_strided( - 2, 4, 3, - 1, a[0][0].unsafeAddr, 3, 1, - b[0][0].unsafeAddr, 4, 1, - 0, res_ab[0][0].addr, 4, 1 - ) - - # echo "expected: ", ab - # echo "result: ", res_ab - - doAssert res_ab == ab, $res_ab - echo "SUCCESS\n" - - block: - # from http://www.calcul.com/show/calculator/matrix-multiplication_;5;5;5;5?matrix1=[[%225%22,%226%22,%225%22,%228%22],[%228%22,%222%22,%228%22,%228%22],[%220%22,%225%22,%224%22,%220%22],[%224%22,%220%22,%225%22,%226%22],[%224%22,%225%22,%220%22,%223%22]]&matrix2=[[%225%22,%223%22,%226%22,%220%22],[%225%22,%222%22,%223%22,%223%22],[%228%22,%228%22,%222%22,%220%22],[%227%22,%227%22,%220%22,%220%22]]&operator=* - echo "\n## (M x K) * (K x N) with M > N and M > block-size (4x4)" - let a = [[5,6,5,8], - [8,2,8,8], - [0,5,4,0], - [4,0,5,6], - [4,5,0,3]] - let b = [[5,3,6,0], - [5,2,3,3], - [8,8,2,0], - [7,7,0,0]] - - let ab = [[151,123,58,18], - [170,148,70, 6], - [ 57, 42,23,15], - [102, 94,34, 0], - [ 66, 43,39,15]] - - var res_ab: array[5, array[4, int]] - gemm_strided( - 5, 4, 4, - 1, a[0][0].unsafeAddr, 4, 1, - b[0][0].unsafeAddr, 4, 1, - 0, res_ab[0][0].addr, 4, 1 - ) - - # echo "expected: ", ab - # echo "result: ", res_ab - - doAssert res_ab == ab, $res_ab - echo "SUCCESS\n" - - block: - let a = [[2, 4, 3, 1, 3, 1, 3, 1], - [4, 3, 2, 4, 1, 0, 0, 0]] - - - let b = [[2, 2], - [2, 1], - [0, 3], - [0, 1], - [0, 2], - [4, 3], - [3, 3], - [2, 1]] - - let ab = [[27,37], - [14,23]] - - var res_ab: array[2, array[2, int]] - gemm_strided( - 2, 2, 8, - 1, a[0][0].unsafeAddr, 8, 1, - b[0][0].unsafeAddr, 2, 1, - 0, res_ab[0][0].addr, 2, 1 - ) - - # echo "expected: ", ab - # echo "result: ", res_ab - - doAssert res_ab == ab, $res_ab - echo "SUCCESS\n" - - block: - let a = [[2, 1], - [1, 3], - [2, 1], - [1, 0], - [3, 4], - [2, 4], - [3, 1], - [4, 0]] - - - let b = [[2, 2, 0, 4, 0, 0, 4, 2], - [2, 1, 2, 1, 2, 4, 4, 1]] - - let ab = [[ 6, 5, 2, 9, 2, 4, 12, 5], - [ 8, 5, 6, 7, 6, 12, 16, 5], - [ 6, 5, 2, 9, 2, 4, 12, 5], - [ 2, 2, 0, 4, 0, 0, 4, 2], - [14, 10, 8, 16, 8, 16, 28, 10], - [12, 8, 8, 12, 8, 16, 24, 8], - [ 8, 7, 2, 13, 2, 4, 16, 7], - [ 8, 8, 0, 16, 0, 0, 16, 8]] - - var res_ab: array[8, array[8, int]] - gemm_strided( - 8, 8, 2, - 1, a[0][0].unsafeAddr, 2, 1, - b[0][0].unsafeAddr, 8, 1, - 0, res_ab[0][0].addr, 8, 1 - ) - - # echo "expected: ", ab - # echo "result: ", res_ab - - doAssert res_ab == ab, $res_ab - echo "SUCCESS\n" - - block: - # from 
http://www.calcul.com/show/calculator/matrix-multiplication?matrix1=[[%222%22,%224%22,%223%22,%221%22,%223%22,%221%22,%223%22,%221%22],[%221%22,%222%22,%221%22,%221%22,%222%22,%220%22,%224%22,%223%22],[%222%22,%220%22,%220%22,%223%22,%220%22,%224%22,%224%22,%221%22],[%221%22,%221%22,%224%22,%220%22,%223%22,%221%22,%223%22,%220%22],[%223%22,%224%22,%221%22,%221%22,%224%22,%222%22,%223%22,%224%22],[%222%22,%224%22,%220%22,%222%22,%223%22,%223%22,%223%22,%224%22],[%223%22,%220%22,%220%22,%223%22,%221%22,%224%22,%223%22,%221%22],[%224%22,%223%22,%222%22,%224%22,%221%22,%220%22,%220%22,%220%22]]&matrix2=[[%222%22,%222%22,%220%22,%224%22,%220%22,%220%22,%224%22,%222%22],[%222%22,%220%22,%220%22,%221%22,%221%22,%221%22,%223%22,%221%22],[%220%22,%222%22,%222%22,%220%22,%222%22,%222%22,%223%22,%223%22],[%220%22,%220%22,%221%22,%220%22,%224%22,%222%22,%224%22,%221%22],[%220%22,%220%22,%221%22,%223%22,%224%22,%222%22,%224%22,%222%22],[%224%22,%223%22,%224%22,%221%22,%224%22,%224%22,%220%22,%223%22],[%223%22,%223%22,%220%22,%222%22,%221%22,%222%22,%223%22,%223%22],[%222%22,%221%22,%222%22,%221%22,%222%22,%224%22,%224%22,%221%22]]&operator=* - echo "\n## (N x N) * (N x N) with N multiple of block size" - - let a = [[2, 4, 3, 1, 3, 1, 3, 1], - [1, 2, 1, 1, 2, 0, 4, 3], - [2, 0, 0, 3, 0, 4, 4, 1], - [1, 1, 4, 0, 3, 1, 3, 0], - [3, 4, 1, 1, 4, 2, 3, 4], - [2, 4, 0, 2, 3, 3, 3, 4], - [3, 0, 0, 3, 1, 4, 3, 1], - [4, 3, 2, 4, 1, 0, 0, 0]] - - - let b = [[2, 2, 0, 4, 0, 0, 4, 2], - [2, 0, 0, 1, 1, 1, 3, 1], - [0, 2, 2, 0, 2, 2, 3, 3], - [0, 0, 1, 0, 4, 2, 4, 1], - [0, 0, 1, 3, 4, 2, 4, 2], - [4, 3, 4, 1, 4, 4, 0, 3], - [3, 3, 0, 2, 1, 2, 3, 3], - [2, 1, 2, 1, 2, 4, 4, 1]] - - let ab = [[27,23,16,29,35,32,58,37], - [24,19,11,23,26,30,49,27], - [34,29,21,21,34,34,36,32], - [17,22,15,21,28,25,40,33], - [39,27,23,40,45,46,72,41], - [41,26,25,34,47,48,65,38], - [33,28,22,26,37,34,41,33], - [14,12, 9,22,27,17,51,23]] - - var res_ab: array[8, array[8, int]] - gemm_strided( - 8, 8, 8, - 1, a[0][0].unsafeAddr, 8, 1, - b[0][0].unsafeAddr, 8, 1, - 0, res_ab[0][0].addr, 8, 1 - ) - - # echo "expected: ", ab - # echo "result: ", res_ab - - doAssert res_ab == ab, $res_ab - echo "SUCCESS\n" diff --git a/benchmarks/matmul/laser_gemm/gemm_packing.nim b/benchmarks/matmul/laser_gemm/gemm_packing.nim deleted file mode 100644 index ceed386..0000000 --- a/benchmarks/matmul/laser_gemm/gemm_packing.nim +++ /dev/null @@ -1,94 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. - -# Due to issue with "static MicroKernel" as parameter -# as of 0.19.9 we pass it as a generic param -# - 1. undeclared identifier mr/nr, when accessing ukernel -# - 2. object constructor needs and object type when workaround first issue with macro - -import - ../laser_utils/compiler_optim_hints, - ../laser_utils/align_unroller, - ../laser_gemm_backend/[gemm_utils, gemm_tiling] - -withCompilerOptimHints() - -# ############################################################ -# -# Packing A -# -# ############################################################ - -proc pack_A_mc_kc*[T; ukernel: static MicroKernel]( - packedA: ptr UncheckedArray[T], - mc, kc: int, - A: MatrixView[T]) = - ## Packs panel [kc, mc] into buffer à (size ~half-L2 cache) - ## Pads if needed - ## Note that A is of shape [M, K] so it is transposed. 
- ## - ## Concretely the outer dimension of packed matrices - ## is k so that C[i, j] = A[i, k] * B[k, j] - ## does not require strided access - let buffer{.restrict.} = assume_aligned packedA - const MR = ukernel.extract_mr() - let unroll_stop = mc.round_step_down(MR) - - # 1. Pack m matrices of size kc*mr, m = mc/mr - {.emit:""" - for (int i = 0; i < `unroll_stop`; i+=`MR`) - for (int k = 0; k < `kc`; k++) - for (int ii = 0; ii < `MR`; ii++) - `buffer`[i*`kc`+k*`MR`+ii] = `A`.buffer[(i+ii)*`A`.rowStride + k*`A`.colStride]; - """.} - - # 2. Process the tail - let remainder = mc - unroll_stop - if remainder > 0: - let offBuf = buffer + kc*unroll_stop - for k in 0 ..< kc: - for i in 0 ..< remainder: - offBuf[k*MR + i] = A[unroll_stop+i, k] - for i in remainder ..< MR: # Pad with 0 if packing over the edge - offBuf[k*MR + i] = 0.T - -# ############################################################ -# -# Packing B -# -# ############################################################ - -proc pack_B_kc_nc*[T; ukernel: static MicroKernel]( - packedB: ptr UncheckedArray[T], - kc, nc: int, - B: MatrixView[T]) = - ## Packs panel [kc, nc] for ~B (half-L1 cache) - ## Pads if needed - ## - ## Concretely the outer dimension of packed matrices - ## is k so that C[i, j] = A[i, k] * B[k, j] - ## does not require strided access - let buffer{.restrict.} = assume_aligned packedB - const NR = ukernel.extract_nr() - let unroll_stop = nc.round_step_down(NR) - - # 1. Pack n matrices of size kc*nr, n = nc/nr - {.emit:""" - #pragma omp parallel for - for (int j = 0; j < `unroll_stop`; j+=`NR`) - for (int k = 0; k < `kc`; k++) - for (int jj = 0; jj < `NR`; jj++) - `buffer`[j*`kc`+k*`NR`+jj] = `B`.buffer[k*`B`.rowStride + (j+jj)*`B`.colStride]; - """.} - - # 2. Process the tail - let remainder = nc - unroll_stop - if remainder > 0: - let offBuf = buffer + kc*unroll_stop - for k in 0 ..< kc: - for j in 0 ..< remainder: - offBuf[k*NR + j] = B[k, unroll_stop+j] - for j in remainder ..< NR: # Pad with 0 if packing over the edge - offBuf[k*NR + j] = 0.T diff --git a/benchmarks/matmul/laser_gemm/gemm_prepacked.nim b/benchmarks/matmul/laser_gemm/gemm_prepacked.nim deleted file mode 100644 index 65850bd..0000000 --- a/benchmarks/matmul/laser_gemm/gemm_prepacked.nim +++ /dev/null @@ -1,525 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. - -import - ../../cpuinfo, - ../laser_utils/[compiler_optim_hints, openmp, align_unroller], - ../laser_utils/laser_gemm_backend/[ - gemm_tiling, gemm_utils, gemm_packing, - gemm_ukernel_dispatch, gemm - ] - -withCompilerOptimHints() - -# ############################################################ -# -# GEMM Prepacked Matrices A and B -# -# ############################################################ - -template dispatch( - return_void: static bool, - func_call: untyped): untyped {.dirty.} = - ## Warning: statements after dispatch are unreachable - template dispatch_opt(cpu_features: static CPUFeatureX86): untyped {.dirty.} = - ## Dispatch depending on detected CPU features. 
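 - # For example (informative note, assuming x86-64): on a CPU with FMA3 - # such as Haswell, the float32 ladder below selects dispatch_opt(x86_AVX_FMA) - # and instantiates the call with the AVX+FMA micro-kernel; on an SSE-only - # CPU it falls back to dispatch_opt(x86_SSE), and anything else lands on - # dispatch_opt(x86_Generic).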
- type A = T # workaround "Cannot evaluate at compile-time" - # c_unit_stride is not relevant here - const ukernel = cpu_features.x86_ukernel(A, c_unit_stride = false) - - when return_void: - func_call - return - else: - return func_call - - when defined(i386) or defined(amd64): - when T is float32: - if cpuinfo_has_x86_avx512f(): dispatch_opt(x86_AVX512) - elif cpuinfo_has_x86_fma3(): dispatch_opt(x86_AVX_FMA) - elif cpuinfo_has_x86_avx(): dispatch_opt(x86_AVX) - elif cpuinfo_has_x86_sse(): dispatch_opt(x86_SSE) - elif T is float64: - if cpuinfo_has_x86_avx512f(): dispatch_opt(x86_AVX512) - elif cpuinfo_has_x86_fma3(): dispatch_opt(x86_AVX_FMA) - elif cpuinfo_has_x86_avx(): dispatch_opt(x86_AVX) - elif cpuinfo_has_x86_sse2(): dispatch_opt(x86_SSE2) - elif T is int32 or T is uint32: - if cpuinfo_has_x86_avx512f(): dispatch_opt(x86_AVX512) - elif cpuinfo_has_x86_avx2(): dispatch_opt(x86_AVX2) - elif cpuinfo_has_x86_sse41(): dispatch_opt(x86_SSE4_1) - elif cpuinfo_has_x86_sse2(): dispatch_opt(x86_SSE2) - elif T is int64: - if cpuinfo_has_x86_avx512f(): dispatch_opt(x86_AVX512) - elif cpuinfo_has_x86_sse2(): dispatch_opt(x86_SSE2) - dispatch_opt(x86_Generic) - -# ############################################################ -# -# Packing B -# -# ############################################################ - -func gemm_prepackB_mem_required_impl*( - ukernel: static MicroKernel, - T: typedesc, - M, N, K: int): int = - - let (MC, NC, KC) = ukernel.partitionMNK(T, M, N, K) - const NR = ukernel.nr - - let pc_num_iter = get_num_tiles(K, KC) - let upanelB_size = KC * round_step_up(NC, NR) - - result = T.sizeof * upanelB_size * pc_num_iter - -func gemm_prepackB_mem_required*( - T: type, - M, N, K: int): int = - ## Returns the amount of memory that needs to be preallocated - ## to pack matrix B. - - dispatch(return_void = false): - gemm_prepackB_mem_required_impl( - ukernel, T, M, N, K - ) - -proc gemm_prepackB_impl[T; ukernel: static MicroKernel]( - dst: ptr UncheckedArray[T], - M, N, K: int, - vB: MatrixView[T] - ) = - - let (MC, NC, KC) = ukernel.partitionMNK(T, M, N, K) - let pc_num_iter = get_num_tiles(K, KC) - let upanelB_size = KC * round_step_up(NC, ukernel.nr) - for pcb in 0||(pc_num_iter-1): - let packB = dst + pcb * upanelB_size - prefetch(packB, Write, LowTemporalLocality) - - let pc = pcb * KC - let kc = min(K - pc, KC) - let kcncB = vB.stride(pc, 0) - - # Note: pack_B also creates a parallel region - # this will cause issues if omp_get_nested = 1 - pack_B_kc_nc[T, ukernel]( - packB, - kc, NC, kcncB - ) - -proc gemm_prepackB*[T]( - dst_packedB: ptr (T or UncheckedArray[T]), - M, N, K: int, - src_B: ptr T, rowStrideB, colStrideB: int) = - ## Prepack matrix B of shape KxN - ## and strides rowStrideB and colStrideB - ## for matrix multiplication. - ## dst_packedB must be 64-byte aligned. - ## - ## For optimal performance, packing is machine- and architecture-dependent, - ## i.e. it depends on detected features like AVX and the number of cores, - ## and may depend on your machine's cache sizes in the future. - ## The packed buffer is therefore unsafe to store or serialize. 
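 - ## - ## A hypothetical usage sketch (buffer allocation elided; `packedB`/`srcB` - ## are illustrative names): - ## let bytes = gemm_prepackB_mem_required(float32, M, N, K) - ## # allocate `bytes` bytes of 64-byte aligned storage as packedB, then: - ## gemm_prepackB(packedB, M, N, K, srcB, N, 1) # pack a row-major KxN B once - ## # packedB can now be reused across many gemm_packed calls sharing B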
- - doAssert (cast[int](dst_packedB) and 63) == 0, "The destination pointer must be 64-byte aligned" - - let vB = src_B.toMatrixView(rowStrideB, colStrideB) - let dst = cast[ptr UncheckedArray[T]](dst_packedB) - - dispatch(return_void = true): - gemm_prepackB_impl[T, ukernel]( - dst, - M, N, K, - vB - ) - -# ############################################################ - # - # Packing A - # - # ############################################################ - -func gemm_prepackA_mem_required_impl*( - ukernel: static MicroKernel, - T: typedesc, - M, N, K: int): int = - - let (MC, NC, KC) = ukernel.partitionMNK(T, M, N, K) - const MR = ukernel.mr - - let pc_num_iter = get_num_tiles(K, KC) - let ic_num_iter = get_num_tiles(M, MC) - let upanelA_size = KC * round_step_up(MC, MR) - - result = T.sizeof * upanelA_size * pc_num_iter * ic_num_iter - -func gemm_prepackA_mem_required*( - T: typedesc, - M, N, K: int): int = - ## Returns the amount of memory that needs to be preallocated - ## to pack matrix A. - - dispatch(return_void = false): - gemm_prepackA_mem_required_impl( - ukernel, T, M, N, K - ) - -proc gemm_prepackA_impl[T; ukernel: static MicroKernel]( - dst: ptr UncheckedArray[T], - M, N, K: int, - vA: MatrixView[T] - ) = - - let (MC, NC, KC) = ukernel.partitionMNK(T, M, N, K) - const MR = ukernel.mr - - let pc_num_iter = get_num_tiles(K, KC) - let ic_num_iter = get_num_tiles(M, MC) - let upanelA_size = KC * round_step_up(MC, MR) - - for pcb in 0||(pc_num_iter-1): - let pc = pcb * KC - let kc = min(K - pc, KC) - - for icb in 0 ..< ic_num_iter: - let packA = dst + pc*pc_num_iter + icb*upanelA_size - prefetch(packA, Write, LowTemporalLocality) - let ic = icb * MC - let mc = min(M-ic, MC) - - let mckcA = vA.stride(ic, pc) - pack_A_mc_kc[T, ukernel](packA, mc, kc, mckcA) - -proc gemm_prepackA*[T]( - dst_packedA: ptr (T or UncheckedArray[T]), - M, N, K: int, - src_A: ptr T, rowStrideA, colStrideA: int) = - ## Prepack matrix A of shape MxK - ## and strides rowStrideA and colStrideA - ## for matrix multiplication. - ## dst_packedA must be 64-byte aligned. - ## - ## For optimal performance, packing is machine- and architecture-dependent, - ## i.e. it depends on detected features like AVX and the number of cores, - ## and may depend on your machine's cache sizes in the future. - ## The packed buffer is therefore unsafe to store or serialize. - - doAssert (cast[int](dst_packedA) and 63) == 0, "The destination pointer must be 64-byte aligned" - - let vA = src_A.toMatrixView(rowStrideA, colStrideA) - let dst = cast[ptr UncheckedArray[T]](dst_packedA) - - dispatch(return_void = true): - gemm_prepackA_impl[T, ukernel]( - dst, - M, N, K, - vA - ) - -# ############################################################ - # - # Prepacked GEMM - # - # ############################################################ - -proc gemm_packed_impl[T]( - ukernel: static MicroKernel, - M, N, K: int, - alpha: T, packedA, packedB: ptr (T or UncheckedArray[T]), - beta: T, vC: MatrixView[T] - ) = - - withCompilerOptimHints() - - const - MR = ukernel.mr - NR = ukernel.nr - PT = ukernel.pt - - let - parallelize = M*N*K > PT*PT*PT - - (MC, NC, KC) = ukernel.partitionMNK(T, M, N, K) - pc_num_iter = get_num_tiles(K, KC) - ic_num_iter = get_num_tiles(M, MC) - - upanelB_size = KC * round_step_up(NC, NR) - upanelA_size = KC * round_step_up(MC, MR) - - - # ###################################### - # 2. 
for pc = 0,...,k−1 in steps of kc - for pcb in 0 ..< pc_num_iter: - let packedB{.restrict.} = cast[ptr UncheckedArray[T]](packedB + pcb * upanelB_size) - let pc = pcb * KC - let kc = min(K - pc, KC) - - # First time writing to C, we scale it, otherwise accumulate - let beta = if pc == 0: beta else: 1.T - - omp_parallel_if(parallelize): - # #################################### - # 3. for ic = 0,...,m−1 in steps of mc - omp_for(icb, ic_num_iter, use_simd=false, nowait=true): - let packedA{.restrict.} = cast[ptr UncheckedArray[T]](packedA + icb * upanelA_size) - let ic = icb * MC - let mc = min(M-ic, MC) - - gebp_mkernel[T, ukernel]( - mc, NC, kc, - alpha, packedA, packedB, - beta, vc.stride(ic, 0) - ) - -proc gemm_packed*[T: SomeNumber]( - M, N, K: int, - alpha: T, - packedA: ptr (T or UncheckedArray[T]), - packedB: ptr (T or UncheckedArray[T]), - beta: T, - C: ptr (T or UncheckedArray[T]), - rowStrideC, colStrideC: int) = - - let vC = C.toMatrixView(rowStrideC, colStrideC) - - dispatch(return_void = true): - # TODO - dispatch specialization when C is unit strided - ukernel.gemm_packed_impl( - M, N, K, - alpha, packedA, packedB, - beta, vC - ) - -# ############################################################ -# -# Private tests -# -# ############################################################ - -when isMainModule: - - import - ../../tensor/[allocator, datatypes, initialization], - strformat - - proc toPtr*[T](t: Tensor[T]): ptr T = - cast[ptr T](t.unsafe_raw_data) - - proc `$`[T](t: Tensor[T]): string = - var tmp = newSeq[T](t.size) - copyMem(tmp[0].addr, cast[ptr T](t.unsafe_raw_data), t.size * sizeof(T)) - result = $tmp - - proc pack_and_test[M, N, K: static int; T]( - a: array[M, array[K, T]], - b: array[K, array[N, T]], - ab: array[M, array[N, T]] - ) = - echo "M: ", M - echo "N: ", N - echo "K: ", K - echo fmt"A [{M}x{K}] * B[{K}x{N}] -> C[{M}x{N}]" - let packedA_size = gemm_prepackA_mem_required(T, M, N, K) - var packA = newTensor[T](packedA_size) - gemm_prepackA( - packA.toPtr, - M, N, K, - a[0][0].unsafeAddr, - K, 1 - ) - # echo packA - - let packedB_size = gemm_prepackB_mem_required(T, M, N, K) - var packB = newTensor[T](packedB_size) - gemm_prepackB( - packB.toPtr, - M, N, K, - b[0][0].unsafeAddr, - N, 1 - ) - # echo packB - - var res_ab: array[M, array[N, T]] - gemm_packed( - M, N, K, - T(1), packA.toPtr, packB.toPtr, - T(0), res_ab[0][0].addr, N, 1 - ) - - doAssert res_ab == ab, $res_ab - echo "SUCCESS\n" - - # Tests - block: - let a = [[1.0, 2, 3], - [4.0, 5, 6], - [7.0, 8, 9]] - - let b = [[1.0, 2, 3], - [4.0, 5, 6], - [7.0, 8, 9]] - - let ab = [[30.0, 36, 42], - [66.0, 81, 96], - [102.0, 126, 150]] - - pack_and_test(a, b, ab) - - block: - let a = [[1.0, 2, 3], - [1.0, 1, 1], - [1.0, 1, 1]] - - let b = [[1.0, 1], - [1.0, 1], - [1.0, 1]] - - let ab = [[6.0, 6], - [3.0, 3], - [3.0, 3]] - - pack_and_test(a, b, ab) - - block: - let a = [[1.0, 2, 3], - [4.0, 5, 6], - [7.0, 8, 9]] - - let b = [[1.0, 1], - [1.0, 1], - [1.0, 1]] - - let ab = [[ 6.0, 6], - [15.0, 15], - [24.0, 24]] - - pack_and_test(a, b, ab) - - block: - let a = [[1.0,2,3], - [4.0,5,6]] - - let b = [[7.0, 8], - [9.0, 10], - [11.0,12]] - - let ab = [[ 58.0, 64], - [139.0,154]] - - pack_and_test(a, b, ab) - - block: - # example from http://www.intmath.com/matrices-determinants/matrix-multiplication-examples.php - echo "\n## (M x K) * (K x N) with M < N" - let a = [[-2,-3,-1], - [ 3, 0, 4]] - let b = [[ 1, 5, 2,-1], - [-3, 0, 3, 4], - [ 6,-2, 7,-4]] - - let ab = [[ 1,-8,-20, -6], - [27, 7, 34,-19]] - - pack_and_test(a, 
b, ab) - - block: - # from http://www.calcul.com/show/calculator/matrix-multiplication_;5;5;5;5?matrix1=[[%225%22,%226%22,%225%22,%228%22],[%228%22,%222%22,%228%22,%228%22],[%220%22,%225%22,%224%22,%220%22],[%224%22,%220%22,%225%22,%226%22],[%224%22,%225%22,%220%22,%223%22]]&matrix2=[[%225%22,%223%22,%226%22,%220%22],[%225%22,%222%22,%223%22,%223%22],[%228%22,%228%22,%222%22,%220%22],[%227%22,%227%22,%220%22,%220%22]]&operator=* - echo "\n## (M x K) * (K x N) with M > N and M > block-size (4x4)" - let a = [[5,6,5,8], - [8,2,8,8], - [0,5,4,0], - [4,0,5,6], - [4,5,0,3]] - let b = [[5,3,6,0], - [5,2,3,3], - [8,8,2,0], - [7,7,0,0]] - - let ab = [[151,123,58,18], - [170,148,70, 6], - [ 57, 42,23,15], - [102, 94,34, 0], - [ 66, 43,39,15]] - - pack_and_test(a, b, ab) - - block: - let a = [[2, 4, 3, 1, 3, 1, 3, 1], - [4, 3, 2, 4, 1, 0, 0, 0]] - - - let b = [[2, 2], - [2, 1], - [0, 3], - [0, 1], - [0, 2], - [4, 3], - [3, 3], - [2, 1]] - - let ab = [[27,37], - [14,23]] - - pack_and_test(a, b, ab) - - block: - let a = [[2, 1], - [1, 3], - [2, 1], - [1, 0], - [3, 4], - [2, 4], - [3, 1], - [4, 0]] - - - let b = [[2, 2, 0, 4, 0, 0, 4, 2], - [2, 1, 2, 1, 2, 4, 4, 1]] - - let ab = [[ 6, 5, 2, 9, 2, 4, 12, 5], - [ 8, 5, 6, 7, 6, 12, 16, 5], - [ 6, 5, 2, 9, 2, 4, 12, 5], - [ 2, 2, 0, 4, 0, 0, 4, 2], - [14, 10, 8, 16, 8, 16, 28, 10], - [12, 8, 8, 12, 8, 16, 24, 8], - [ 8, 7, 2, 13, 2, 4, 16, 7], - [ 8, 8, 0, 16, 0, 0, 16, 8]] - - pack_and_test(a, b, ab) - - block: - # from http://www.calcul.com/show/calculator/matrix-multiplication?matrix1=[[%222%22,%224%22,%223%22,%221%22,%223%22,%221%22,%223%22,%221%22],[%221%22,%222%22,%221%22,%221%22,%222%22,%220%22,%224%22,%223%22],[%222%22,%220%22,%220%22,%223%22,%220%22,%224%22,%224%22,%221%22],[%221%22,%221%22,%224%22,%220%22,%223%22,%221%22,%223%22,%220%22],[%223%22,%224%22,%221%22,%221%22,%224%22,%222%22,%223%22,%224%22],[%222%22,%224%22,%220%22,%222%22,%223%22,%223%22,%223%22,%224%22],[%223%22,%220%22,%220%22,%223%22,%221%22,%224%22,%223%22,%221%22],[%224%22,%223%22,%222%22,%224%22,%221%22,%220%22,%220%22,%220%22]]&matrix2=[[%222%22,%222%22,%220%22,%224%22,%220%22,%220%22,%224%22,%222%22],[%222%22,%220%22,%220%22,%221%22,%221%22,%221%22,%223%22,%221%22],[%220%22,%222%22,%222%22,%220%22,%222%22,%222%22,%223%22,%223%22],[%220%22,%220%22,%221%22,%220%22,%224%22,%222%22,%224%22,%221%22],[%220%22,%220%22,%221%22,%223%22,%224%22,%222%22,%224%22,%222%22],[%224%22,%223%22,%224%22,%221%22,%224%22,%224%22,%220%22,%223%22],[%223%22,%223%22,%220%22,%222%22,%221%22,%222%22,%223%22,%223%22],[%222%22,%221%22,%222%22,%221%22,%222%22,%224%22,%224%22,%221%22]]&operator=* - echo "\n## (N x N) * (N x N) with N multiple of block size" - - let a = [[2, 4, 3, 1, 3, 1, 3, 1], - [1, 2, 1, 1, 2, 0, 4, 3], - [2, 0, 0, 3, 0, 4, 4, 1], - [1, 1, 4, 0, 3, 1, 3, 0], - [3, 4, 1, 1, 4, 2, 3, 4], - [2, 4, 0, 2, 3, 3, 3, 4], - [3, 0, 0, 3, 1, 4, 3, 1], - [4, 3, 2, 4, 1, 0, 0, 0]] - - - let b = [[2, 2, 0, 4, 0, 0, 4, 2], - [2, 0, 0, 1, 1, 1, 3, 1], - [0, 2, 2, 0, 2, 2, 3, 3], - [0, 0, 1, 0, 4, 2, 4, 1], - [0, 0, 1, 3, 4, 2, 4, 2], - [4, 3, 4, 1, 4, 4, 0, 3], - [3, 3, 0, 2, 1, 2, 3, 3], - [2, 1, 2, 1, 2, 4, 4, 1]] - - let ab = [[27,23,16,29,35,32,58,37], - [24,19,11,23,26,30,49,27], - [34,29,21,21,34,34,36,32], - [17,22,15,21,28,25,40,33], - [39,27,23,40,45,46,72,41], - [41,26,25,34,47,48,65,38], - [33,28,22,26,37,34,41,33], - [14,12, 9,22,27,17,51,23]] - - pack_and_test(a, b, ab) diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_tiling.nim b/benchmarks/matmul/laser_gemm_backend/gemm_tiling.nim 
deleted file mode 100644 index 6d6c885..0000000 --- a/benchmarks/matmul/laser_gemm_backend/gemm_tiling.nim +++ /dev/null @@ -1,344 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. - -# ############################################################ -# -# Cache and register optimizations -# -# ############################################################ - -# Papers: -# [1] Anatomy of High-Performance Matrix Multiplication (Revised) -# Kazushige Goto, Robert A. Van de Geijn -# - http://www.cs.utexas.edu/~flame/pubs/GotoTOMS_revision.pdf -# -# [2] Anatomy of High-Performance Many-Threaded Matrix Multiplication -# Smith et al -# - http://www.cs.utexas.edu/users/flame/pubs/blis3_ipdps14.pdf -# -# [3] Automating the Last-Mile for High Performance Dense Linear Algebra -# Veras et al -# - https://arxiv.org/pdf/1611.08035.pdf -# -# [4] GEMM: From Pure C to SSE Optimized Micro Kernels -# Michael Lehn -# - http://apfel.mathematik.uni-ulm.de/~lehn/sghpc/gemm/index.html -# -# Laser wiki - GEMM optimization resources -# - https://github.com/numforge/laser/wiki/GEMM-optimization-resources - -import - ../../cpuinfo, - ../laser_utils/[ - compiler_optim_hints, - memory, align_unroller, - ], - typetraits, macros, - ./gemm_utils - -# ############################################################ -# -# Microkernel (µkernel) -# -# ############################################################ - -# We have to take vectorisation into account -# so that the microkernel can be processed with vectorized intrinsics. -# -# Caux [mr, nr] must stay in registers. -# - mr ~= nr is optimal to amortize register load cost -# - some registers must be left to prefetch Ã and ~B (PackedA and PackedB) -# - nr >= (flops/cycle) / (bytes/cycle) * sizeof(element) -# -# For example Haswell is capable of -# - 32 single-precision FLOPs/cycle -# - 32 bytes/cycle store and 64 bytes/cycle load (store C, load A and B) -# -# so nr >= 32/32 * 4 -# For that number of FLOPs it must schedule -# 2xFMAs, consuming 16 single-precision floats, -# so mr*nr >= 16 - -type - MicroKernel* = object - mr*, nr*: int - cpu_simd*: CPUFeatureX86 - nb_scalars*: int # Ideally MicroKernel should be generic over T - nb_vecs_nr*: int - c_unit_stride*: bool # We can use SIMD for the epilogue if C has a unit stride - pt*: int # Parallelization threshold - - # TODO: ARM support - # - https://github.com/nim-lang/Nim/issues/9679 - # - https://github.com/nim-lang/Nim/issues/9678 - - CPUFeatureX86* = enum - x86_Generic, - x86_SSE, - x86_SSE2, - x86_SSE4_1, - x86_AVX, - x86_AVX_FMA, - x86_AVX2, - x86_AVX512 - # Note that Skylake-SP Xeon Bronze, Silver and Gold 5XXX - # only have a single AVX512 port and AVX2 can be faster - # due to AVX512 downclocking - - X86_FeatureMap = array[CPUFeatureX86, int] - -const X86_vecwidth_float: X86_FeatureMap = [ - x86_Generic: 1, - x86_SSE: 128 div 8, - x86_SSE2: 128 div 8, - x86_SSE4_1: 128 div 8, - x86_AVX: 256 div 8, - x86_AVX_FMA: 256 div 8, - x86_AVX2: 256 div 8, - x86_AVX512: 512 div 8 -] - -const X86_vecwidth_int: X86_FeatureMap = [ - x86_Generic: 1, - x86_SSE: 1, - x86_SSE2: 128 div 8, - x86_SSE4_1: 128 div 8, - x86_AVX: 128 div 8, # AVX1 has no 256-bit integer ops, not even addition - x86_AVX_FMA: 128 div 8, - x86_AVX2: 256 div 8, - x86_AVX512: 512 div 8 -] - -# Register constraints and micro-kernel tuning -# - To issue 2xFMAs in parallel we need 
to use 2x SIMD registers -# - We want to hold C of size MR * NR completely in SIMD registers as well -# as each value is reused k times during accumulation C[i, j] += A[i, k] * B[k, j] -# - We should have enough SIMD registers left to hold -# the corresponding sections of A and B (at least 4, 2xA and 2xB for FMAs) -# -# On x86-64 X SIMD registers that can issue 2xFMAs per cycle: -# - NbVecs is 2 minimum -# - RegsPerVec = 2 * NbVecs => 4 minimum (for A and for B) -# - NR = NbVecs * NbScalarsPerSIMD -# - C: MR*NR and uses MR*NbVecs SIMD registers -# - MR*NbVecs + RegsPerVec <= X -# -> MR*NbVecs + 2 * NbVecs <= X -# -> (MR+2) * NbVecs <= X -# -# Some solutions: -# - AVX with 16 registers: -# - MR = 6, NbVecs = 2 -# FP32: 8xFP32 per SIMD --> NR = 2x8 -# ukernel = 6x16 -# FP64, ukernel = 6x8 -# - MR = 2, NbVecs = 4 -# FP32: 8xFP32 per SIMD --> NR = 4x8 -# ukernel = 2x32 -# FP64, ukernel = 2x16 -# - AVX512 with 32 registers -# - MR = 6, NbVecs = 4 -# FP32 ukernel = 6x64 -# FP64 ukernel = 6x32 -# - MR = 2, NbVecs = 8 -# FP32 ukernel = 2x128 -# FP64 ukernel = 2x64 -# - MR = 14, NbVecs = 2 -# FP32 ukernel = 14x32 -# FP64 ukernel = 14x16 -when defined(amd64): # 64-bit - # MR configuration - rows of à in micro kernel - # 16 General purpose registers - const X86_regs: X86_FeatureMap = [ - x86_Generic: 2, - x86_SSE: 6, - x86_SSE2: 6, - x86_SSE4_1: 6, - x86_AVX: 6, - x86_AVX_FMA: 6, - x86_AVX2: 6, - x86_AVX512: 14 - ] - - # NR configuration - Nb of ~B SIMD vectors - # We will also keep as many rows of à in SIMD registers at the same time - const NbVecs: X86_FeatureMap = [ - x86_Generic: 1, - x86_SSE: 2, # 16 XMM registers - x86_SSE2: 2, - x86_SSE4_1: 2, - x86_AVX: 2, # 16 YMM registers - x86_AVX_FMA: 2, - x86_AVX2: 2, - x86_AVX512: 2 # 32 ZMM registers - ] -else: # 32-bit - # MR configuration - registers for the rows of à - # 8 General purpose registers - const X86_regs: X86_FeatureMap = [ - x86_Generic: 2, - x86_SSE: 2, - x86_SSE2: 2, - x86_SSE4_1: 2, - x86_AVX: 2, - x86_AVX_FMA: 2, - x86_AVX2: 2, - x86_AVX512: 2 - ] - - # NR configuration - Nb of ~B SIMD vectors - const NbVecs: X86_FeatureMap = [ - x86_Generic: 1, - x86_SSE: 2, # 8 XMM registers - x86_SSE2: 2, - x86_SSE4_1: 2, - x86_AVX: 2, # 8 YMM registers - x86_AVX_FMA: 2, - x86_AVX2: 2, - x86_AVX512: 2 # 8 ZMM registers - ] - -func x86_ukernel*(cpu: CPUFeatureX86, T: typedesc, c_unit_stride: bool): MicroKernel = - result.cpu_simd = cpu - result.c_unit_stride = c_unit_stride - result.pt = 128 - when T is SomeFloat: - result.nb_scalars = max(1, X86_vecwidth_float[cpu] div T.sizeof) - elif T is SomeInteger: # Integers - result.nb_scalars = max(1, X86_vecwidth_int[cpu] div T.sizeof) - else: - {.error: "Unsupported type: " & T.type.name.} - - # The inner microkernel loop does: - # AB[m][n] = A[m] * B[n] - # So n should be the vector size - # if most matrices are row-Major. - # This avoids dealing with transpose - # in the inner loop and untranspose in the epilogue - - result.mr = X86_regs[cpu] # 2~6 registers for the rows of à - result.nb_vecs_nr = NbVecs[cpu] # SIMD vectors of B - result.nr = result.nb_vecs_nr * result.nb_scalars - -############################################# -# Workaround "undeclared identifier mr or nr" -# for some reason the compiler cannot access fields in -# the static MicroKernel. 
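 - -# Worked example (informative, derived from the tables above on x86-64): -# x86_AVX_FMA.x86_ukernel(float32, c_unit_stride = true) gives -# nb_scalars = 32 bytes / sizeof(float32) = 8 floats per YMM vector -# mr = X86_regs[x86_AVX_FMA] = 6 -# nr = NbVecs[x86_AVX_FMA] * nb_scalars = 2 * 8 = 16 -# i.e. the 6x16 float32 AVX micro-kernel listed among the solutions above.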
- -macro extract_mr*(ukernel: static MicroKernel): untyped = - result = newLit ukernel.mr -macro extract_nr*(ukernel: static MicroKernel): untyped = - result = newLit ukernel.nr -macro extract_cpu_simd*(ukernel: static MicroKernel): untyped = - let simd = ukernel.cpu_simd - result = quote do: CPUFeatureX86(`simd`) -macro extract_nb_scalars*(ukernel: static MicroKernel): untyped = - result = newLit ukernel.nb_scalars -macro extract_nb_vecs_nr*(ukernel: static MicroKernel): untyped = - result = newLit ukernel.nb_vecs_nr -macro extract_c_unit_stride*(ukernel: static MicroKernel): untyped = - result = newLit ukernel.c_unit_stride -macro extract_pt*(ukernel: static MicroKernel): untyped = - result = newLit ukernel.pt - - -# ############################################################ -# -# Loop tiling -# -# ############################################################ - -# multithreading info in [2] and https://github.com/flame/blis/blob/master/docs/Multithreading.md - -type Tiles*[T] = ref object - a*: ptr UncheckedArray[T] - b*: ptr UncheckedArray[T] - mc*, nc*, kc*: int - - # Multithreaded panels - ic_num_tasks*: int # For private L1-L2 and shared L3 - upanelA_size*: int # Each thread uses a different upanel of A - - # Allocation data - a_alloc_mem: pointer - b_alloc_mem: pointer - # The Tiles data structure takes 64-byte = 1 cache-line - - -proc deallocTiles[T](tiles: Tiles[T]) = - if not tiles.a_alloc_mem.isNil: - deallocShared tiles.a_alloc_mem - if not tiles.b_alloc_mem.isNil: - deallocShared tiles.b_alloc_mem - -func get_num_tiles*(dim_size, tile_size: int): int {.inline.} = - ## Get the number of tiles along a dimension depending on the tile size - (dim_size + tile_size - 1) div tile_size - -func partitionMNK*( - ukernel: static MicroKernel, - T: typedesc, - M, N, K: Natural, - ): tuple[mc, nc, kc: int] = - - result.nc = N # We don't partition over N - - # ## Panel sizes - # - TLB constraint - # TA ̃ + 2(TBj + TCj)≤T - # Note: optimizing the similar problem mckc/(2mc+2kc) - # under the constraint that mckc ≤ K is the problem - # of maximizing the area of a rectangle - # while minimizing the perimeter, - # - # Goto paper [1] section 6.3: choosing kc - # - kc should be as large as possible to amortize the mr*nr updates of Cj - # - Elements from Bj [kc, nr] must remain in L1 cache. 
- # - kc * nr should occupy less than half the L1 cache - # so that à and Caux do not evict element of Bj - # - à [kc, mc] should occupy - # a considerable fraction of the L2 cache - # In our experience optimal choice is so that "kc" float64 occupy half a page - # -> a page is 4096 bytes = 512 float64 -> half a page = 256 - - # Goto paper [1] section 6.3: choosing mc - # - mc*kc should fill a considerable part of (1) the memory addressable - # by the TLB and (2) the L2 cache - # In practice mc is chosen so that A occupies about half the smaller of (1) and (2) - - - # TODO: heuristics to compute the size - result.mc = min( 768 div T.sizeof, M) - result.kc = min(2048 div T.sizeof, K) - -proc newTiles*( - ukernel: static MicroKernel, - T: typedesc, - M, N, K: Natural, - ): Tiles[T] = - # BLIS paper [2] section II Figure 2: - # - kc * nr in L1 cache µkernel - # - mc * kc in L2 cache à - # - kc * nc in L3 cache ~B (no L3 in Xeon Phi ¯\_(ツ)_/¯) - new result, deallocTiles[T] - const - nr = ukernel.nr - mr = ukernel.mr - - (result.mc, result.nc, result.kc) = ukernel.partitionMNK(T, M, N, K) - - # Parallel config - # Ic loop parallel means that each thread will share a panel B and pack a different A - result.ic_num_tasks = get_num_tiles(M, result.mc) - - # Packing - # During packing the max size is unroll_stop*kc+kc*LR, LR = MR or NR - result.upanelA_size = result.kc*round_step_up(result.mc, mr) - let bufA_size = T.sizeof * result.upanelA_size * result.ic_num_tasks - let bufB_size = T.sizeof * result.kc*round_step_up(result.nc, nr) - - result.a_alloc_mem = allocShared(bufA_size + 63) - result.b_alloc_mem = allocShared(bufB_size + 63) - result.a = assume_aligned align_raw_data(T, result.a_alloc_mem) - result.b = assume_aligned align_raw_data(T, result.b_alloc_mem) diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx.nim b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx.nim deleted file mode 100644 index b074d74..0000000 --- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx.nim +++ /dev/null @@ -1,44 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. 
- -import - ./gemm_ukernel_generator, ./gemm_tiling, - ../laser_utils/simd - -template float32x8_muladd_unfused(a, b, c: m256): m256 = - mm256_add_ps(mm256_mul_ps(a, b), c) - -template float64x4_muladd_unfused(a, b, c: m256d): m256d = - mm256_add_pd(mm256_mul_pd(a, b), c) - -ukernel_generator( - x86_AVX, - typ = float32, - vectype = m256, - nb_scalars = 8, - simd_setZero = mm256_setzero_ps, - simd_broadcast_value = mm256_set1_ps, - simd_load_aligned = mm256_load_ps, - simd_load_unaligned = mm256_loadu_ps, - simd_store_unaligned = mm256_storeu_ps, - simd_mul = mm256_mul_ps, - simd_add = mm256_add_ps, - simd_fma = float32x8_muladd_unfused - ) - -ukernel_generator( - x86_AVX, - typ = float64, - vectype = m256d, - nb_scalars = 4, - simd_setZero = mm256_setzero_pd, - simd_broadcast_value = mm256_set1_pd, - simd_load_aligned = mm256_load_pd, - simd_load_unaligned = mm256_loadu_pd, - simd_store_unaligned = mm256_storeu_pd, - simd_mul = mm256_mul_pd, - simd_add = mm256_add_pd, - simd_fma = float64x4_muladd_unfused - ) diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx2.nim b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx2.nim deleted file mode 100644 index d1d7b15..0000000 --- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx2.nim +++ /dev/null @@ -1,35 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. - -import - ./gemm_ukernel_generator, ./gemm_tiling, - ../laser_utils/simd - -template int32x8_muladd_unfused_avx2(a, b, c: m256i): m256i = - mm256_add_epi32(mm256_mullo_epi32(a, b), c) - -template int32x8_loada(mem_addr: ptr int32): m256i = - mm256_load_si256(cast[ptr m256i](mem_addr)) - -template int32x8_loadu(mem_addr: ptr int32): m256i = - mm256_loadu_si256(cast[ptr m256i](mem_addr)) - -template int32x8_storeu(mem_addr: ptr int32, a: m256i) = - mm256_storeu_si256(cast[ptr m256i](mem_addr), a) - -ukernel_generator( - x86_AVX2, - typ = int32, - vectype = m256i, - nb_scalars = 8, - simd_setZero = mm256_setzero_si256, - simd_broadcast_value = mm256_set1_epi32, - simd_load_aligned = int32x8_loada, - simd_load_unaligned = int32x8_loadu, - simd_store_unaligned = int32x8_storeu, - simd_mul = mm256_mullo_epi32, - simd_add = mm256_add_epi32, - simd_fma = int32x8_muladd_unfused_avx2 - ) diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx512.nim b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx512.nim deleted file mode 100644 index d52beab..0000000 --- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx512.nim +++ /dev/null @@ -1,74 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. 
- -import - ./gemm_ukernel_generator, ./gemm_tiling, - ../laser_utils/simd - -ukernel_generator( - x86_AVX512, - typ = float32, - vectype = m512, - nb_scalars = 16, - simd_setZero = mm512_setzero_ps, - simd_broadcast_value = mm512_set1_ps, - simd_load_aligned = mm512_load_ps, - simd_load_unaligned = mm512_loadu_ps, - simd_store_unaligned = mm512_storeu_ps, - simd_mul = mm512_mul_ps, - simd_add = mm512_add_ps, - simd_fma = mm512_fmadd_ps - ) - -ukernel_generator( - x86_AVX512, - typ = float64, - vectype = m512d, - nb_scalars = 8, - simd_setZero = mm512_setzero_pd, - simd_broadcast_value = mm512_set1_pd, - simd_load_aligned = mm512_load_pd, - simd_load_unaligned = mm512_loadu_pd, - simd_store_unaligned = mm512_storeu_pd, - simd_mul = mm512_mul_pd, - simd_add = mm512_add_pd, - simd_fma = mm512_fmadd_pd - ) - -template int32x16_muladd_unfused_avx512(a, b, c: m512i): m512i = - mm512_add_epi32(mm512_mullo_epi32(a, b), c) - -ukernel_generator( - x86_AVX512, - typ = int32, - vectype = m512i, - nb_scalars = 16, - simd_setZero = mm512_setzero_si512, - simd_broadcast_value = mm512_set1_epi32, - simd_load_aligned = mm512_load_si512, - simd_load_unaligned = mm512_loadu_si512, - simd_store_unaligned = mm512_storeu_si512, - simd_mul = mm512_mullo_epi32, - simd_add = mm512_add_epi32, - simd_fma = int32x16_muladd_unfused_avx512 - ) - -template int64x8_muladd_unfused_avx512(a, b, c: m512i): m512i = - mm512_add_epi64(mm512_mullo_epi64(a, b), c) - -ukernel_generator( - x86_AVX512, - typ = int64, - vectype = m512i, - nb_scalars = 8, - simd_setZero = mm512_setzero_si512, - simd_broadcast_value = mm512_set1_epi64, - simd_load_aligned = mm512_load_si512, - simd_load_unaligned = mm512_loadu_si512, - simd_store_unaligned = mm512_storeu_si512, - simd_mul = mm512_mullo_epi64, - simd_add = mm512_add_epi64, - simd_fma = int64x8_muladd_unfused_avx512 - ) diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx_fma.nim b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx_fma.nim deleted file mode 100644 index 112a040..0000000 --- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_avx_fma.nim +++ /dev/null @@ -1,38 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. 
- -import - ./gemm_ukernel_generator, ./gemm_tiling, - ../laser_utils/simd - -ukernel_generator( - x86_AVX_FMA, - typ = float32, - vectype = m256, - nb_scalars = 8, - simd_setZero = mm256_setzero_ps, - simd_broadcast_value = mm256_set1_ps, - simd_load_aligned = mm256_load_ps, - simd_load_unaligned = mm256_loadu_ps, - simd_store_unaligned = mm256_storeu_ps, - simd_mul = mm256_mul_ps, - simd_add = mm256_add_ps, - simd_fma = mm256_fmadd_ps - ) - -ukernel_generator( - x86_AVX_FMA, - typ = float64, - vectype = m256d, - nb_scalars = 4, - simd_setZero = mm256_setzero_pd, - simd_broadcast_value = mm256_set1_pd, - simd_load_aligned = mm256_load_pd, - simd_load_unaligned = mm256_loadu_pd, - simd_store_unaligned = mm256_storeu_pd, - simd_mul = mm256_mul_pd, - simd_add = mm256_add_pd, - simd_fma = mm256_fmadd_pd, - ) diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_dispatch.nim b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_dispatch.nim deleted file mode 100644 index 7499b93..0000000 --- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_dispatch.nim +++ /dev/null @@ -1,125 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. - -import - macros, - ../laser_utils/compiler_optim_hints, - ./gemm_tiling, ./gemm_utils, - ./gemm_ukernel_generic, - ./gemm_ukernel_sse, - ./gemm_ukernel_sse2, - ./gemm_ukernel_sse4_1, - ./gemm_ukernel_avx, - ./gemm_ukernel_avx_fma, - ./gemm_ukernel_avx2, - ./gemm_ukernel_avx512 - -{.experimental: "dynamicBindSym".} - -# ############################################################ -# -# Dispatch with runtime cpu detection -# -# ############################################################ - -template dispatch_common {.dirty.} = - let simd = ukernel.cpu_simd - let MR = ukernel.mr - let nb_scalars = ukernel.nb_scalars - - result = newStmtList() - - # 1. Prefetch packedB (used in microkernel) - # and C (used in epilogue update) - result.add quote do: - prefetch(`packedB`, Read, LowTemporalLocality) - prefetch(`packedB` + `nb_scalars`, Read, LowTemporalLocality) - for i in 0 ..< `MR`: - prefetch(`vC`[i, 0].addr, Write, HighTemporalLocality) - - # 2. Dispatch according to type and SIMD support - let symT = getTypeInst(alpha) - - -macro dispatch_general( - ukernel: static MicroKernel, - kc: int, - alpha: typed, packedA, packedB: ptr UncheckedArray[typed], - beta: typed, vC: MatrixView[typed] - ): untyped = - - dispatch_common() - - # 2.1. No SIMD case - if simd == x86_Generic: - result.add quote do: - gebb_ukernel_fallback[`symT`, ukernel]( # Hack: ukernel is generic from the calling proc - `kc`, - `alpha`, `packedA`, `packedB`, - `beta`, `vC` - ) - return - - # 2.2. 
SIMD case - let simdTag = $simd - let ukernel_name = bindSym("gebb_ukernel_" & $symT & "_" & simdTag) - result.add quote do: - `ukernel_name`[ukernel]( # Hack: ukernel is generic from the calling proc - `kc`, - `alpha`, `packedA`, `packedB`, - `beta`, `vC` - ) - -proc gebb_ukernel*[T; ukernel: static MicroKernel]( - kc: int, - alpha: T, packedA, packedB: ptr UncheckedArray[T], - beta: T, vC: MatrixView[T] - ){.inline.} = - - ukernel.dispatch_general(kc, alpha, packedA, packedB, beta, vC) - - -# ############################################################ -# -# Exported proc -# -# ############################################################ - -macro dispatch_edge( - ukernel: static MicroKernel, - mr, nr, kc: int, - alpha: typed, packedA, packedB: ptr UncheckedArray[typed], - beta: typed, vC: MatrixView[typed] - ): untyped = - - dispatch_common() - - # 2.1. No SIMD case - if simd == x86_Generic: - result.add quote do: - gebb_ukernel_edge_fallback[`symT`, ukernel]( # Hack: ukernel is generic from the calling proc - `mr`, `nr`, `kc`, - `alpha`, `packedA`, `packedB`, - `beta`, `vC` - ) - return - - # 2.2. SIMD case - let simdTag = $simd - let ukernel_name = bindSym("gebb_ukernel_edge_" & $symT & "_" & simdTag) - result.add quote do: - `ukernel_name`[ukernel]( # Hack: ukernel is generic from the calling proc - `mr`, `nr`, `kc`, - `alpha`, `packedA`, `packedB`, - `beta`, `vC` - ) - -proc gebb_ukernel_edge*[T; ukernel: static MicroKernel]( - mr, nr, kc: int, - alpha: T, packedA, packedB: ptr UncheckedArray[T], - beta: T, vC: MatrixView[T] - ){.inline.} = - - ukernel.dispatch_edge(mr, nr, kc, alpha, packedA, packedB, beta, vC) diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_generator.nim b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_generator.nim deleted file mode 100644 index f15c6a5..0000000 --- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_generator.nim +++ /dev/null @@ -1,250 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. - -import - ../laser_utils/compiler_optim_hints, - ../laser_utils/simd, - ./gemm_tiling, ./gemm_utils, - ./gemm_ukernel_generic, - macros - -# ############################################################ -# -# SIMD implementation generator -# -# ############################################################ - -# Macro ukernel_generator should be invoked in different files so that specific -# flags like "-mavx -mfma" are isolated. 
-# Add the corresponding compilation flags to "nim.cfg" - -# ############################################################# - -template ukernel_simd_proc(ukernel_name, epilogue_name: NimNode, edge: bool) {.dirty.} = - if edge: - result.add quote do: - proc `ukernel_name`*[ukernel: static MicroKernel]( - mr, nr, kc: int, - alpha: `T`, packedA, packedB: ptr UncheckedArray[`T`], - beta: `T`, vC: MatrixView[`T`] - ) = - - let AB{.align_variable.} = ukernel_simd_impl( - ukernel, `V`, packedA, packedB, kc, - `simd_setZero`, `simd_load_aligned`, `simd_broadcast_value`, `simd_fma` - ) - const - is_c_unit_stride = ukernel.extract_c_unit_stride() - MR = ukernel.extract_mr() - NR = ukernel.extract_nr() - - gebb_ukernel_edge_epilogue( - alpha, to_ptr(AB, MR, NR, `T`), - beta, vC, mr, nr - ) - else: - result.add quote do: - proc `ukernel_name`*[ukernel: static MicroKernel]( - kc: int, - alpha: `T`, packedA, packedB: ptr UncheckedArray[`T`], - beta: `T`, vC: MatrixView[`T`] - ) = - let AB{.align_variable.} = ukernel_simd_impl( - ukernel, `V`, packedA, packedB, kc, - `simd_setZero`, `simd_load_aligned`, `simd_broadcast_value`, `simd_fma` - ) - const - is_c_unit_stride = ukernel.extract_c_unit_stride() - MR = ukernel.extract_mr() - NR = ukernel.extract_nr() - - # when is_c_unit_stride: - # `epilogue_name`(alpha, AB, beta, vC) - # else: - gebb_ukernel_epilogue_fallback( - alpha, to_ptr(AB, MR, NR, `T`), - beta, vC) - -# ############################################################# - -template epilogue() {.dirty.} = - result.add quote do: - proc `epilogue_name`[MR, NbVecs: static int]( - alpha: `T`, AB: array[MR, array[NbVecs, `V`]], - beta: `T`, vC: MatrixView[`T`] - ) = - template C(i,j: int): untyped {.dirty.} = - vC.buffer[i*vC.rowStride + j*`nb_scalars`] - - if beta == 0.`T`: - for i in 0 ..< MR: - for j in 0 ..< NbVecs: - `simd_store_unaligned`(C(i,j).addr, `simd_setZero`()) - elif beta != 1.`T`: - let beta_vec = `simd_broadcast_value`(beta) - for i in 0 ..< MR: - for j in 0 ..< NbVecs: - `simd_store_unaligned`(C(i,j).addr, `simd_mul`(beta_vec, C(i,j).addr.`simd_load_unaligned`)) - - if alpha == 1.`T`: - for i in 0 ..< MR: - for j in 0 ..< NbVecs: - `simd_store_unaligned`(C(i,j).addr, `simd_add`(AB[i][j], C(i,j).addr.`simd_load_unaligned`)) - else: - let alpha_vec = `simd_broadcast_value`(alpha) - for i in 0 ..< MR: - for j in 0 ..< NbVecs: - `simd_store_unaligned`(C(i,j).addr, `simd_fma`(alpha_vec, AB[i][j], C(i,j).addr.`simd_load_unaligned`)) - -# ############################################################# - -macro ukernel_generator*( - simd: static CPUFeatureX86, - typ: untyped, - vectype: untyped, - nb_scalars: static int, - simd_setZero: untyped, - simd_broadcast_value: untyped, - simd_load_aligned: untyped, - simd_load_unaligned: untyped, - simd_store_unaligned: untyped, - simd_mul: untyped, - simd_add: untyped, - simd_fma: untyped, - ): untyped = - - let T = newIdentNode($typ) - let V = newIdentNode($vectype) - let epilogue_name = newIdentNode("gebb_ukernel_epilogue_" & $T & "_" & $simd) - result = newStmtList() - - # 1. Generate the epilogue function - epilogue() - - # 2. 
Generate the microkernels for the general and edge cases
-  block:
-    let ukernel_name = newIdentNode("gebb_ukernel_" & $T & "_" & $simd)
-    ukernel_simd_proc(ukernel_name, epilogue_name, edge = false)
-  block:
-    let ukernel_name = newIdentNode("gebb_ukernel_edge_" & $T & "_" & $simd)
-    ukernel_simd_proc(ukernel_name, epilogue_name, edge = true)
-
-# ############################################################
-#
-#             Actual SIMD implementation
-#
-# ############################################################
-
-macro ukernel_simd_impl*(
-      ukernel: static MicroKernel, V: untyped, A, B: untyped, kc: int,
-      simd_setZero, simd_load_aligned, simd_broadcast_value, simd_fma: untyped
-    ): untyped =
-
-
-  let MR = ukernel.mr
-  let NR = ukernel.nr
-
-  if false: # Debug implementation
-    result = quote do:
-      var AB{.align_variable.}: array[`MR`, array[`NR`, float64]]
-      var A {.restrict.} = assume_aligned packedA # [kc, mc] by chunks of mr
-      var B {.restrict.} = assume_aligned packedB # [kc, nc] by chunks of nr
-
-      for k in 0 ..< kc:
-        prefetch(B[(k+1)*`NR`].addr, Read, LowTemporalLocality)
-        for i in 0 ..< `MR`:
-          for j in 0 ..< `NR`:
-            AB[i][j] += A[k*`MR`+i] * B[k*`NR`+j]
-      AB
-
-  else: # Vectorized implementation
-    result = newStmtList()
-
-    ## ukernel config
-    let
-      MR = ukernel.mr
-      NR = ukernel.nr
-      NbVecs = ukernel.nb_vecs_nr # == NR div NbScalars
-      NbScalars = ukernel.nb_scalars
-
-    ## Registers
-    # We keep all C in registers MR*NR size occupying MR*NbVecs
-    # We keep NbVecs slivers of A and B for C updates
-    var
-      rA: seq[NimNode]           # array[NbVecs, V]
-      rB: seq[NimNode]           # array[NbVecs, V]
-      rAB = nnkBracket.newTree() # array[MR, array[NbVecs, V]]
-    for jj in 0 ..< NbVecs:
-      rA.add genSym(nskVar, "A" & $jj)
-      rB.add genSym(nskVar, "B" & $jj)
-    for i in 0 ..< MR:
-      var rABi = nnkBracket.newTree()
-      for j in 0 ..< NbVecs:
-        rABi.add genSym(nskVar, "AB" & $i & "__" & $j)
-      rAB.add rABi
-
-    ## Declare
-    var declBody = newStmtList()
-    for a in rA:
-      declBody.add quote do:
-        var `a`{.noinit.}: `V`
-    for b in rB:
-      declBody.add quote do:
-        var `b`{.noinit.}: `V`
-    for i in 0 ..< MR:
-      for j in 0 ..< NbVecs:
-        let ab = rAB[i][j]
-        declBody.add quote do:
-          var `ab` = `simd_setZero`()
-
-    let k = genSym(nskForVar)
-
-    ## Prefetch
-    var prefetchBody = newStmtList()
-    for jj in 0 ..< NbVecs:
-      prefetchBody.add quote do:
-        prefetch(`B`[(`k`+1)*`NR`+`jj`*`NbScalars`].addr, Read, LowTemporalLocality)
-
-    ## Load
-    var loadBody = newStmtList()
-    for jj in 0 ..< NbVecs:
-      let b = rB[jj]
-      loadBody.add quote do:
-        `b` = `simd_load_aligned`(`B`[`k`*`NR`+`jj`*`NbScalars`].addr)
-
-    ## Interleaved broadcast and FMA
-    var bcast_fma = newStmtList()
-    block:
-      let a0 = rA[0]
-      bcast_fma.add quote do:
-        `a0` = `simd_broadcast_value`(`A`[`k`*`MR`])
-
-    for i in 0 ..< MR:
-      # broadcast next iteration
-      let next_register_idx = (i+1) mod NbVecs
-      let a_next = rA[next_register_idx]
-      bcast_fma.add quote do:
-        # At the edge: `i`+1 = MR so equivalent to loading A[(k+1)*MR]
-        `a_next` = `simd_broadcast_value`(`A`[`k`*`MR`+(`i`+1)])
-
-      # select the register holding the current broadcast of A
-      let a = rA[i mod NbVecs]
-
-      # Do FMA on the current one
-      for jj in 0 ..< NbVecs:
-        let b = rB[jj]
-        let AB = rAB[i][jj]
-        bcast_fma.add quote do:
-          `AB` = `simd_fma`(`a`, `b`, `AB`)
-
-    ## Assemble:
-    result = quote do:
-      `declBody`
-      for `k` in 0 ..< `kc`:
-        `loadBody`
-        `prefetchBody`
-        `bcast_fma`
-      ## Write registers to a MR/NR array
-      `rAB`
diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_generic.nim 
b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_generic.nim
deleted file mode 100644
index ad13189..0000000
--- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_generic.nim
+++ /dev/null
@@ -1,138 +0,0 @@
-# Laser
-# Copyright (c) 2018 Mamy André-Ratsimbazafy
-# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
-# This file may not be copied, modified, or distributed except according to those terms.
-
-# Generic microkernel for matrix multiplication
-
-import
-  ../../cpuinfo,
-  ../laser_utils/compiler_optim_hints,
-  ./gemm_tiling, ./gemm_utils,
-  macros
-
-withCompilerOptimHints()
-
-# ############################################################
-#
-#          Generic GEBB microkernel implementation
-#
-# ############################################################
-
-template ukernel_generic_impl*(){.dirty.} =
-  const
-    MR = ukernel.extract_mr()
-    NR = ukernel.extract_nr()
-    simd = ukernel.extract_cpu_simd
-
-  var AB{.align_variable.}: array[MR, array[NR, T]]
-  var A {.restrict.} = assume_aligned packedA # [kc, mc] by chunks of mr
-  var B {.restrict.} = assume_aligned packedB # [kc, nc] by chunks of nr
-
-  for k in 0 ..< kc:
-    prefetch(B[(k+1)*NR].addr, Read, LowTemporalLocality)
-    for i in 0 ..< MR:
-      for j in `||`(0, NR-1, "simd"):
-        AB[i][j] += A[k*MR+i] * B[k*NR+j]
-
-# ############################################################
-#
-#                 Fallback Generic version
-#
-# ############################################################
-#
-# Cases
-# 1. C *= β, starting default
-# 2. C =  AB, if β = 0 and α = 1
-# 3. C = αAB, if β = 0 and α ≠ 1
-# 4. C +=  AB, if α = 1
-# 5. C += αAB, if α ≠ 1
-#
-# TODO: Fused operations like relu/sigmoid/tanh
-#       should be done here as well
-
-proc gebb_ukernel_epilogue_fallback*[MR, NR: static int, T](
-      alpha: T, AB: ptr array[MR, array[NR, T]],
-      beta: T, vC: MatrixView[T]
-    ){.inline.} =
-
-  let pAB{.restrict.} = assume_aligned cast[ptr array[MR, array[NR, T]]](AB[0][0].unsafeAddr)
-
-  if beta == 0.T:
-    for i in 0 ..< MR:
-      for j in 0 ..< NR:
-        vC[i, j] = 0.T
-  elif beta != 1.T:          # C *= β
-    for i in 0 ..< MR:
-      for j in 0 ..< NR:
-        vC[i, j] *= beta
-
-  if alpha == 1.T:           # C += AB
-    for i in 0 ..< MR:
-      for j in 0 ..< NR:
-        vC[i, j] += pAB[i][j]
-  else:                      # C += αAB
-    for i in 0 ..< MR:
-      for j in 0 ..< NR:
-        vC[i, j] += alpha * pAB[i][j]
-
-  # TODO: Fused operations like relu/sigmoid/tanh
-  #       should be done here as well
-
-proc gebb_ukernel_fallback*[T; ukernel: static MicroKernel](
-      kc: int,
-      alpha: T, packedA, packedB: ptr UncheckedArray[T],
-      beta: T, vC: MatrixView[T]
-    ) =
-  ukernel_generic_impl()
-
-  const is_c_unit_stride = ukernel.extract_c_unit_stride
-  gebb_ukernel_epilogue_fallback(alpha, to_ptr(AB, MR, NR, T), beta, vC)
-
-# ############################################################
-#
-#                      Matrix edges
-#
-# ############################################################
-
-func gebb_ukernel_edge_epilogue*[MR, NR: static int, T](
-      alpha: T, AB: ptr array[MR, array[NR, T]],
-      beta: T, vC: MatrixView[T],
-      mr, nr: int # Tail to process
-    ){.inline.} =
-
-  let pAB{.restrict.} = assume_aligned cast[ptr array[MR, array[NR, T]]](AB[0][0].unsafeAddr)
-
-  if beta == 0.T:
-    if alpha == 1.T:         # C = AB
-      for i in 0 ..< mr:
-        for j in 0 ..< nr:
-          vC[i, j] = pAB[i][j]
-    else:                    # C = αAB
-      for i in 0 ..< mr:
-        for j in 0 ..< nr:
-          vC[i, j] = alpha * pAB[i][j]
-  else:                      # C *= β
-    for i in 0 ..< mr:
-      for j in 0 ..< nr:
-        vC[i, j] *= beta
-
-    if alpha == 1.T:         # C += AB
-      for i in 0 ..< mr:
-        for j in 0 ..< nr:
-          
vC[i, j] += pAB[i][j] - else: # C += αAB - for i in 0 ..< mr: - for j in 0 ..< nr: - vC[i, j] += alpha * pAB[i][j] - - # TODO: Fused operations like relu/sigmoid/tanh - # should be done here as well - -proc gebb_ukernel_edge_fallback*[T; ukernel: static MicroKernel]( - mr, nr, kc: int, - alpha: T, packedA, packedB: ptr UncheckedArray[T], - beta: T, vC: MatrixView[T] - ) = - ukernel_generic_impl() - gebb_ukernel_edge_epilogue(alpha, to_ptr(AB, MR, NR, T), beta, vC, mr, nr) diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse.nim b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse.nim deleted file mode 100644 index acdf500..0000000 --- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse.nim +++ /dev/null @@ -1,26 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. - -import - ./gemm_ukernel_generator, ./gemm_tiling, - ../laser_utils/simd - -template float32x4_muladd_unfused(a, b, c: m128): m128 = - mm_add_ps(mm_mul_ps(a, b), c) - -ukernel_generator( - x86_SSE, - typ = float32, - vectype = m128, - nb_scalars = 4, - simd_setZero = mm_setzero_ps, - simd_broadcast_value = mm_set1_ps, - simd_load_aligned = mm_load_ps, - simd_load_unaligned = mm_loadu_ps, - simd_store_unaligned = mm_storeu_ps, - simd_mul = mm_mul_ps, - simd_add = mm_add_ps, - simd_fma = float32x4_muladd_unfused - ) diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse2.nim b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse2.nim deleted file mode 100644 index a5d47fa..0000000 --- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse2.nim +++ /dev/null @@ -1,129 +0,0 @@ -# Laser -# Copyright (c) 2018 Mamy André-Ratsimbazafy -# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0). -# This file may not be copied, modified, or distributed except according to those terms. - -import - ./gemm_ukernel_generator, ./gemm_tiling, - ../laser_utils/simd - -template float64x2_muladd_unfused(a, b, c: m128d): m128d = - mm_add_pd(mm_mul_pd(a, b), c) - -ukernel_generator( - x86_SSE2, - typ = float64, - vectype = m128d, - nb_scalars = 2, - simd_setZero = mm_setzero_pd, - simd_broadcast_value = mm_set1_pd, - simd_load_aligned = mm_load_pd, - simd_load_unaligned = mm_loadu_pd, - simd_store_unaligned = mm_storeu_pd, - simd_mul = mm_mul_pd, - simd_add = mm_add_pd, - simd_fma = float64x2_muladd_unfused - ) - -####################################### -# -# Int32: hack to unroll scalar code -# -####################################### - -# This is faster than using the fallback for mm_mullo_epi32 -# in laser/primitives/private/sse2_utils - -# Note that we are quite limited in registers with scalar code -# as those are competing with all loop controls, conditions, ... 
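To make the register remark above concrete before the definitions that follow: a 2-element array stands in for a SIMD register, so each "lane" gets its own independent dependency chain that the C compiler can keep in a register. A self-contained sketch of the idea behind the Int32x2/Int64x2 fallbacks below (illustrative only; Pair and fma_pair are hypothetical names, not part of the patch):

type Pair = array[2, int32]

template fma_pair(a, b, c: Pair): Pair =
  # both lanes are independent, mirroring the fallbacks below
  [c[0] + a[0]*b[0], c[1] + a[1]*b[1]]

when isMainModule:
  doAssert fma_pair([1'i32, 2], [3'i32, 4], [5'i32, 6]) == [8'i32, 14]
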
-
-type Int32x2 = array[2, int32]
-
-func setZero_int32_sse2_fallback(): Int32x2 {.inline.} =
-  discard
-
-template set1_int32_sse2_fallback(a: int32): Int32x2 =
-  [a, a]
-
-func load_int32_sse2_fallback(mem_addr: ptr int32): Int32x2 {.inline.}=
-  let p = cast[ptr UncheckedArray[int32]](mem_addr)
-  [p[0], p[1]]
-
-func store_int32_sse2_fallback(mem_addr: ptr int32, a: Int32x2) {.inline.}=
-  let p = cast[ptr UncheckedArray[int32]](mem_addr)
-  p[0] = a[0]
-  p[1] = a[1]
-
-template add_int32_sse2_fallback(a, b: Int32x2): Int32x2 =
-  [a[0] + b[0], a[1] + b[1]]
-
-template mul_int32_sse2_fallback(a, b: Int32x2): Int32x2 =
-  [a[0] * b[0], a[1] * b[1]]
-
-template fma_int32_sse2_fallback(a, b, c: Int32x2): Int32x2 =
-  ## By mistake I once had c[0] instead of c[1] here, and it was twice as fast
-  [c[0] + a[0]*b[0], c[1] + a[1]*b[1]]
-
-ukernel_generator(
-      x86_SSE2,
-      typ = int32,
-      vectype = Int32x2,
-      nb_scalars = 2,
-      simd_setZero = setZero_int32_sse2_fallback,
-      simd_broadcast_value = set1_int32_sse2_fallback,
-      simd_load_aligned = load_int32_sse2_fallback,
-      simd_load_unaligned = load_int32_sse2_fallback,
-      simd_store_unaligned = store_int32_sse2_fallback,
-      simd_mul = mul_int32_sse2_fallback,
-      simd_add = add_int32_sse2_fallback,
-      simd_fma = fma_int32_sse2_fallback
-    )
-
-
-#######################################
-#
-# Int64: hack to unroll scalar code
-#
-#######################################
-
-type Int64x2 = array[2, int64]
-
-func setZero_int64_sse2_fallback(): Int64x2 {.inline.} =
-  discard
-
-template set1_int64_sse2_fallback(a: int64): Int64x2 =
-  [a, a]
-
-func load_int64_sse2_fallback(mem_addr: ptr int64): Int64x2 {.inline.}=
-  let p = cast[ptr UncheckedArray[int64]](mem_addr)
-  [p[0], p[1]]
-
-func store_int64_sse2_fallback(mem_addr: ptr int64, a: Int64x2) {.inline.}=
-  let p = cast[ptr UncheckedArray[int64]](mem_addr)
-  p[0] = a[0]
-  p[1] = a[1]
-
-template add_int64_sse2_fallback(a, b: Int64x2): Int64x2 =
-  [a[0] + b[0], a[1] + b[1]]
-
-template mul_int64_sse2_fallback(a, b: Int64x2): Int64x2 =
-  [a[0] * b[0], a[1] * b[1]]
-
-template fma_int64_sse2_fallback(a, b, c: Int64x2): Int64x2 =
-  ## By mistake I once had c[0] instead of c[1] here, and it was twice as fast
-  [c[0] + a[0]*b[0], c[1] + a[1]*b[1]]
-
-ukernel_generator(
-      x86_SSE2,
-      typ = int64,
-      vectype = Int64x2,
-      nb_scalars = 2,
-      simd_setZero = setZero_int64_sse2_fallback,
-      simd_broadcast_value = set1_int64_sse2_fallback,
-      simd_load_aligned = load_int64_sse2_fallback,
-      simd_load_unaligned = load_int64_sse2_fallback,
-      simd_store_unaligned = store_int64_sse2_fallback,
-      simd_mul = mul_int64_sse2_fallback,
-      simd_add = add_int64_sse2_fallback,
-      simd_fma = fma_int64_sse2_fallback
-    )
diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse4_1.nim b/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse4_1.nim
deleted file mode 100644
index f7c724e..0000000
--- a/benchmarks/matmul/laser_gemm_backend/gemm_ukernel_sse4_1.nim
+++ /dev/null
@@ -1,35 +0,0 @@
-# Laser
-# Copyright (c) 2018 Mamy André-Ratsimbazafy
-# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
-# This file may not be copied, modified, or distributed except according to those terms.
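The kernel below is gated on x86_SSE4_1 because _mm_mullo_epi32 (lane-wise 32-bit multiply keeping the low half) only appeared with SSE4.1; SSE2 offers just mm_mullo_epi16 and mm_mul_epu32, which is why the scalar int32 fallback above exists. A scalar model of the semantics assumed here (editorial illustration; mullo_epi32_model is a hypothetical name):

func mullo_epi32_model(a, b: array[4, int32]): array[4, int32] =
  for i in 0 ..< 4:
    # full product modulo 2^32, keeping the low 32 bits of each lane
    result[i] = cast[int32](cast[uint32](a[i]) * cast[uint32](b[i]))
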
-
-import
-  ./gemm_ukernel_generator, ./gemm_tiling,
-  ../laser_utils/simd
-
-template int32x4_muladd_unfused_sse4_1(a, b, c: m128i): m128i =
-  mm_add_epi32(mm_mullo_epi32(a, b), c)
-
-template int32x4_loada(mem_addr: ptr int32): m128i =
-  mm_load_si128(cast[ptr m128i](mem_addr))
-
-template int32x4_loadu(mem_addr: ptr int32): m128i =
-  mm_loadu_si128(cast[ptr m128i](mem_addr))
-
-template int32x4_storeu(mem_addr: ptr int32, a: m128i) =
-  mm_storeu_si128(cast[ptr m128i](mem_addr), a)
-
-ukernel_generator(
-      x86_SSE4_1,
-      typ = int32,
-      vectype = m128i,
-      nb_scalars = 4,
-      simd_setZero = mm_setzero_si128,
-      simd_broadcast_value = mm_set1_epi32,
-      simd_load_aligned = int32x4_loada,
-      simd_load_unaligned = int32x4_loadu,
-      simd_store_unaligned = int32x4_storeu,
-      simd_mul = mm_mullo_epi32,
-      simd_add = mm_add_epi32,
-      simd_fma = int32x4_muladd_unfused_sse4_1
-    )
diff --git a/benchmarks/matmul/laser_gemm_backend/gemm_utils.nim b/benchmarks/matmul/laser_gemm_backend/gemm_utils.nim
deleted file mode 100644
index 82ceb79..0000000
--- a/benchmarks/matmul/laser_gemm_backend/gemm_utils.nim
+++ /dev/null
@@ -1,60 +0,0 @@
-# Laser
-# Copyright (c) 2018 Mamy André-Ratsimbazafy
-# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
-# This file may not be copied, modified, or distributed except according to those terms.
-
-# ############################################################
-#
-#                  Pointer arithmetics
-#
-# ############################################################
-
-# Warning: with pointer arithmetic, be careful not to pass a `var ptr`
-# to a function, as `var` parameters are passed by hidden pointer in Nim
-# and the wrong pointer would be modified. Templates are fine.
-
-func `+`*(p: ptr, offset: int): type(p) {.inline.}=
-  ## Pointer increment
-  {.emit: "`result` = `p` + `offset`;".}
-
-# ############################################################
-#
-#  Conversion of the AB auxiliary matrix from SIMD to scalar
-#
-# ############################################################
-import ../laser_utils/compiler_optim_hints
-
-template to_ptr*(AB: typed, MR, NR: static int, T: typedesc): untyped =
-  assume_aligned cast[ptr array[MR, array[NR, T]]](AB[0][0].unsafeaddr)
-
-# ############################################################
-#
-#                      Matrix View
-#
-# ############################################################
-
-type
-  MatrixView*[T] = object
-    buffer*: ptr UncheckedArray[T]
-    rowStride*, colStride*: int
-
-func toMatrixView*[T](data: ptr T, rowStride, colStride: int): MatrixView[T] {.inline.} =
-  result.buffer = cast[ptr UncheckedArray[T]](data)
-  result.rowStride = rowStride
-  result.colStride = colStride
-
-template `[]`*[T](view: MatrixView[T], row, col: Natural): T =
-  ## Access like a 2D matrix
-  view.buffer[row * view.rowStride + col * view.colStride]
-
-template `[]=`*[T](view: MatrixView[T], row, col: Natural, value: T) =
-  ## Access like a 2D matrix
-  view.buffer[row * view.rowStride + col * view.colStride] = value
-
-func stride*[T](view: MatrixView[T], row, col: Natural): MatrixView[T]{.inline.}=
-  ## Returns a new view whose origin is offset by `row` rows and `col` columns
-  result.buffer = cast[ptr UncheckedArray[T]](
-    addr view.buffer[row*view.rowStride + col*view.colStride]
-  )
-  result.rowStride = view.rowStride
-  result.colStride = view.colStride
diff --git a/benchmarks/matmul/laser_utils/align_unroller.nim b/benchmarks/matmul/laser_utils/align_unroller.nim
deleted file mode 100644
index 6b861f0..0000000
--- 
a/benchmarks/matmul/laser_utils/align_unroller.nim
+++ /dev/null
@@ -1,41 +0,0 @@
-# Laser
-# Copyright (c) 2018 Mamy André-Ratsimbazafy
-# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
-# This file may not be copied, modified, or distributed except according to those terms.
-
-func round_step_down*(x: Natural, step: static Natural): int {.inline.} =
-  ## Round the input to the previous multiple of "step"
-  when (step and (step - 1)) == 0:
-    # Step is a power of 2. (If the compiler cannot prove that x > 0, it cannot do this optimization itself.)
-    result = x and not(step - 1)
-  else:
-    result = x - x mod step
-
-func round_step_up*(x: Natural, step: static Natural): int {.inline.} =
-  ## Round the input to the next multiple of "step"
-  when (step and (step - 1)) == 0:
-    # Step is a power of 2. (If the compiler cannot prove that x > 0, it cannot do this optimization itself.)
-    result = (x + step - 1) and not(step - 1)
-  else:
-    result = ((x + step - 1) div step) * step
-
-when isMainModule:
-  doAssert round_step_up(10, 4) == 12
-  doAssert round_step_up(10, 8) == 16
-  doAssert round_step_up(65, 64) == 128
-  doAssert round_step_up(1, 3) == 3
-  doAssert round_step_up(19, 24) == 24
-  doAssert round_step_up(8, 4) == 8
-  doAssert round_step_up(64, 64) == 64
-  doAssert round_step_up(24, 24) == 24
-  doAssert round_step_up(3, 3) == 3
-
-  doAssert round_step_down(10, 4) == 8
-  doAssert round_step_down(10, 8) == 8
-  doAssert round_step_down(65, 64) == 64
-  doAssert round_step_down(1, 3) == 0
-  doAssert round_step_down(19, 24) == 0
-  doAssert round_step_down(8, 4) == 8
-  doAssert round_step_down(64, 64) == 64
-  doAssert round_step_down(24, 24) == 24
-  doAssert round_step_down(3, 3) == 3
diff --git a/benchmarks/matmul/laser_utils/compiler_optim_hints.nim b/benchmarks/matmul/laser_utils/compiler_optim_hints.nim
deleted file mode 100644
index 451ffea..0000000
--- a/benchmarks/matmul/laser_utils/compiler_optim_hints.nim
+++ /dev/null
@@ -1,149 +0,0 @@
-# Laser & Arraymancer
-# Copyright (c) 2017-2018 Mamy André-Ratsimbazafy
-# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
-# This file may not be copied, modified, or distributed except according to those terms.
-
-const LASER_MEM_ALIGN*{.intdefine.} = 64
-static:
-  assert LASER_MEM_ALIGN != 0, "Alignment " & $LASER_MEM_ALIGN & " must not be 0"
-  assert (LASER_MEM_ALIGN and (LASER_MEM_ALIGN - 1)) == 0, "Alignment " & $LASER_MEM_ALIGN & " must be a power of 2"
-
-template withCompilerOptimHints*() =
-  # See https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html
-  # and https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html#Common-Variable-Attributes
-
-  # Variable is created aligned by LASER_MEM_ALIGN.
-  # This is useful to ensure an object can be loaded
-  # in a minimal number of cache line loads.
-  # For example, the stack part of tensors are 128 bytes and can be loaded in 2 cache lines
-  # but would require 3 loads if they are misaligned.
-  {.pragma: align_variable, codegenDecl: "$# $# __attribute__((aligned(" & $LASER_MEM_ALIGN & ")))".}
-
-  # Variable. Pointer does not alias any existing valid pointers.
-  when not defined(vcc):
-    {.pragma: restrict, codegenDecl: "$# __restrict__ $#".}
-  else:
-    {.pragma: restrict, codegenDecl: "$# __restrict $#".}
-
-const withBuiltins = defined(gcc) or defined(clang) or defined(icc)
-
-type
-  PrefetchRW* {.size: cint.sizeof.} = enum
-    Read = 0
-    Write = 1
-  PrefetchLocality* {.size: cint.sizeof.} = enum
-    NoTemporalLocality = 0 # Data can be discarded from CPU cache after access
-    LowTemporalLocality = 1
-    ModerateTemporalLocality = 2
-    HighTemporalLocality = 3 # Data should be left in all levels of cache possible
-    # Translation
-    # 0 - use no cache eviction level
-    # 1 - L1 cache eviction level
-    # 2 - L2 cache eviction level
-    # 3 - L1 and L2 cache eviction level
-
-when withBuiltins:
-  proc builtin_assume_aligned(data: pointer, alignment: csize): pointer {.importc: "__builtin_assume_aligned", noDecl.}
-  proc builtin_prefetch(data: pointer, rw: PrefetchRW, locality: PrefetchLocality) {.importc: "__builtin_prefetch", noDecl.}
-
-when defined(cpp):
-  proc static_cast[T: ptr](input: pointer): T
-    {.importcpp: "static_cast<'0>(@)".}
-
-template assume_aligned*[T](data: ptr T, alignment: static int = LASER_MEM_ALIGN): ptr T =
-  when defined(cpp) and withBuiltins: # builtin_assume_aligned returns void pointers, this does not compile in C++, they must all be typed
-    static_cast[ptr T](builtin_assume_aligned(data, alignment))
-  elif withBuiltins:
-    cast[ptr T](builtin_assume_aligned(data, alignment))
-  else:
-    data
-
-template prefetch*[T](
-            data: ptr (T or UncheckedArray[T]),
-            rw: static PrefetchRW = Read,
-            locality: static PrefetchLocality = HighTemporalLocality) =
-  ## Prefetch examples:
-  ##   - https://scripts.mit.edu/~birge/blog/accelerating-code-using-gccs-prefetch-extension/
-  ##   - https://stackoverflow.com/questions/7327994/prefetching-examples
-  ##   - https://lemire.me/blog/2018/04/30/is-software-prefetching-__builtin_prefetch-useful-for-performance/
-  ##   - https://www.naftaliharris.com/blog/2x-speedup-with-one-line-of-code/
-  when withBuiltins:
-    builtin_prefetch(data, rw, locality)
-  else:
-    discard
-
-template pragma_ivdep() =
-  ## Tell the compiler to ignore unproven loop dependencies
-  ## such as "a[i] = a[i + k] * c;" if k is unknown, as it introduces a loop
-  ## dependency if it's negative
-  ## https://software.intel.com/en-us/node/524501
-  ##
-  ## Placeholder
-  # We don't expose that as it only works on C for loops. Nim only generates while loops,
-  # except when using OpenMP. But the OpenMP "simd" already achieves the same as ivdep.
-  when defined(gcc):
-    {.emit: "#pragma GCC ivdep".}
-  else: # Supported on ICC and Cray
-    {.emit: "#pragma ivdep".}
-
-template withCompilerFunctionHints() =
-  ## Not exposed, Nim codegen will declare them as normal C functions.
-  ## This interferes with N_NIMCALL, N_LIB_PRIVATE, N_INLINE and also
-  ## creates duplicate symbols when a function called by a hot or pure function
-  ## is public and inline (because hot and pure cascade to all functions called),
-  ## and they cannot be stacked easily: of (hot, pure) only the last applies
-
-  # Function. Returned pointer is aligned to LASER_MEM_ALIGN
-  {.pragma: aligned_ptr_result, codegenDecl: "__attribute__((assume_aligned(" & $LASER_MEM_ALIGN & ")) $# $#$#".}
-
-  # Function. Returned pointer cannot alias any other valid pointer and no pointers to valid object occur in any
-  # storage pointed to.
-  {.pragma: malloc, codegenDecl: "__attribute__((malloc)) $# $#$#".}
-
-  # Function. Creates one or more function versions that can process multiple arguments using SIMD.
-  # Ignored when -fopenmp is used and within an OpenMP simd loop
-  {.pragma: simd, codegenDecl: "__attribute__((simd)) $# $#$#".}
-
-  # Function. Indicates hot and cold paths. Ignored when using profile guided optimization.
-  {.pragma: hot, codegenDecl: "__attribute__((hot)) $# $#$#".}
-  {.pragma: cold, codegenDecl: "__attribute__((cold)) $# $#$#".}
-
-  # ## pure and const
-  # ## Affect Common Sub-expression Elimination, Dead Code Elimination and loop optimization.
-  # See
-  #   - https://lwn.net/Articles/285332/
-  #   - http://benyossef.com/helping-the-compiler-help-you/
-  #
-  # Function. The function only accesses its input params and global variables state.
-  # It does not modify any global; calling it multiple times with the same params
-  # and global variables will produce the same result.
-  {.pragma: gcc_pure, codegenDecl: "__attribute__((pure)) $# $#$#".}
-  #
-  # Function. The function only accesses its input params and calling it multiple times
-  # with the same params will produce the same result.
-  # Warning ⚠:
-  #   Pointer inputs must not be dereferenced to read the memory pointed to.
-  #   In Nim stack arrays are passed by pointers and big stack data structures
-  #   are passed by reference as well. I.e. Result unknown.
-  {.pragma: gcc_const, codegenDecl: "__attribute__((const)) $# $#$#".}
-
-  # We don't define per-function fast-math, GCC attribute optimize is broken:
-  # --> https://gcc.gnu.org/ml/gcc/2009-10/msg00402.html
-  #
-  # Workarounds for floating point latency, for algorithms like sum,
-  # should be done manually.
-  #
-  # See : https://stackoverflow.com/questions/39095993/does-each-floating-point-operation-take-the-same-time
-  # and https://www.agner.org/optimize/vectorclass.pdf "Using multiple accumulators"
-  #
-  # FP addition has a latency of 3~5 clock cycles, i.e. the result cannot be reused for that much time.
-  # But the throughput is 1 FP add per clock cycle (and even 2 per clock cycle for Skylake)
-  # So we need to use extra accumulators to fully utilize the FP throughput despite FP latency.
-  # On Skylake, all FP latencies are 4: https://www.agner.org/optimize/blog/read.php?i=415
-  #
-  # Note that this is per CPU core; each core needs its own "global CPU accumulator" to combat
-  # false sharing when multithreading.
-  #
-  # This wouldn't be needed with fast-math because the compiler would consider FP addition
-  # associative and create intermediate variables as needed to exploit this throughput.
-
diff --git a/benchmarks/matmul/laser_utils/memory.nim b/benchmarks/matmul/laser_utils/memory.nim
deleted file mode 100644
index 7b84e81..0000000
--- a/benchmarks/matmul/laser_utils/memory.nim
+++ /dev/null
@@ -1,20 +0,0 @@
-# Laser
-# Copyright (c) 2018 Mamy André-Ratsimbazafy
-# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
-# This file may not be copied, modified, or distributed except according to those terms.
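The multiple-accumulator note that closes compiler_optim_hints.nim above deserves a concrete sketch: splitting a reduction across several independent accumulators breaks the FP dependency chain and hides the roughly 4-cycle add latency without fast-math. Editorial illustration only; sum4 is a hypothetical name, not part of the patch:

func sum4[T: SomeFloat](x: openArray[T]): T =
  # four independent dependency chains keep the FP adder busy every cycle
  var acc: array[4, T]
  var i = 0
  while i + 4 <= x.len:
    for j in 0 ..< 4:
      acc[j] += x[i + j]
    i += 4
  while i < x.len:  # leftover tail
    acc[0] += x[i]
    inc i
  result = (acc[0] + acc[1]) + (acc[2] + acc[3])
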
-
-import ./compiler_optim_hints, typetraits
-
-func align_raw_data*(T: typedesc, p: pointer): ptr UncheckedArray[T] =
-  static: assert T.supportsCopyMem
-  withCompilerOptimHints()
-
-  let address = cast[ByteAddress](p)
-  let aligned_ptr{.restrict.} = block: # We cannot directly apply restrict to the default "result"
-    let remainder = address and (LASER_MEM_ALIGN - 1) # modulo LASER_MEM_ALIGN (power of 2)
-    if remainder == 0:
-      assume_aligned cast[ptr UncheckedArray[T]](address)
-    else:
-      let offset = LASER_MEM_ALIGN - remainder
-      assume_aligned cast[ptr UncheckedArray[T]](address +% offset)
-  return aligned_ptr
diff --git a/benchmarks/matmul/laser_utils/openmp.nim b/benchmarks/matmul/laser_utils/openmp.nim
deleted file mode 100644
index 092dfaa..0000000
--- a/benchmarks/matmul/laser_utils/openmp.nim
+++ /dev/null
@@ -1,386 +0,0 @@
-# Laser
-# Copyright (c) 2018 Mamy André-Ratsimbazafy
-# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
-# This file may not be copied, modified, or distributed except according to those terms.
-
-# ###############################################################
-# Compile-time name mangling for OpenMP thresholds
-# Workaround https://github.com/nim-lang/Nim/issues/9365
-# and https://github.com/nim-lang/Nim/issues/9366
-import random
-from strutils import toHex
-
-var mangling_rng {.compileTime.} = initRand(0x1337DEADBEEF)
-var current_suffix {.compileTime.} = ""
-
-proc omp_suffix*(genNew: static bool = false): string {.compileTime.} =
-  ## genNew:
-  ##   if false, return the last suffix
-  ##   else return a fresh one
-  # This is exported because you cannot bind the symbol early enough
-  # for exportc
-
-  if genNew:
-    current_suffix = mangling_rng.rand(high(uint32)).toHex
-  result = current_suffix
-
-# ################################################################
-# Tuning
-
-when defined(openmp):
-  {.passC: "-fopenmp".}
-  {.passL: "-fopenmp".}
-
-  {.pragma: omp, header:"omp.h".}
-
-  proc omp_set_num_threads*(x: cint) {.omp.}
-  proc omp_get_num_threads*(): cint {.omp.}
-  proc omp_get_max_threads*(): cint {.omp.} # This takes hyperthreading into account
-  proc omp_get_thread_num*(): cint {.omp.}
-  proc omp_set_nested*(x: cint) {.omp.}
-  proc omp_get_nested*(): cint {.omp.}
-
-else:
-  template omp_set_num_threads*(x: cint) = discard
-  template omp_get_num_threads*(): cint = 1
-  template omp_get_max_threads*(): cint = 1
-  template omp_get_thread_num*(): cint = 0
-  template omp_set_nested*(x: cint) = discard
-  template omp_get_nested*(): cint = cint 0
-
-# TODO tuning for architectures
-# https://github.com/zy97140/omp-benchmark-for-pytorch
-# https://github.com/zy97140/omp-benchmark-for-pytorch/blob/master/benchmark-data/IntelR-XeonR-CPU-E5-2669-v4.md
-# https://github.com/zy97140/omp-benchmark-for-pytorch/blob/master/benchmark-data/IntelR-XeonR-Platinum-8180-CPU.md
-
-
-const OMP_MEMORY_BOUND_GRAIN_SIZE*{.intdefine.} = 1024
-  ## This is the minimum amount of work per physical core
-  ## for memory-bound processing.
-  ##   - "copy" and "addition" are considered memory-bound
-  ##   - "float division" can be considered 2x~4x more complex
-  ##     and should be scaled down accordingly
-  ##   - "exp" and "sin" operations are compute-bound and
-  ##     there is a perf boost even when processing
-  ##     only 1000 items on 28 cores
-  ##
-  ## Launching 2 threads per core (HyperThreading) is probably desirable:
-  ##   - https://medium.com/data-design/destroying-the-myth-of-number-of-threads-number-of-physical-cores-762ad3919880
-  ##
-  ## Raising the following parameters can have the following impact:
-  ##   - number of sockets: higher, more overhead per memory fetch
-  ##   - number of memory channels: lower, less overhead per memory fetch
-  ##   - RAM speed: lower, less overhead per memory fetch
-  ##   - Private L2 cache: higher, feed more data per CPU
-  ##   - Hyperthreading and cache associativity
-  ##   - Cores, shared L3 cache: Memory contention
-  ##
-  ## Note that setting num_threads manually might impact performance negatively:
-  ##   - http://studio.myrian.fr/openmp-et-num_threads/
-  ##     > 2x2ms overhead when changing num_threads from 16->6->16
-
-const OMP_NON_CONTIGUOUS_SCALE_FACTOR*{.intdefine.} = 4
-  ## Due to striding computation, we can use a lower grainsize
-  ## for non-contiguous tensors
-
-# ################################################################
-
-template attachGC*(): untyped =
-  ## If you are allocating reference types, sequences or strings
-  ## in a parallel section, you need to attach and detach
-  ## a GC for each thread. Those should be thread-local temporaries.
-  ##
-  ## This attaches the GC.
-  ##
-  ## Note: this creates strange error messages
-  ##       when --threads is not on: https://github.com/nim-lang/Nim/issues/9489
-  if(omp_get_thread_num()!=0):
-    setupForeignThreadGc()
-
-template detachGC*(): untyped =
-  ## If you are allocating reference types, sequences or strings
-  ## in a parallel section, you need to attach and detach
-  ## a GC for each thread. Those should be thread-local temporaries.
-  ##
-  ## This detaches the GC.
-  ##
-  ## Note: this creates strange error messages
-  ##       when --threads is not on: https://github.com/nim-lang/Nim/issues/9489
-  if(omp_get_thread_num()!=0):
-    teardownForeignThreadGc()
-
-template omp_parallel*(body: untyped): untyped =
-  ## Starts an OpenMP parallel section
-  ##
-  ## Don't forget to use attachGC and detachGC if you are allocating
-  ## sequences, strings, or reference types.
-  ## Those should be thread-local temporaries.
-  {.emit: "#pragma omp parallel".}
-  block: body
-
-template omp_parallel_if*(condition: bool, body: untyped) =
-  let predicate = condition # Make symbol valid and ensure it's lvalue
-  {.emit: "#pragma omp parallel if (`predicate`)".}
-  block: body
-
-template omp_for*(
-    index: untyped,
-    length: Natural,
-    use_simd, nowait: static bool,
-    body: untyped
-  ) =
-  ## OpenMP for loop (not parallel)
-  ##
-  ## This must be used in an `omp_parallel` block
-  ## for parallelization.
-  ##
-  ## Inputs:
-  ##   - `index`, the iteration index, similar to
-  ##     for `index` in 0 ..< length:
-  ##       doSomething(`index`)
-  ##   - `length`, the number of elements to iterate on
-  ##   - `use_simd`, instruct the compiler to unroll the loops for `simd` use.
-  ##     For example, for float32:
-  ##     for i in 0..<16:
-  ##       x[i] += y[i]
-  ##     will be unrolled to take 128, 256 or 512-bit to use SSE, AVX or AVX512.
-  ##     for 256-bit AVX:
-  ##     for i in countup(0, 15, 8): # Step 8 by 8
-  ##       x[i]   += y[i]
-  ##       x[i+1] += y[i+1]
-  ##       x[i+2] += y[i+2]
-  ##       ...
-  const omp_annotation = block:
-    "for " &
-      (when use_simd: "simd " else: "") &
-      (when nowait: "nowait " else: "")
-  for `index`{.inject.} in `||`(0, length-1, omp_annotation):
-    block: body
-
-template omp_parallel_for*(
-    index: untyped,
-    length: Natural,
-    omp_grain_size: static Natural,
-    use_simd: static bool,
-    body: untyped
-  ) =
-  ## Parallel for loop
-  ##
-  ## Do not forget to use attachGC and detachGC if you are allocating
-  ## sequences, strings, or reference types.
-  ## Those should be thread-local temporaries.
-  ##
-  ## Inputs:
-  ##   - `index`, the iteration index, similar to
-  ##     for `index` in 0 ..< length:
-  ##       doSomething(`index`)
-  ##   - `length`, the number of elements to iterate on
-  ##   - `omp_grain_size`, the minimal amount of work per thread. If below,
-  ##     we don't start threads. Note that we always start as many hardware threads
-  ##     as available, as starting a varying number of threads over the lifetime
-  ##     of the program will add overhead.
-  ##   - `use_simd`, instruct the compiler to unroll the loops for `simd` use.
-  ##     For example, for float32:
-  ##     for i in 0..<16:
-  ##       x[i] += y[i]
-  ##     will be unrolled to take 128, 256 or 512-bit to use SSE, AVX or AVX512.
-  ##     for 256-bit AVX:
-  ##     for i in countup(0, 15, 8): # Step 8 by 8
-  ##       x[i]   += y[i]
-  ##       x[i+1] += y[i+1]
-  ##       x[i+2] += y[i+2]
-  ##       ...
-  when not defined(openmp):
-    ## When OpenMP is not defined we use this simple loop as fallback
-    ## This way, the compiler will still be provided "simd" vectorization hints
-    when use_simd:
-      const omp_annotation = "parallel for simd"
-    else:
-      const omp_annotation = "parallel for"
-    for `index`{.inject.} in `||`(0, length-1, omp_annotation):
-      block: body
-  else:
-    let omp_size = length # make sure if length is computed it's only done once
-
-    const # Workaround to expose a unique symbol in C. TODO Pending OpenMP interpolation: https://github.com/nim-lang/Nim/issues/9365
-      omp_condition_csym = "omp_condition_" & omp_suffix(genNew = true)
-    let omp_condition {.exportc: "omp_condition_" & # We cannot use csym directly in exportc
-      omp_suffix().} = omp_grain_size * omp_get_max_threads() < omp_size
-
-    const omp_annotation = block:
-      "parallel for " &
-        (when use_simd: "simd " else: "") &
-        "if(" & $omp_condition_csym & ")"
-
-    for `index`{.inject.} in `||`(0, omp_size - 1, omp_annotation):
-      block: body
-
-template omp_parallel_for_default*(
-    index: untyped,
-    length: Natural,
-    body: untyped
-  ) =
-  ## This will be renamed omp_parallel_for once
-  ## https://github.com/nim-lang/Nim/issues/9414 is solved.
-  ## Compared to omp_parallel_for the following are set by default
-  ##   - omp_grain_size:
-  ##     The default `OMP_MEMORY_BOUND_GRAIN_SIZE` is suitable for
-  ##     contiguous copy or add operations. It's 1024 and can be changed
-  ##     by passing `-d:OMP_MEMORY_BOUND_GRAIN_SIZE=123456` during compilation.
-  ##     A value of 1 will always parallelize the loop.
-  ##   - simd is used by default
-  omp_parallel_for(
-    index,
-    length,
-    omp_grain_size = OMP_MEMORY_BOUND_GRAIN_SIZE,
-    use_simd = true,
-    body
-  )
-
-template omp_chunks*(
-    omp_size: Natural, #{lvalue} # TODO parameter constraint, pending https://github.com/nim-lang/Nim/issues/9620
-    chunk_offset, chunk_size: untyped,
-    body: untyped): untyped =
-  ## Internal template
-  ## This is the chunking part of omp_parallel_chunks.
-  ## omp_size should be an lvalue (an assigned value) and not
-  ## the result of a routine, otherwise the routine and its side effects will be called multiple times
-
-  # The following simple chunking scheme can lead to severe load imbalance
-  #
-  # `chunk_offset`{.inject.} = chunk_size * thread_id
-  # `chunk_size`{.inject.} =  if thread_id < nb_chunks - 1: chunk_size
-  #                           else: omp_size - chunk_offset
-  #
-  # For example dividing 40 items on 12 threads will lead to
-  # a base_chunk_size of 40/12 = 3 so work on the first 11 threads
-  # will be 3 * 11 = 33, and the remainder 7 on the last thread.
-  let
-    nb_chunks = omp_get_num_threads()
-    base_chunk_size = omp_size div nb_chunks
-    remainder = omp_size mod nb_chunks
-    thread_id = omp_get_thread_num()
-
-  # Instead of dividing 40 work items on 12 cores into:
-  # 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7 = 3*11 + 7 = 40
-  # the following scheme will divide into
-  # 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3 = 4*4 + 3*8 = 40
-  #
-  # This is compliant with OpenMP spec (page 60)
-  # http://www.openmp.org/mp-documents/openmp-4.5.pdf
-  # "When no chunk_size is specified, the iteration space is divided into chunks
-  # that are approximately equal in size, and at most one chunk is distributed to
-  # each thread. The size of the chunks is unspecified in this case."
-  # ---> chunks are the same ±1
-
-  var `chunk_offset`{.inject.}, `chunk_size`{.inject.}: Natural
-  if thread_id < remainder:
-    chunk_offset = (base_chunk_size + 1) * thread_id
-    chunk_size = base_chunk_size + 1
-  else:
-    chunk_offset = base_chunk_size * thread_id + remainder
-    chunk_size = base_chunk_size
-
-  block: body
-
-template omp_parallel_chunks*(
-    length: Natural,
-    chunk_offset, chunk_size: untyped,
-    omp_grain_size: static Natural,
-    body: untyped): untyped =
-  ## Create a chunk for each thread. You can use:
-  ## `for index in chunk_offset ..< chunk_offset + chunk_size:` or
-  ## `zeroMem(foo[chunk_offset].addr, chunk_size)`
-  ##
-  ## Splits the input `length` into chunks and does a parallel loop
-  ## on each chunk. The number of chunks depends on the number of cores at runtime.
-  ## `chunk_offset` and `chunk_size` should be passed as undeclared identifiers.
-  ## Within the template scope they will contain the start offset and the length
-  ## of the current thread chunk. I.e. their value is thread-specific.
-  ##
-  ## Use omp_get_thread_num() to get the current thread number
-  ##
-  ## This is useful for non-contiguous processing as a replacement to omp_parallel_for
-  ## or when operating on (contiguous) ranges for example for memset or memcpy.
-  ##
-  ## Do not forget to use attachGC and detachGC if you are allocating
-  ## sequences, strings, or reference types.
-  ## Those should be thread-local temporaries.
-  when not defined(openmp):
-    const `chunk_offset`{.inject.} = 0
-    let `chunk_size`{.inject.} = length
-    block: body
-  else:
-    let omp_size = length # make sure if length is computed it's only done once
-    let over_threshold = omp_grain_size * omp_get_max_threads() < omp_size
-
-    omp_parallel_if(over_threshold):
-      omp_chunks(omp_size, chunk_offset, chunk_size, body)
-
-template omp_parallel_chunks_default*(
-    length: Natural,
-    chunk_offset, chunk_size: untyped,
-    body: untyped): untyped =
-  ## This will be renamed omp_parallel_chunks once
-  ## https://github.com/nim-lang/Nim/issues/9414 is solved.
-  ## Compared to omp_parallel_chunks the following are set by default
-  ##   - omp_grain_size:
-  ##     The default `OMP_MEMORY_BOUND_GRAIN_SIZE` is suitable for
-  ##     contiguous copy or add operations. It's 1024 and can be changed
-  ##     by passing `-d:OMP_MEMORY_BOUND_GRAIN_SIZE=123456` during compilation.
-  ##     A value of 1 will always parallelize the loop.
-  omp_parallel_chunks(
-    length,
-    chunk_offset, chunk_size,
-    omp_grain_size = OMP_MEMORY_BOUND_GRAIN_SIZE,
-    body
-  )
-
-template omp_critical*(body: untyped): untyped =
-  {.emit: "#pragma omp critical".}
-  block: body
-
-template omp_master*(body: untyped): untyped =
-  {.emit: "#pragma omp master".}
-  block: body
-
-template omp_single*(body: untyped): untyped =
-  {.emit: "#pragma omp single".}
-  block: body
-
-template omp_single_nowait*(body: untyped): untyped =
-  {.emit: "#pragma omp single nowait".}
-  block: body
-
-template omp_barrier*(): untyped =
-  {.emit: "#pragma omp barrier".}
-
-template omp_task*(annotation: static string, body: untyped): untyped =
-  {.emit: "#pragma omp task " & annotation.}
-  block: body
-
-template omp_taskwait*(): untyped =
-  {.emit: "#pragma omp taskwait".}
-
-template omp_taskloop*(
-    index: untyped,
-    length: Natural,
-    annotation: static string,
-    body: untyped
-  ) =
-  ## OpenMP taskloop
-  const omp_annotation = "taskloop " & annotation
-  for `index`{.inject.} in `||`(0, length-1, omp_annotation):
-    block: body
-
-import macros
-macro omp_flush*(variables: varargs[untyped]): untyped =
-  var listvars = "("
-  for i, variable in variables:
-    if i == 0:
-      listvars.add "`" & $variable & "`"
-    else:
-      listvars.add ",`" & $variable & "`"
-  listvars.add ')'
-  result = quote do:
-    {.emit: "#pragma omp flush " & `listvars`.}
diff --git a/benchmarks/matmul/laser_utils/simd.nim b/benchmarks/matmul/laser_utils/simd.nim
deleted file mode 100644
index 52f84f2..0000000
--- a/benchmarks/matmul/laser_utils/simd.nim
+++ /dev/null
@@ -1,441 +0,0 @@
-# Laser
-# Copyright (c) 2018 Mamy André-Ratsimbazafy
-# Distributed under the Apache v2 License (license terms are at http://www.apache.org/licenses/LICENSE-2.0).
-# This file may not be copied, modified, or distributed except according to those terms.
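A minimal usage sketch for the omp_parallel_for template defined in openmp.nim above (editorial illustration, not part of the patch; assumes compilation with -d:openmp, otherwise the serial fallback branch is taken):

var x, y: array[10_000, float32]
omp_parallel_for(i, x.len, omp_grain_size = 1024, use_simd = true):
  x[i] += y[i]
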
-
-when defined(i386) or defined(amd64):
-  # SIMD throughput and latency:
-  #   - https://software.intel.com/sites/landingpage/IntrinsicsGuide/
-  #   - https://www.agner.org/optimize/instruction_tables.pdf
-
-  # Reminder: x86 is little-endian, order is [low part, high part]
-  # Documentation at https://software.intel.com/sites/landingpage/IntrinsicsGuide/
-
-  when defined(vcc):
-    {.pragma: x86_type, byCopy, header:"<intrin.h>".}
-    {.pragma: x86, noDecl, header:"<intrin.h>".}
-  else:
-    {.pragma: x86_type, byCopy, header:"<x86intrin.h>".}
-    {.pragma: x86, noDecl, header:"<x86intrin.h>".}
-  type
-    m128* {.importc: "__m128", x86_type.} = object
-      raw: array[4, float32]
-    m128d* {.importc: "__m128d", x86_type.} = object
-      raw: array[2, float64]
-    m128i* {.importc: "__m128i", x86_type.} = object
-      raw: array[16, byte]
-    m256* {.importc: "__m256", x86_type.} = object
-      raw: array[8, float32]
-    m256d* {.importc: "__m256d", x86_type.} = object
-      raw: array[4, float64]
-    m256i* {.importc: "__m256i", x86_type.} = object
-      raw: array[32, byte]
-    m512* {.importc: "__m512", x86_type.} = object
-      raw: array[16, float32]
-    m512d* {.importc: "__m512d", x86_type.} = object
-      raw: array[8, float64]
-    m512i* {.importc: "__m512i", x86_type.} = object
-      raw: array[64, byte]
-    mmask16* {.importc: "__mmask16", x86_type.} = distinct uint16
-    mmask64* {.importc: "__mmask64", x86_type.} = distinct uint64
-
-  # ############################################################
-  #
-  #                 SSE - float32 - packed
-  #
-  # ############################################################
-
-  func mm_setzero_ps*(): m128 {.importc: "_mm_setzero_ps", x86.}
-  func mm_set1_ps*(a: float32): m128 {.importc: "_mm_set1_ps", x86.}
-  func mm_load_ps*(aligned_mem_addr: ptr float32): m128 {.importc: "_mm_load_ps", x86.}
-  func mm_loadu_ps*(data: ptr float32): m128 {.importc: "_mm_loadu_ps", x86.}
-  func mm_store_ps*(mem_addr: ptr float32, a: m128) {.importc: "_mm_store_ps", x86.}
-  func mm_storeu_ps*(mem_addr: ptr float32, a: m128) {.importc: "_mm_storeu_ps", x86.}
-  func mm_add_ps*(a, b: m128): m128 {.importc: "_mm_add_ps", x86.}
-  func mm_sub_ps*(a, b: m128): m128 {.importc: "_mm_sub_ps", x86.}
-  func mm_mul_ps*(a, b: m128): m128 {.importc: "_mm_mul_ps", x86.}
-  func mm_max_ps*(a, b: m128): m128 {.importc: "_mm_max_ps", x86.}
-  func mm_min_ps*(a, b: m128): m128 {.importc: "_mm_min_ps", x86.}
-  func mm_or_ps*(a, b: m128): m128 {.importc: "_mm_or_ps", x86.}
-
-  # ############################################################
-  #
-  #                 SSE - float32 - scalar
-  #
-  # ############################################################
-
-  func mm_load_ss*(aligned_mem_addr: ptr float32): m128 {.importc: "_mm_load_ss", x86.}
-  func mm_add_ss*(a, b: m128): m128 {.importc: "_mm_add_ss", x86.}
-  func mm_max_ss*(a, b: m128): m128 {.importc: "_mm_max_ss", x86.}
-  func mm_min_ss*(a, b: m128): m128 {.importc: "_mm_min_ss", x86.}
-
-  func mm_cvtss_f32*(a: m128): float32 {.importc: "_mm_cvtss_f32", x86.}
-    ## Extract the low part of the input
-    ## Input:
-    ##   { A0, A1, A2, A3 }
-    ## Result:
-    ##   A0
-
-  func mm_movehl_ps*(a, b: m128): m128 {.importc: "_mm_movehl_ps", x86.}
-    ## Input:
-    ##   { A0, A1, A2, A3 }, { B0, B1, B2, B3 }
-    ## Result:
-    ##   { B2, B3, A2, A3 }
-  func mm_movelh_ps*(a, b: m128): m128 {.importc: "_mm_movelh_ps", x86.}
-    ## Input:
-    ##   { A0, A1, A2, A3 }, { B0, B1, B2, B3 }
-    ## Result:
-    ##   { A0, A1, B0, B1 }
-
-  # ############################################################
-  #
-  #                 SSE2 - float64 - packed
-  #
-  # ############################################################
-
-  func mm_setzero_pd*(): 
m128d {.importc: "_mm_setzero_pd", x86.}
-  func mm_set1_pd*(a: float64): m128d {.importc: "_mm_set1_pd", x86.}
-  func mm_load_pd*(aligned_mem_addr: ptr float64): m128d {.importc: "_mm_load_pd", x86.}
-  func mm_loadu_pd*(mem_addr: ptr float64): m128d {.importc: "_mm_loadu_pd", x86.}
-  func mm_store_pd*(mem_addr: ptr float64, a: m128d) {.importc: "_mm_store_pd", x86.}
-  func mm_storeu_pd*(mem_addr: ptr float64, a: m128d) {.importc: "_mm_storeu_pd", x86.}
-  func mm_add_pd*(a, b: m128d): m128d {.importc: "_mm_add_pd", x86.}
-  func mm_sub_pd*(a, b: m128d): m128d {.importc: "_mm_sub_pd", x86.}
-  func mm_mul_pd*(a, b: m128d): m128d {.importc: "_mm_mul_pd", x86.}
-
-  # ############################################################
-  #
-  #                 SSE2 - integer - packed
-  #
-  # ############################################################
-
-  func mm_setzero_si128*(): m128i {.importc: "_mm_setzero_si128", x86.}
-  func mm_set1_epi8*(a: int8 or uint8): m128i {.importc: "_mm_set1_epi8", x86.}
-  func mm_set1_epi16*(a: int16 or uint16): m128i {.importc: "_mm_set1_epi16", x86.}
-  func mm_set1_epi32*(a: int32 or uint32): m128i {.importc: "_mm_set1_epi32", x86.}
-  func mm_set1_epi64x*(a: int64 or uint64): m128i {.importc: "_mm_set1_epi64x", x86.}
-  func mm_load_si128*(mem_addr: ptr m128i): m128i {.importc: "_mm_load_si128", x86.}
-  func mm_loadu_si128*(mem_addr: ptr m128i): m128i {.importc: "_mm_loadu_si128", x86.}
-  func mm_storeu_si128*(mem_addr: ptr m128i, a: m128i) {.importc: "_mm_storeu_si128", x86.}
-  func mm_add_epi8*(a, b: m128i): m128i {.importc: "_mm_add_epi8", x86.}
-  func mm_add_epi16*(a, b: m128i): m128i {.importc: "_mm_add_epi16", x86.}
-  func mm_add_epi32*(a, b: m128i): m128i {.importc: "_mm_add_epi32", x86.}
-  func mm_add_epi64*(a, b: m128i): m128i {.importc: "_mm_add_epi64", x86.}
-
-  func mm_or_si128*(a, b: m128i): m128i {.importc: "_mm_or_si128", x86.}
-  func mm_and_si128*(a, b: m128i): m128i {.importc: "_mm_and_si128", x86.}
-  func mm_slli_epi64*(a: m128i, imm8: cint): m128i {.importc: "_mm_slli_epi64", x86.}
-    ## Shift 2xint64 left
-  func mm_srli_epi64*(a: m128i, imm8: cint): m128i {.importc: "_mm_srli_epi64", x86.}
-    ## Shift 2xint64 right
-  func mm_srli_epi32*(a: m128i, count: int32): m128i {.importc: "_mm_srli_epi32", x86.}
-  func mm_slli_epi32*(a: m128i, count: int32): m128i {.importc: "_mm_slli_epi32", x86.}
-
-  func mm_mullo_epi16*(a, b: m128i): m128i {.importc: "_mm_mullo_epi16", x86.}
-    ## Multiply element-wise 2 vectors of 8 16-bit ints
-    ## into intermediate 8 32-bit ints, and keep the low 16-bit parts
-
-  func mm_shuffle_epi32*(a: m128i, imm8: cint): m128i {.importc: "_mm_shuffle_epi32", x86.}
-    ## Shuffle 32-bit integers in a according to the control in imm8
-    ## Formula is in big endian representation
-    ## a = {a3, a2, a1, a0}
-    ## dst = {d3, d2, d1, d0}
-    ## imm8 = {bits76, bits54, bits32, bits10}
-    ## d0 will take the value a[bits10],
-    ## d1 the value a[bits32], and so on
-
-  func mm_mul_epu32*(a: m128i, b: m128i): m128i {.importc: "_mm_mul_epu32", x86.}
-    ## From a = {a1_hi, a1_lo, a0_hi, a0_lo} with a1 and a0 being 64-bit numbers
-    ## and   b = {b1_hi, b1_lo, b0_hi, b0_lo}
-    ##
-    ## Result = {a1_lo * b1_lo, a0_lo * b0_lo}.
-
-  func mm_set_epi32*(e3, e2, e1, e0: cint): m128i {.importc: "_mm_set_epi32", x86.}
-    ## Initialize m128i with {e3, e2, e1, e0} (big endian order)
-    ## Storing it will yield [e0, e1, e2, e3]
-
-  func mm_castps_si128*(a: m128): m128i {.importc: "_mm_castps_si128", x86.}
-    ## Cast a float32x4 vector into a 128-bit int vector with the same bit pattern
-  func mm_castsi128_ps*(a: m128i): m128 {.importc: "_mm_castsi128_ps", x86.}
-    ## Cast a 128-bit int vector into a float32x4 vector with the same bit pattern
-  func mm_cvtps_epi32*(a: m128): m128i {.importc: "_mm_cvtps_epi32", x86.}
-    ## Convert a float32x4 to an int32x4
-  func mm_cvtepi32_ps*(a: m128i): m128 {.importc: "_mm_cvtepi32_ps", x86.}
-    ## Convert an int32x4 to a float32x4
-
-  func mm_cmpgt_epi32*(a, b: m128i): m128i {.importc: "_mm_cmpgt_epi32", x86.}
-    ## Compare a greater than b element-wise:
-    ## each 32-bit lane is all ones if true, all zeroes otherwise
-
-  func mm_cvtsi128_si32*(a: m128i): cint {.importc: "_mm_cvtsi128_si32", x86.}
-    ## Copy the low part of a to int32
-
-  func mm_extract_epi16*(a: m128i, imm8: cint): cint {.importc: "_mm_extract_epi16", x86.}
-    ## Extract an int16 from a, selected with imm8,
-    ## and store it in the lower part of destination (padded with zeroes)
-
-  func mm_movemask_epi8*(a: m128i): int32 {.importc: "_mm_movemask_epi8", x86.}
-    ## Returns the most significant bit
-    ## of each 8-bit element in `a`
-
-  # ############################################################
-  #
-  #                  SSE3 - float32
-  #
-  # ############################################################
-
-  func mm_movehdup_ps*(a: m128): m128 {.importc: "_mm_movehdup_ps", x86.}
-    ## Duplicates high parts of the input
-    ## Input:
-    ##   { A0, A1, A2, A3 }
-    ## Result:
-    ##   { A1, A1, A3, A3 }
-  func mm_moveldup_ps*(a: m128): m128 {.importc: "_mm_moveldup_ps", x86.}
-    ## Duplicates low parts of the input
-    ## Input:
-    ##   { A0, A1, A2, A3 }
-    ## Result:
-    ##   { A0, A0, A2, A2 }
-
-  # ############################################################
-  #
-  #                  SSE4.1 - integer - packed
-  #
-  # ############################################################
-
-  func mm_mullo_epi32*(a, b: m128i): m128i {.importc: "_mm_mullo_epi32", x86.}
-    ## Multiply element-wise 2 vectors of 4 32-bit ints
-    ## into intermediate 4 64-bit ints, and keep the low 32-bit parts
-
-  # ############################################################
-  #
-  #                  AVX - float32 - packed
-  #
-  # ############################################################
-
-  func mm256_setzero_ps*(): m256 {.importc: "_mm256_setzero_ps", x86.}
-  func mm256_set1_ps*(a: float32): m256 {.importc: "_mm256_set1_ps", x86.}
-  func mm256_load_ps*(aligned_mem_addr: ptr float32): m256 {.importc: "_mm256_load_ps", x86.}
-  func mm256_loadu_ps*(mem_addr: ptr float32): m256 {.importc: "_mm256_loadu_ps", x86.}
-  func mm256_store_ps*(mem_addr: ptr float32, a: m256) {.importc: "_mm256_store_ps", x86.}
-  func mm256_storeu_ps*(mem_addr: ptr float32, a: m256) {.importc: "_mm256_storeu_ps", x86.}
-  func mm256_add_ps*(a, b: m256): m256 {.importc: "_mm256_add_ps", x86.}
-  func mm256_mul_ps*(a, b: m256): m256 {.importc: "_mm256_mul_ps", x86.}
-  func mm256_sub_ps*(a, b: m256): m256 {.importc: "_mm256_sub_ps", x86.}
-
-  func mm256_and_ps*(a, b: m256): m256 {.importc: "_mm256_and_ps", x86.}
-    ## Bitwise and
-  func mm256_or_ps*(a, b: m256): m256 {.importc: "_mm256_or_ps", x86.}
-    ## Bitwise or
-
-  func mm256_min_ps*(a, b: m256): m256 {.importc: "_mm256_min_ps", x86.}
-  func mm256_max_ps*(a, b: m256): m256 {.importc: "_mm256_max_ps", x86.}
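  # --- Editor's sketch (not part of the original file) -----------------
  # How the move/shuffle bindings compose into a horizontal sum of the
  # 4 float32 lanes of an m128 (uses the SSE3 mm_movehdup_ps above):
  func sum_m128(v: m128): float32 {.inline.} =
    let hi  = mm_movehl_ps(v, v)      # { A2, A3, A2, A3 }
    let lo  = mm_add_ps(v, hi)        # { A0+A2, A1+A3, _, _ }
    let odd = mm_movehdup_ps(lo)      # { A1+A3, A1+A3, _, _ }
    mm_cvtss_f32(mm_add_ss(lo, odd))  # A0+A1+A2+A3
  # ----------------------------------------------------------------------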
-  func mm256_castps256_ps128*(a: m256): m128 {.importc: "_mm256_castps256_ps128", x86.}
-    ## Returns the lower part of a m256 in a m128
-  func mm256_extractf128_ps*(v: m256, m: cint{lit}): m128 {.importc: "_mm256_extractf128_ps", x86.}
-    ## Extracts the low part (m = 0) or high part (m = 1) of a m256 into a m128
-    ## m must be a literal
-
-  # ############################################################
-  #
-  #                  AVX - float64 - packed
-  #
-  # ############################################################
-
-  func mm256_setzero_pd*(): m256d {.importc: "_mm256_setzero_pd", x86.}
-  func mm256_set1_pd*(a: float64): m256d {.importc: "_mm256_set1_pd", x86.}
-  func mm256_load_pd*(aligned_mem_addr: ptr float64): m256d {.importc: "_mm256_load_pd", x86.}
-  func mm256_loadu_pd*(mem_addr: ptr float64): m256d {.importc: "_mm256_loadu_pd", x86.}
-  func mm256_store_pd*(mem_addr: ptr float64, a: m256d) {.importc: "_mm256_store_pd", x86.}
-  func mm256_storeu_pd*(mem_addr: ptr float64, a: m256d) {.importc: "_mm256_storeu_pd", x86.}
-  func mm256_add_pd*(a, b: m256d): m256d {.importc: "_mm256_add_pd", x86.}
-  func mm256_mul_pd*(a, b: m256d): m256d {.importc: "_mm256_mul_pd", x86.}
-
-  # ############################################################
-  #
-  #                  AVX + FMA - float32/64 - packed
-  #
-  # ############################################################
-
-  func mm256_fmadd_ps*(a, b, c: m256): m256 {.importc: "_mm256_fmadd_ps", x86.}
-    ## Fused multiply-add a * b + c with a single rounding
-  func mm256_fmadd_pd*(a, b, c: m256d): m256d {.importc: "_mm256_fmadd_pd", x86.}
-    ## Fused multiply-add a * b + c with a single rounding
-
-  # ############################################################
-  #
-  #                  AVX - integers - packed
-  #
-  # ############################################################
-
-  func mm256_setzero_si256*(): m256i {.importc: "_mm256_setzero_si256", x86.}
-  func mm256_set1_epi8*(a: int8 or uint8): m256i {.importc: "_mm256_set1_epi8", x86.}
-  func mm256_set1_epi16*(a: int16 or uint16): m256i {.importc: "_mm256_set1_epi16", x86.}
-  func mm256_set1_epi32*(a: int32 or uint32): m256i {.importc: "_mm256_set1_epi32", x86.}
-  func mm256_set1_epi64x*(a: int64 or uint64): m256i {.importc: "_mm256_set1_epi64x", x86.}
-  func mm256_load_si256*(aligned_mem_addr: ptr m256i): m256i {.importc: "_mm256_load_si256", x86.}
-  func mm256_loadu_si256*(mem_addr: ptr m256i): m256i {.importc: "_mm256_loadu_si256", x86.}
-  func mm256_storeu_si256*(mem_addr: ptr m256i, a: m256i) {.importc: "_mm256_storeu_si256", x86.}
-
-  func mm256_castps_si256*(a: m256): m256i {.importc: "_mm256_castps_si256", x86.}
-    ## Cast a float32x8 vector into a 256-bit int vector with the same bit pattern
-  func mm256_castsi256_ps*(a: m256i): m256 {.importc: "_mm256_castsi256_ps", x86.}
-    ## Cast a 256-bit int vector into a float32x8 vector with the same bit pattern
-  func mm256_cvtps_epi32*(a: m256): m256i {.importc: "_mm256_cvtps_epi32", x86.}
-    ## Convert a float32x8 to an int32x8
-  func mm256_cvtepi32_ps*(a: m256i): m256 {.importc: "_mm256_cvtepi32_ps", x86.}
-    ## Convert an int32x8 to a float32x8
-
-  # ############################################################
-  #
-  #                  AVX2 - integers - packed
-  #
-  # ############################################################
-
-  func mm256_add_epi8*(a, b: m256i): m256i {.importc: "_mm256_add_epi8", x86.}
-  func mm256_add_epi16*(a, b: m256i): m256i {.importc: "_mm256_add_epi16", x86.}
-  func mm256_add_epi32*(a, b: m256i): m256i {.importc: "_mm256_add_epi32", x86.}
-  func mm256_add_epi64*(a, b: m256i): m256i {.importc: "_mm256_add_epi64", x86.}
-
-  func mm256_and_si256*(a, b: m256i): m256i {.importc: "_mm256_and_si256", x86.}
-    ## Bitwise and
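  # --- Editor's sketch (not part of the original file) -----------------
  # A dot product using the FMA binding above, reduced with
  # mm256_castps256_ps128 / mm256_extractf128_ps and the sum_m128 sketch
  # from the SSE section; assumes len is a multiple of 8.
  proc dot8(x, y: ptr UncheckedArray[float32], len: int): float32 =
    var acc = mm256_setzero_ps()
    var i = 0
    while i < len:
      acc = mm256_fmadd_ps(mm256_loadu_ps(x[][i].addr),
                           mm256_loadu_ps(y[][i].addr), acc)
      i += 8
    let lo = mm256_castps256_ps128(acc)    # lanes 0..3
    let hi = mm256_extractf128_ps(acc, 1)  # lanes 4..7
    result = sum_m128(mm_add_ps(lo, hi))
  # ----------------------------------------------------------------------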
-  func mm256_srli_epi64*(a: m256i, imm8: cint): m256i {.importc: "_mm256_srli_epi64", x86.}
-    ## Logical right shift of 4xint64
-
-  func mm256_mullo_epi16*(a, b: m256i): m256i {.importc: "_mm256_mullo_epi16", x86.}
-    ## Multiply element-wise 2 vectors of 16 16-bit ints
-    ## into intermediate 16 32-bit ints, and keep the low 16-bit parts
-
-  func mm256_mullo_epi32*(a, b: m256i): m256i {.importc: "_mm256_mullo_epi32", x86.}
-    ## Multiply element-wise 2 vectors of 8x 32-bit ints
-    ## into intermediate 8x 64-bit ints, and keep the low 32-bit parts
-
-  func mm256_shuffle_epi32*(a: m256i, imm8: cint): m256i {.importc: "_mm256_shuffle_epi32", x86.}
-    ## Shuffle 32-bit integers in a according to the control in imm8
-    ## Formula is in big endian representation
-    ##   a = {hi[a7, a6, a5, a4], lo[a3, a2, a1, a0]}
-    ##   dst = {d7, d6, d5, d4, d3, d2, d1, d0}
-    ##   imm8 = {bits76, bits54, bits32, bits10}
-    ##   d0 will refer a.lo[bits10]
-    ##   d1 will refer a.lo[bits32]
-    ##   ...
-    ##   d4 will refer a.hi[bits10]
-    ##   d5 will refer a.hi[bits32]
-
-  func mm256_mul_epu32*(a: m256i, b: m256i): m256i {.importc: "_mm256_mul_epu32", x86.}
-    ## From a = {a3_hi, a3_lo, a2_hi, a2_lo, a1_hi, a1_lo, a0_hi, a0_lo}
-    ## with a3, a2, a1, a0 being 64-bit numbers
-    ## and  b = {b3_hi, b3_lo, b2_hi, b2_lo, b1_hi, b1_lo, b0_hi, b0_lo}
-    ##
-    ## Result = {a3_lo * b3_lo, a2_lo * b2_lo, a1_lo * b1_lo, a0_lo * b0_lo}.
-    ## This is an extended precision multiplication 32x32 -> 64
-
-  func mm256_movemask_epi8*(a: m256i): int32 {.importc: "_mm256_movemask_epi8", x86.}
-    ## Returns the most significant bit
-    ## of each 8-bit element in `a`
-
-  func mm256_cmpgt_epi32*(a, b: m256i): m256i {.importc: "_mm256_cmpgt_epi32", x86.}
-    ## Compare a greater than b element-wise:
-    ## each 32-bit lane is all ones if true, all zeroes otherwise
-
-  func mm256_srli_epi32*(a: m256i, count: int32): m256i {.importc: "_mm256_srli_epi32", x86.}
-    ## Shift 8xint32 right
-  func mm256_slli_epi32*(a: m256i, count: int32): m256i {.importc: "_mm256_slli_epi32", x86.}
-    ## Shift 8xint32 left
-
-  func mm_i32gather_epi32*(m: ptr (uint32 or int32), i: m128i, s: int32): m128i {.importc: "_mm_i32gather_epi32", x86.}
-    ## Gather 4 int32: each lane k loads the int32 at byte offset i[k]*s
-    ## from m (the scale s is in bytes, 4 for contiguous int32 tables)
-  func mm256_i32gather_epi32*(m: ptr (uint32 or int32), i: m256i, s: int32): m256i {.importc: "_mm256_i32gather_epi32", x86.}
-    ## Gather 8 int32, same convention as mm_i32gather_epi32
-
-  # ############################################################
-  #
-  #                  AVX512 - float32 - packed
-  #
-  # ############################################################
-
-  func mm512_setzero_ps*(): m512 {.importc: "_mm512_setzero_ps", x86.}
-  func mm512_set1_ps*(a: float32): m512 {.importc: "_mm512_set1_ps", x86.}
-  func mm512_load_ps*(aligned_mem_addr: ptr float32): m512 {.importc: "_mm512_load_ps", x86.}
-  func mm512_loadu_ps*(mem_addr: ptr float32): m512 {.importc: "_mm512_loadu_ps", x86.}
-  func mm512_store_ps*(mem_addr: ptr float32, a: m512) {.importc: "_mm512_store_ps", x86.}
-  func mm512_storeu_ps*(mem_addr: ptr float32, a: m512) {.importc: "_mm512_storeu_ps", x86.}
-  func mm512_add_ps*(a, b: m512): m512 {.importc: "_mm512_add_ps", x86.}
-  func mm512_sub_ps*(a, b: m512): m512 {.importc: "_mm512_sub_ps", x86.}
-  func mm512_mul_ps*(a, b: m512): m512 {.importc: "_mm512_mul_ps", x86.}
-  func mm512_fmadd_ps*(a, b, c: m512): m512 {.importc: "_mm512_fmadd_ps", x86.}
-    ## Fused multiply-add a * b + c with a single rounding
-
-  func mm512_min_ps*(a, b: m512): m512 {.importc: "_mm512_min_ps", x86.}
-  func mm512_max_ps*(a, b: m512): m512 {.importc: "_mm512_max_ps", x86.}
-
-  func mm512_or_ps*(a, b: m512): m512 {.importc: "_mm512_or_ps", x86.}
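  # --- Editor's sketch (not part of the original file) -----------------
  # Gathering 8 int32 values table[idx[k]] with the AVX2 binding above;
  # the scale argument is 4 because int32 table elements are 4 bytes apart.
  proc gather8(table: ptr UncheckedArray[int32], idx: ptr m256i): m256i =
    result = mm256_i32gather_epi32(table[][0].addr, mm256_loadu_si256(idx), 4)
  # ----------------------------------------------------------------------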
-
-  # ############################################################
-  #
-  #                  AVX512 - float64 - packed
-  #
-  # ############################################################
-
-  func mm512_setzero_pd*(): m512d {.importc: "_mm512_setzero_pd", x86.}
-  func mm512_set1_pd*(a: float64): m512d {.importc: "_mm512_set1_pd", x86.}
-  func mm512_load_pd*(aligned_mem_addr: ptr float64): m512d {.importc: "_mm512_load_pd", x86.}
-  func mm512_loadu_pd*(mem_addr: ptr float64): m512d {.importc: "_mm512_loadu_pd", x86.}
-  func mm512_store_pd*(mem_addr: ptr float64, a: m512d) {.importc: "_mm512_store_pd", x86.}
-  func mm512_storeu_pd*(mem_addr: ptr float64, a: m512d) {.importc: "_mm512_storeu_pd", x86.}
-  func mm512_add_pd*(a, b: m512d): m512d {.importc: "_mm512_add_pd", x86.}
-  func mm512_mul_pd*(a, b: m512d): m512d {.importc: "_mm512_mul_pd", x86.}
-  func mm512_fmadd_pd*(a, b, c: m512d): m512d {.importc: "_mm512_fmadd_pd", x86.}
-    ## Fused multiply-add a * b + c with a single rounding
-
-  # ############################################################
-  #
-  #                  AVX512 - integers - packed
-  #
-  # ############################################################
-
-  func mm512_setzero_si512*(): m512i {.importc: "_mm512_setzero_si512", x86.}
-  func mm512_set1_epi8*(a: int8 or uint8): m512i {.importc: "_mm512_set1_epi8", x86.}
-  func mm512_set1_epi16*(a: int16 or uint16): m512i {.importc: "_mm512_set1_epi16", x86.}
-  func mm512_set1_epi32*(a: int32 or uint32): m512i {.importc: "_mm512_set1_epi32", x86.}
-  func mm512_set1_epi64*(a: int64 or uint64): m512i {.importc: "_mm512_set1_epi64", x86.}
-  func mm512_load_si512*(aligned_mem_addr: ptr SomeInteger): m512i {.importc: "_mm512_load_si512", x86.}
-  func mm512_loadu_si512*(mem_addr: ptr SomeInteger): m512i {.importc: "_mm512_loadu_si512", x86.}
-  func mm512_storeu_si512*(mem_addr: ptr SomeInteger, a: m512i) {.importc: "_mm512_storeu_si512", x86.}
-
-  func mm512_add_epi8*(a, b: m512i): m512i {.importc: "_mm512_add_epi8", x86.}
-  func mm512_add_epi16*(a, b: m512i): m512i {.importc: "_mm512_add_epi16", x86.}
-  func mm512_add_epi32*(a, b: m512i): m512i {.importc: "_mm512_add_epi32", x86.}
-  func mm512_add_epi64*(a, b: m512i): m512i {.importc: "_mm512_add_epi64", x86.}
-
-  func mm512_mullo_epi32*(a, b: m512i): m512i {.importc: "_mm512_mullo_epi32", x86.}
-    ## Multiply element-wise 2 vectors of 16 32-bit ints
-    ## into intermediate 16 64-bit ints, and keep the low 32-bit parts
-
-  func mm512_mullo_epi64*(a, b: m512i): m512i {.importc: "_mm512_mullo_epi64", x86.}
-    ## Multiply element-wise 2 vectors of 8x 64-bit ints
-    ## into intermediate 8x 128-bit ints, and keep the low 64-bit parts
-
-  func mm512_and_si512*(a, b: m512i): m512i {.importc: "_mm512_and_si512", x86.}
-    ## Bitwise and
-
-  func mm512_cmpgt_epi32_mask*(a, b: m512i): mmask16 {.importc: "_mm512_cmpgt_epi32_mask", x86.}
-    ## Compare a greater than b element-wise,
-    ## returns a 16-bit mask (1 bit per 32-bit lane)
-
-  func mm512_maskz_set1_epi32*(k: mmask16, a: cint): m512i {.importc: "_mm512_maskz_set1_epi32", x86.}
-    ## Broadcast 32-bit integer a to all elements of dst using zeromask k
-    ## (elements are zeroed out when the corresponding mask bit is not set).
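  # --- Editor's sketch (not part of the original file) -----------------
  # The AVX512 mask workflow declared above: compare into a mask register,
  # then use the mask to fill or zero lanes. Here: 1 where v > 0, else 0.
  func positive_indicator(v: m512i): m512i {.inline.} =
    let k = mm512_cmpgt_epi32_mask(v, mm512_setzero_si512())
    mm512_maskz_set1_epi32(k, 1)
  # ----------------------------------------------------------------------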
-
-  func mm512_movm_epi32*(a: mmask16): m512i {.importc: "_mm512_movm_epi32", x86.}
-    ## Set each 32-bit lane to all ones if the corresponding
-    ## mask bit is set, all zeroes otherwise
-
-  func mm512_movepi8_mask*(a: m512i): mmask64 {.importc: "_mm512_movepi8_mask", x86.}
-    ## Returns the most significant bit
-    ## of each 8-bit element in `a`
-
-  func mm512_srli_epi32*(a: m512i, count: int32): m512i {.importc: "_mm512_srli_epi32", x86.}
-    ## Shift 16xint32 right
-  func mm512_slli_epi32*(a: m512i, count: int32): m512i {.importc: "_mm512_slli_epi32", x86.}
-    ## Shift 16xint32 left
-
-  func mm512_i32gather_epi32*(i: m512i, m: ptr (uint32 or int32), s: int32): m512i {.importc: "_mm512_i32gather_epi32", x86.}
-    ## Warning ⚠: Arguments are switched compared to mm256_i32gather_epi32
-
-  func mm512_castps_si512*(a: m512): m512i {.importc: "_mm512_castps_si512", x86.}
-    ## Cast a float32x16 vector into a 512-bit int vector with the same bit pattern
-  func mm512_castsi512_ps*(a: m512i): m512 {.importc: "_mm512_castsi512_ps", x86.}
-    ## Cast a 512-bit int vector into a float32x16 vector with the same bit pattern
-  func mm512_cvtps_epi32*(a: m512): m512i {.importc: "_mm512_cvtps_epi32", x86.}
-    ## Convert a float32x16 to an int32x16
-  func mm512_cvtepi32_ps*(a: m512i): m512 {.importc: "_mm512_cvtepi32_ps", x86.}
-    ## Convert an int32x16 to a float32x16
-
-  func cvtmask64_u64*(a: mmask64): uint64 {.importc: "_cvtmask64_u64", x86.}
-    ## Convert a 64-bit mask to an unsigned 64-bit integer
diff --git a/benchmarks/matmul/nim.cfg b/benchmarks/matmul/nim.cfg
deleted file mode 100644
index a2326e6..0000000
--- a/benchmarks/matmul/nim.cfg
+++ /dev/null
@@ -1,13 +0,0 @@
-# ############################################################
-#
-#                        SIMD flags
-#
-# ############################################################
-
-gemm_ukernel_sse.always = "-msse"
-gemm_ukernel_sse2.always = "-msse2"
-gemm_ukernel_sse4_1.always = "-msse4.1"
-gemm_ukernel_avx.always = "-mavx"
-gemm_ukernel_avx_fma.always = "-mavx -mfma"
-gemm_ukernel_avx2.always = "-mavx2"
-gemm_ukernel_avx512.always = "-mavx512f -mavx512dq"
diff --git a/benchmarks/matmul/weave_gemm/README.md b/benchmarks/matmul/weave_gemm/README.md
deleted file mode 100644
index 1333ed7..0000000
--- a/benchmarks/matmul/weave_gemm/README.md
+++ /dev/null
@@ -1 +0,0 @@
-TODO